# In [None]:
import json
import re

# Load JSON data
# Each line of the input file is parsed as one standalone JSON document.
# The encoding is given explicitly (JSON interchange text is UTF-8) so that
# decoding does not depend on the platform's default encoding.
data = []
skipped_lines = 0
with open("/content/Cell_Phones_and_Accessories_5.json", 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # Blank lines carry no record
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError:
            skipped_lines += 1  # Skip lines with JSON decoding errors

# Report dropped lines so a damaged input file does not go unnoticed.
if skipped_lines:
    print("Skipped malformed lines:", skipped_lines)

# Explore dataset
print("Number of reviews:", len(data))
# Only peek at the first record when there is one: indexing an empty list
# would raise IndexError and abort the script.
if data:
    print("Keys in each review:", data[0].keys())
else:
    print("Keys in each review: (no reviews loaded)")


# Filter dataset
# Build a reduced copy of every review holding only its 'reviewText' and
# 'overall' entries. A key that a review does not have is simply absent from
# its reduced copy.
filtered_data = []
for review in data:
    slim = {}
    for field, value in review.items():
        if field == 'reviewText' or field == 'overall':
            slim[field] = value
    filtered_data.append(slim)

# Text preprocessing
# English stop words removed from every review before analysis. The entries
# 's', 't' and 'don' are the fragments left over when a contraction is split
# at its apostrophe ("it's" -> "it s", "don't" -> "don t").
stop_words = {
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
    'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
    'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
    'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where',
    'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
    'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now'
}

# Matches one character that is neither a word character nor whitespace.
# Compiled once because the pattern is applied to every review.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')


def preprocess_text(text):
    """Normalize a review for word-level analysis.

    The text is lowercased, every punctuation character is replaced by a
    space, and words found in ``stop_words`` are dropped.

    Punctuation becomes a space instead of being deleted so that words
    separated only by punctuation stay separate ("great,but" -> "great but"
    rather than "greatbut") and contractions break into pieces that the
    stop-word list covers.

    Args:
        text: Raw review text. Any value that is not a string (for example
            ``None``) is treated as empty text.

    Returns:
        The remaining words joined by single spaces; may be the empty string.
    """
    if not isinstance(text, str):
        return ''
    text = _PUNCTUATION_RE.sub(' ', text.lower())
    return ' '.join(word for word in text.split() if word not in stop_words)

for review in filtered_data:
    # A review may have no 'reviewText' entry (the filtering step only copies
    # keys that exist) or a null one. Use empty text in that case so the key
    # is always present, and a string, for the later steps.
    review['reviewText'] = preprocess_text(review.get('reviewText') or '')

# Thematic Analysis
# Fixed vocabularies of words that are counted across the reviews.
positive_words = {'good', 'great', 'excellent', 'awesome', 'amazing'}
negative_words = {'bad', 'poor', 'terrible', 'awful', 'horrible'}

# Number of occurrences of each vocabulary word, summed over all reviews.
positive_words_freq = {}
negative_words_freq = {}

for review in filtered_data:
    for token in review['reviewText'].split():
        # Pick the table this word belongs to; the positive set is checked first.
        if token in positive_words:
            tally = positive_words_freq
        elif token in negative_words:
            tally = negative_words_freq
        else:
            continue
        tally[token] = tally.get(token, 0) + 1

# Determine key phrases
# A word becomes a key phrase once it has been counted more than twice.
# Adjust the frequency threshold as needed.
positive_key_phrases = []
for word, count in positive_words_freq.items():
    if count > 2:
        positive_key_phrases.append(word)

negative_key_phrases = []
for word, count in negative_words_freq.items():
    if count > 2:
        negative_key_phrases.append(word)

# Print key phrases
print("Positive Key Phrases:", positive_key_phrases)
print("Negative Key Phrases:", negative_key_phrases)

# Sentiment Analysis
def get_sentiment_score(text):
    """Return the net sentiment score of a piece of text.

    The text is split on whitespace. Each word found in
    ``positive_key_phrases`` adds one to the score and each word found in
    ``negative_key_phrases`` subtracts one.

    Args:
        text: Text to score.

    Returns:
        Count of positive key phrases minus count of negative key phrases.
    """
    score = 0
    for token in text.split():
        # Two independent checks: the two lists are consulted separately.
        if token in positive_key_phrases:
            score += 1
        if token in negative_key_phrases:
            score -= 1
    return score

for review in filtered_data:
    # Score each review once and reuse the value for both comparisons.
    score = get_sentiment_score(review['reviewText'])
    if score > 0:
        review['sentiment'] = 'positive'
    elif score < 0:
        review['sentiment'] = 'negative'
    else:
        review['sentiment'] = 'neutral'

# Storage
# One line per review: the preprocessed text and its sentiment label.
# The file is written as UTF-8 explicitly: preprocessing keeps every word
# character, including non-ASCII letters, and a platform default encoding that
# cannot represent them would raise UnicodeEncodeError.
with open("C:\\Users\\hp\\OneDrive\\Desktop\\reviews\\sentiment_file.txt", 'w', encoding='utf-8') as f:
    f.writelines(
        f"Review: {review['reviewText']} | Sentiment: {review['sentiment']}\n"
        for review in filtered_data
    )
