# In [None]:
'''
Importing necessary packages:
'''
from nltk import sent_tokenize, pos_tag
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.sentiment.util import mark_negation
from nltk.corpus import opinion_lexicon
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('sentiwordnet')
nltk.download("opinion_lexicon")
from string import punctuation
from IPython.display import display
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import ndjson
from sklearn.metrics import confusion_matrix

# Lift pandas' display limits so DataFrames are shown with every column and
# without truncating long cell contents (e.g. full review texts).
for _display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(_display_option, None)


'''
Function to convert part of speech tag to WordNet tag:
'''
def convert_to_wordnet(tag):
    """Map a part-of-speech tag to the matching WordNet POS constant.

    Only the first character of ``tag`` is inspected (case-insensitively):
    ``J`` -> ``wn.ADJ``, ``N`` -> ``wn.NOUN``, ``R`` -> ``wn.ADV``,
    ``V`` -> ``wn.VERB``.

    Args:
        tag: POS tag string, e.g. one of the tags returned by ``pos_tag``.

    Returns:
        The corresponding WordNet POS constant.

    Raises:
        ValueError: If ``tag`` is not a string, is empty, or does not start
            with one of the four supported letters.
    """
    if not isinstance(tag, str):
        raise ValueError("The tag must be a string.")

    # An empty tag has no first character; report it as an invalid tag
    # instead of letting ``tag[0]`` leak an IndexError.
    if not tag:
        raise ValueError("Invalid tag provided.")

    first_char = tag[0].upper()

    # Validate before touching ``wn`` so that rejecting a tag never triggers
    # the (lazy) WordNet corpus load.
    if first_char not in {'J', 'N', 'R', 'V'}:
        raise ValueError("Invalid tag provided.")

    wordnet_pos_by_initial = {
        'J': wn.ADJ,
        'N': wn.NOUN,
        'R': wn.ADV,
        'V': wn.VERB,
    }
    return wordnet_pos_by_initial[first_char]

    
'''
Function to calculate sentiment score using SentiWordNet:
'''
def getSentimentScore(text):
    """Return a SentiWordNet-based sentiment score for ``text``.

    The text is split into sentences, each sentence is tokenized and
    POS-tagged, and every word whose tag maps to a WordNet part of speech
    (via ``convert_to_wordnet``) is lemmatized and looked up in WordNet.
    A word's score is the sum of ``pos_score - neg_score`` over all of its
    synsets, divided by the number of tokens in its sentence. The word
    scores are summed, averaged over the number of sentences and multiplied
    by 100.

    Args:
        text: The raw text to score.

    Returns:
        The sentiment score as a number; ``0.0`` when ``text`` contains no
        sentences.
    """
    raw_sentences = sent_tokenize(text)
    if not raw_sentences:
        # Nothing to score; also avoids dividing by a zero sentence count.
        return 0.0

    # Build the helpers once per call instead of once per word.
    tokenizer = TreebankWordTokenizer()
    lemmatizer = WordNetLemmatizer()

    total_score = 0
    for sentence in raw_sentences:
        tokens = tokenizer.tokenize(sentence)
        for word, tag in pos_tag(tokens):
            # convert_to_wordnet raises ValueError for tags without a WordNet
            # counterpart (determiners, punctuation, ...). Such words carry
            # no score here, so skip them instead of aborting the whole text.
            try:
                wn_pos = convert_to_wordnet(tag)
            except ValueError:
                continue
            if not wn_pos:
                continue

            lemma = lemmatizer.lemmatize(word, pos=wn_pos)
            word_score = 0
            for synset in wn.synsets(lemma, pos=wn_pos):
                senti = swn.senti_synset(synset.name())
                if senti is None:
                    # No SentiWordNet entry for this synset.
                    continue
                word_score += senti.pos_score() - senti.neg_score()

            # Normalize by sentence length (tokens is non-empty here because
            # pos_tag yielded at least one word).
            total_score += word_score / len(tokens)

    return (total_score / len(raw_sentences)) * 100

'''
Function to calculate sentiment score using Opinion Lexicon from NLTK:
'''
# Cache for the opinion-lexicon word sets; filled on first use.
_OPINION_WORD_SETS = None


def _opinion_word_sets():
    """Return ``(positive_words, negative_words)``, reading the lexicon once."""
    global _OPINION_WORD_SETS
    if _OPINION_WORD_SETS is None:
        _OPINION_WORD_SETS = (
            frozenset(opinion_lexicon.positive()),
            frozenset(opinion_lexicon.negative()),
        )
    return _OPINION_WORD_SETS


def getSentimentScoreOplex(text):
    """Return the sentiment score of ``text`` using the NLTK opinion lexicon.

    Each sentence has ``<br />`` markers replaced by spaces, punctuation
    removed and is lower-cased before tokenizing. Every token found in the
    positive word list counts +1, every token found in the negative list
    counts -1 (the positive list is checked first). A sentence's score is
    that sum divided by its token count; the result is the sum of all
    sentence scores.

    Args:
        text: The raw text to score.

    Returns:
        Numeric score; > 0 means positive sentiment and < 0 means negative
        sentiment. Text without any scorable tokens yields 0.
    """
    pos_words, neg_words = _opinion_word_sets()
    strip_punctuation = str.maketrans('', '', punctuation)

    total_score = 0
    for sentence in sent_tokenize(text):
        cleaned = sentence.replace("<br />", " ")
        cleaned = cleaned.translate(strip_punctuation).lower()
        tokens = word_tokenize(cleaned)
        if not tokens:
            # Sentence consisted only of punctuation/markup; it contributes
            # nothing and must not trigger a division by zero.
            continue

        sent_score = 0
        for token in tokens:
            if token in pos_words:
                sent_score += 1
            elif token in neg_words:
                sent_score -= 1

        total_score += sent_score / len(tokens)

    return total_score
    

# Load the reviews with ndjson. The file used to be read a second time with
# json.load, but the json module is never imported in this file and the
# second read only repeated the first, so a single load is kept.
# JSON text is UTF-8; be explicit rather than relying on the platform default.
with open('data/amazon_ratings/Movies_and_TV.json', encoding='utf-8') as f:
    data = ndjson.load(f)


# Create a DataFrame from the loaded data
reviews_df = pd.DataFrame(data)


# Sample up to 50000 random rows from the DataFrame (reproducibly); capping at
# the row count keeps sample() from raising on a smaller dataset.
reviews = reviews_df.sample(n=min(50000, len(reviews_df)), random_state=42)


# Drop rows with missing values in the 'reviewText' column
reviews = reviews.dropna(subset=['reviewText'])

# Get SWN sentiment scores for each review text.
# getSentimentScore is the implementation defined earlier in this file; the
# placeholder that used to be redefined here returned an undefined name and
# shadowed the real function.
reviews['swn_score'] = reviews['reviewText'].apply(getSentimentScore)


# Categorize SWN sentiment scores into positive, negative, or neutral:
# > 1 is positive, < 0.5 is negative, anything in between is neutral.
reviews['swn_sentiment'] = reviews['swn_score'].apply(
    lambda x: "positive" if x > 1 else ("negative" if x < 0.5 else "neutral"))

# Categorize true sentiment based on 'overall' rating:
# >= 4 is positive, exactly 3 is neutral, everything else is negative.
reviews['true_sentiment'] = \
    reviews['overall'].apply(lambda x: "positive" if x >= 4 else ("neutral" if x == 3 else "negative"))


# Create y_swn_pred and y_true as lists
y_swn_pred = reviews['swn_sentiment'].tolist()
y_true = reviews['true_sentiment'].tolist()


# Convert opinion lexicon to lists of positive and negative words
pos_words = list(opinion_lexicon.positive())
neg_words = list(opinion_lexicon.negative())


# Get opinion lexicon sentiment scores for each review text.
# getSentimentScoreOplex is the implementation defined earlier in this file;
# the placeholder that used to be redefined here returned an undefined name
# and shadowed the real function.
reviews['oplex_sentiment_score'] = reviews['reviewText'].apply(getSentimentScoreOplex)


# Categorize opinion lexicon sentiment scores into positive, negative, or neutral:
# > 0.1 is positive, < 0 is negative, anything in between is neutral.
reviews['oplex_sentiment'] = \
    reviews['oplex_sentiment_score'].apply(lambda x: "positive" if x > 0.1 else ("negative" if x < 0 else "neutral"))


# Count the values of 'oplex_sentiment' column. display() is used so the
# counts are shown even when this is not the last expression of a cell.
display(reviews['oplex_sentiment'].value_counts(dropna=False))


# Create y_oplex_pred as a list of 'oplex_sentiment' values
y_oplex_pred = reviews['oplex_sentiment'].tolist()


# Create confusion matrix for 'y_true' and 'y_oplex_pred'. The label order is
# fixed explicitly so the matrix is always 3x3 and its rows/columns can be
# labelled on the plot.
sentiment_labels = ['negative', 'neutral', 'positive']
oplex_cm = confusion_matrix(y_true, y_oplex_pred, labels=sentiment_labels)


# Create a heatmap of the confusion matrix
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))
sns.heatmap(oplex_cm, cmap='viridis_r', annot=True, fmt='d', square=True, ax=ax,
            xticklabels=sentiment_labels, yticklabels=sentiment_labels)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')