In [61]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize

In [62]:
def load_data(page_numbers: list[int]):
    pages = [pd.read_json(f"data/pages_with_sentiment/page{page}.json") 
             for page in page_numbers]

    return pd.concat(pages)

In [63]:
sia = SentimentIntensityAnalyzer()

def predict_sentiment(text: str):
    sentences = tokenize.sent_tokenize(text)
    sentence_sentiments = [sia.polarity_scores(sentence)["compound"] for sentence in sentences]

    compound_score = sum(sentence_sentiments) / len(sentence_sentiments)

    if compound_score >= 0:
        return "Positive"
    else:
        return "Negative"

In [64]:
def calculate_prediction_accuracy(data: pd.DataFrame):
    num_correct = len(data[data["sentiment"] == data["predicted_sentiment"]])
    return num_correct / len(data)

### Results

In [65]:
data = load_data([1, 2, 3, 4, 5, 6])
print("Loaded", len(data), "statements")

data["predicted_sentiment"] = data["text"].apply(predict_sentiment)

accuracy = calculate_prediction_accuracy(data)

print("The sentiment analyzer was able to predict with accuracy of {:.2f}".format(accuracy))

Loaded 88 statements
The sentiment analyzer was able to predict with accuracy of 0.60


### Breakdown

In [69]:
num_positive = len(data[data["sentiment"] == "Positive"])
num_positive_correct = len(data[(data["sentiment"] == "Positive") & (data["predicted_sentiment"] == "Positive")])
num_positive_incorrect = num_positive - num_positive_correct

num_negative = len(data[data["sentiment"] == "Negative"])
num_negative_correct = len(data[(data["sentiment"] == "Negative") & (data["predicted_sentiment"] == "Negative")])
num_negative_incorrect = num_negative - num_negative_correct

print("\t\t\t     Actual values")
print("\t\t\t  Positive    Negative")
print("Predicted\nvalues")
print("\t\t Positive  ", num_positive_correct, "\t\t", num_positive_incorrect)
print("\t\t Negative  ", num_negative_incorrect, "\t\t", num_negative_correct)

			     Actual values
			  Positive    Negative
Predicted
values
		 Positive   36 		 4
		 Negative   27 		 17
