In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from afinn import Afinn
# Shared English AFINN scorer; classify_tweet() calls afinn.score() on each word.
afinn = Afinn(language="en")

In [2]:
# Read the negative and positive word lists and the tweet dataset, in that order.
# header=None: no row of the CSV files is treated as a header.
negative, positive, df = (
    pd.read_csv(csv_file, header=None)
    for csv_file in ("negative_words.csv", "positive_words.csv", "testset.csv")
)


In [3]:
def classify_tweet(tweet, bias=1, threshold=2, score_word=None):
    """Classify a single tweet as positive (1) or negative (0).

    The tweet is split on whitespace, every token is scored on its own, and
    the scores are added to `bias`. The tweet counts as positive only when the
    combined score is strictly greater than `threshold`.

    Args:
        tweet: Text of one tweet (a single string, not a list of tweets).
        bias: Starting value of the running score. Defaults to 1.
        threshold: Value the combined score must exceed for a positive
            result. Defaults to 2.
        score_word: Callable mapping one token to a numeric sentiment score.
            Defaults to the notebook-level `afinn.score` (AFINN rates words
            with integer valences from -5 to +5).

    Returns:
        1 if the combined score is greater than `threshold`, otherwise 0.
    """
    if score_word is None:
        # Resolved at call time so the notebook's global AFINN instance is used.
        score_word = afinn.score
    # `bias` is the start value, so the additions happen in the same order as
    # a running `score += ...` loop would perform them.
    total = sum((score_word(token) for token in tweet.split()), bias)
    return int(total > threshold)

In [4]:
def eval_model(labels, predictions):
    """Print the accuracy and F1 score of `predictions` against `labels`.

    Args:
        labels: True label of every tweet.
        predictions: Predicted labels, e.g. the output of classify_tweet().

    Both metrics come from sklearn.metrics and are called with their default
    arguments. Nothing is returned; the two values are only printed.
    """
    accuracy = accuracy_score(labels, predictions)
    print(f"accuracy: {accuracy}")
    f1 = f1_score(labels, predictions)
    print(f"f1 score: {f1}")

In [15]:
# Column index 5 of df holds the tweet text; column index 0 holds the sentiment label.
tweets = df.iloc[:, 5]
raw_labels = df.iloc[:, 0].values
# Positive tweets are labelled 4 in df. Every label greater than 1 is mapped to 1
# so the labels match the 0/1 output of classify_tweet().
labels = np.where(raw_labels > 1, 1, raw_labels)
print(labels)

[0 1 1 ... 1 0 1]


In [16]:
# Classify every tweet and collect the 0/1 predictions in a NumPy array.
predictions = np.array(list(map(classify_tweet, tweets.values)))
print(predictions)

[0 1 0 ... 0 0 1]


In [17]:
# Fraction of positive (1) and negative (0) tweets. The first two printed lines
# describe the predictions, the last two describe the true labels.
n_predictions = len(predictions)
n_labels = len(labels)
print(f"1's: {np.count_nonzero(predictions == 1) / n_predictions}")
print(f"0's: {np.count_nonzero(predictions == 0) / n_predictions}")
print(f"1's: {np.count_nonzero(labels == 1) / n_labels}")
print(f"0's: {np.count_nonzero(labels == 0) / n_labels}")

1's: 0.35686666666666667
0's: 0.6431333333333333
1's: 0.5017916666666666
0's: 0.4982083333333333


In [18]:
# Report accuracy and F1 score of the predictions against the true labels.
eval_model(labels=labels, predictions=predictions)

accuracy: 0.6441083333333333
f1 score: 0.5855258688457768
