In [1]:
import sys
sys.path.append('src')

from sklearn.metrics import classification_report
import pandas as pd

from src.sentiment_classifier import SentimentClassifier


def load_dataset(dataset_path):
    """Read a labelled CSV and discard the rows tagged as irrelevant.

    Args:
        dataset_path: Location of the CSV file (anything ``pd.read_csv``
            accepts). The file must contain a ``Sentiment`` column.

    Returns:
        A DataFrame with every row whose ``Sentiment`` differs from the
        string ``'irrelevant'``, re-indexed consecutively from 0.
    """
    frame = pd.read_csv(dataset_path)
    # Boolean mask: True for every row that should be kept.
    keep = frame['Sentiment'].ne('irrelevant')
    return frame.loc[keep].reset_index(drop=True)


# Input locations, relative to the notebook's working directory.
train_set_path = "data/Train.csv"
test_set_path = "data/Test.csv"
normalization_lexicon_path = "data/normalization-lexicon/emnlp_dict.txt"

# Both splits go through the same loader, so 'irrelevant' rows are dropped
# and the index is reset to 0..n-1 for each of them.
train, test = (load_dataset(path) for path in (train_set_path, test_set_path))

In [2]:
# Build the classifier from the normalization lexicon file. start_day_hour=10
# is forwarded to SentimentClassifier unchanged; its meaning is defined in
# src/sentiment_classifier.py (NOTE(review): confirm what the hour controls).
clf = SentimentClassifier(normalization_lexicon_path, start_day_hour=10)
# Fit on the filtered training split loaded in the first cell.
clf.fit(train)

In [3]:
# Predict both targets for the test split; sentiment first, then organization.
pred_sent = clf.predict_sentiment(test)
pred_org = clf.predict_organization(test)

# One entry per task: (banner line, gold-label column in `test`, predictions).
reports = (
    ('=========== Sentiment prediction report ===========', 'Sentiment', pred_sent),
    ('=========== Organization prediction report ===========', 'Topic', pred_org),
)
for banner, gold_column, predicted in reports:
    print(banner)
    print(classification_report(test[gold_column], predicted))

              precision    recall  f1-score   support

    negative       1.00      0.39      0.56        49
     neutral       0.74      0.99      0.85       156
    positive       0.80      0.25      0.38        32

    accuracy                           0.76       237
   macro avg       0.85      0.54      0.60       237
weighted avg       0.80      0.76      0.72       237

              precision    recall  f1-score   support

       apple       0.66      0.98      0.79        93
      google       0.97      0.58      0.73        48
   microsoft       0.72      0.60      0.66        48
     twitter       0.81      0.52      0.63        48

    accuracy                           0.73       237
   macro avg       0.79      0.67      0.70       237
weighted avg       0.77      0.73      0.72       237



In [26]:
# Show the sentiment "rate" the classifier assigns to a few test tweets whose
# sentiment label it predicted correctly.
SAMPLE_SIZE = 10   # maximum number of correctly classified rows to display
SAMPLE_SEED = 0    # fixed seed so Restart & Run All prints the same rows

# Vectorized comparison of gold labels against predictions. Wrapping pred_sent
# in a Series keyed by test.index keeps the label-based pairing the original
# row-by-row loop used (test has a fresh 0..n-1 index from load_dataset).
is_correct = test['Sentiment'].eq(pd.Series(pred_sent, index=test.index))
correct_idx = test.index[is_correct.to_numpy()].tolist()

# correct_idx holds index labels, so select with .loc rather than .iloc.
correct_test = test.loc[correct_idx]

# Cap the sample size: DataFrame.sample raises ValueError when asked for more
# rows than exist (without replacement), e.g. if fewer than SAMPLE_SIZE
# predictions were correct.
correct_test = correct_test.sample(
    min(SAMPLE_SIZE, len(correct_test)), random_state=SAMPLE_SEED
).reset_index(drop=True)

rates = clf.predict_sentiment(correct_test, get_rate=True)

# The rows are already a random sample, so iterate them in order instead of
# drawing a second (unseeded) sample that would merely reshuffle them.
for idx, sentiment in correct_test['Sentiment'].items():
    print('%s: %.2f' % (sentiment, rates[idx]))

neutral: 3.02
neutral: 2.89
negative: 1.18
neutral: 3.01
positive: 4.94
neutral: 2.26
negative: 1.16
neutral: 3.05
neutral: 2.45
neutral: 2.65
