In [15]:
import numpy as np
import pandas as pd

from nltk.tokenize.toktok import ToktokTokenizer
from gensim.models import word2vec

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn import tree

import graphviz

from datetime import datetime
import time
from collections import Counter

import tba3102
import model_evaluation_utils as meu
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
import unsupervised_models


In [16]:
print('Text processing started at {}'.format(datetime.now()))

Text processing started at 2025-04-16 21:03:13.675326


In [17]:
# file_name = "cleaned-galaxy"
file_name = "labelled_dataset_apple_review"
# file_name = "cleaned-samsung_vs_pixel"
df = pd.read_csv(f'../data/{file_name}.csv')
df['Cleaned_Comment'].replace(r'^(\s)+$', np.nan, regex=True, inplace=True)
df = df.dropna(subset=['Cleaned_Comment']).reset_index(drop=True)
df = df[df["Cleaned_Comment"].apply(lambda x: len(x) > 2)]
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Cleaned_Comment'].replace(r'^(\s)+$', np.nan, regex=True, inplace=True)


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Comment ID,Comment Body,Author,Upvotes,Timestamp,Cleaned_Comment,ClusterLabel,sentiment_polarity,sentiments,actual_sentiments
0,11,74,lnuzh3l,It does not look choppy.,Deleted,-1,1726728178,look happy,5,0.8,positive,positive
1,48,68,lntlqh6,Well I’m coming from a 12 mini an upgrade to a...,Niightstalker,5,1726705158,well come mind upgrade never phone feel like h...,12,0.8,positive,negative
2,13,34,lnulyd1,Nah they will do 75hz so strikes a good balanc...,aspenextreme03,5,1726720205,ah strike good balance year,6,0.7,positive,positive
3,22,126,lnxsjys,Someone's not a fanatic for telling you they t...,FlarblesGarbles,1,1726771385,someone fantastic tell think happy,7,0.6,positive,negative
4,55,206,lntg62d,Literally any time you swipe or scroll or any ...,TwoMoreMinutes,9,1726703069,literally time wipe stroll kind animation yea ...,14,0.6,positive,negative
5,51,245,lnv8d40,So you're fine with a $800 iPhone having a low...,Shaykea,1,1726734380,fine phone lower refresh rate andros good reas...,12,0.558333,positive,negative
6,33,89,lnv40ts,"When you adjust for inflation, basically any c...",Forte69,1,1726731259,adjust inflation basically computing device in...,10,0.55,positive,positive
7,0,0,lnt9dc3,You only need one guess to know the theme of t...,Portatort,263,1726700551,need one guess know theme top comment post,0,0.5,positive,positive
8,15,78,lnu6g18,"> usb2\n\nI agree, and yet realistically how m...",SkyGuy182,0,1726713084,I agree yet realistically many people day unlo...,6,0.5,positive,negative
9,23,150,lnwfg11,"""Good thing I have this 120hz display so i can...",bran_the_man93,3,1726755582,good thing I display I watch netflix fas sort ...,7,0.5,positive,negative


In [24]:
test_corpus = df["Cleaned_Comment"].to_list()
test_label = df["actual_sentiments"].to_list()

In [25]:
predicted_sentiments = [unsupervised_models.analyze_sentiment_vader_lexicon(review, threshold=0.4, verbose=False) for review in test_corpus]
predicted_sentiments[:10]

['positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'negative',
 'positive',
 'negative',
 'negative',
 'positive']

In [26]:
def classify(predictions, test_label, result_row, model_name):
    
    acc = accuracy_score(test_label, predictions)
    print('Testing Accuracy:', acc)
    f1 = f1_score(test_label, predictions, average="macro")
    print('f1_score:', f1)
    result_row[f'Testing score'] = round(acc, 3)
    result_row[f"f1_score"] = round(f1, 3)
    # print(result_row)
    
    unique_classes = list(set(test_label))    
    
    meu.get_metrics(true_labels=test_label, predicted_labels=predictions)    
    
    meu.display_classification_report(true_labels=test_label, predicted_labels=predictions, classes=unique_classes)
    
    print(metrics.confusion_matrix(y_true=test_label, y_pred=predictions, labels=unique_classes))
    

In [27]:
unsupervised_learning_models = {'Textblob': unsupervised_models.analyze_sentiment_vader_lexicon,
                              'Afinn': unsupervised_models.analyze_sentiment_afinn,
                              'Sentiword': unsupervised_models.analyze_sentiment_sentiwordnet_lexicon,
                              'Vader': unsupervised_models.analyze_sentiment_vader_lexicon}

In [28]:
columns = ["Unsupervised Model", "Testing score", "f1_score"]
    
print(columns)

df_results = pd.DataFrame(columns=columns)
# df_results.astype({'Unsupervised Model':'string'}, copy=False)

# tba3102.set_default_pandas_options(max_colwidth=200, max_columns=len(unsupervised_learning_models)*4, width=len(unsupervised_learning_models)*1000)

['Unsupervised Model', 'Testing score', 'f1_score']


In [29]:
for name, model in unsupervised_learning_models.items():        
    
    print('*** {} ***'.format(name))
    
    result_row = {"Unsupervised Model": name}
    
    predicted_sentiments = [model(review) for review in test_corpus]     
    classify(predicted_sentiments, test_label, result_row, name)             

    print('_'*120)
    print('\n\n\n')
        
    df_results = pd.concat([df_results, pd.DataFrame([result_row])], ignore_index=True)

*** Textblob ***
Testing Accuracy: 0.64
f1_score: 0.5714285714285714
Accuracy: 0.64
Precision: 0.6556
Recall: 0.64
F1 Score: 0.6469
              precision    recall  f1-score   support

    positive       0.38      0.43      0.40         7
    negative       0.76      0.72      0.74        18

    accuracy                           0.64        25
   macro avg       0.57      0.58      0.57        25
weighted avg       0.66      0.64      0.65        25

[[ 3  4]
 [ 5 13]]
________________________________________________________________________________________________________________________




*** Afinn ***


  df_results = pd.concat([df_results, pd.DataFrame([result_row])], ignore_index=True)


Testing Accuracy: 0.6
f1_score: 0.5941558441558441
Accuracy: 0.6
Precision: 0.76
Recall: 0.6
F1 Score: 0.6156
              precision    recall  f1-score   support

    positive       0.40      0.86      0.55         7
    negative       0.90      0.50      0.64        18

    accuracy                           0.60        25
   macro avg       0.65      0.68      0.59        25
weighted avg       0.76      0.60      0.62        25

[[6 1]
 [9 9]]
________________________________________________________________________________________________________________________




*** Sentiword ***
Testing Accuracy: 0.72
f1_score: 0.7028862478777589
Accuracy: 0.72
Precision: 0.8046
Recall: 0.72
F1 Score: 0.7343
              precision    recall  f1-score   support

    positive       0.50      0.86      0.63         7
    negative       0.92      0.67      0.77        18

    accuracy                           0.72        25
   macro avg       0.71      0.76      0.70        25
weighted avg      

In [30]:
print('Text processing ended at {}'.format(datetime.now()))

Text processing ended at 2025-04-16 21:04:27.730882


In [31]:
df_results

Unnamed: 0,Unsupervised Model,Testing score,f1_score
0,Textblob,0.64,0.571
1,Afinn,0.6,0.594
2,Sentiword,0.72,0.703
3,Vader,0.64,0.571
