In [19]:
import tba3102
import pandas as pd
import numpy as np

In [20]:
tba3102.set_default_pandas_options()

In [None]:
# file_name = "cleaned-galaxy_cluster_word2vec"
file_name = "cleaned-apple_review_cluster_word2vec"
# file_name = "cleaned-samsung_vs_pixel_cluster_word2vec"
df = pd.read_csv(f'../data/results/{file_name}.csv')
df['Cleaned_Comment'].replace(r'^(\s)+$', np.nan, regex=True, inplace=True)
df = df.dropna(subset=['Cleaned_Comment']).reset_index(drop=True)
df = df[df["Cleaned_Comment"].apply(lambda x: len(x) > 2)]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Cleaned_Comment'].replace(r'^(\s)+$', np.nan, regex=True, inplace=True)


In [22]:
df.head()

Unnamed: 0.1,Unnamed: 0,Comment ID,Comment Body,Author,Upvotes,Timestamp,Cleaned_Comment,ClusterLabel
0,0,m5u0cnr,"""Presenting, the new s25 series, now with the ...",EVD27,213,1736231000.0,present new series new gap camera bump well di...,50
1,1,m5ttncm,"More like s24 ultra rounded version, given the...",Miyukicc,67,1736227000.0,like ultra rounded version give minimal speck ...,22
2,2,m5tix2l,Same phone in a slightly new body (ultra) \n\n...,judgedavid90,397,1736223000.0,phone slightly new body ultra give I shit chip...,32
3,3,m5tahhm,"Looks great, now only if they would fix their ...",One-Patience-1187,83,1736220000.0,look great would fix camera oversaturate overe...,50
4,4,m5tut4z,The trade in deals better be out of this world,UrDoinGood2,24,1736228000.0,trade deal well world,46


In [23]:
from nltk.corpus import sentiwordnet as swn
import spacy
nlp = spacy.load('en_core_web_sm')

def analyze_sentiment_sentiwordnet_lexicon(review, verbose=False):

    # tokenize and POS tag text tokens
    tagged_text = [(token.text, token.tag_) for token in nlp(review)]
    pos_score = neg_score = token_count = obj_score = 0
    # get wordnet synsets based on POS tags
    # get sentiment scores if synsets are found
    for word, tag in tagged_text:

        ss_set = None
        if 'NN' in tag and list(swn.senti_synsets(word, 'n')):
            ss_set = list(swn.senti_synsets(word, 'n'))[0]
        elif 'VB' in tag and list(swn.senti_synsets(word, 'v')):
            ss_set = list(swn.senti_synsets(word, 'v'))[0]
        elif 'JJ' in tag and list(swn.senti_synsets(word, 'a')):
            ss_set = list(swn.senti_synsets(word, 'a'))[0]
        elif 'RB' in tag and list(swn.senti_synsets(word, 'r')):
            ss_set = list(swn.senti_synsets(word, 'r'))[0]

        # if senti-synset is found
        if ss_set:

            # add scores for all found synsets
            pos_score += ss_set.pos_score()
            neg_score += ss_set.neg_score()
            obj_score += ss_set.obj_score()
            token_count += 1

    # aggregate final scores
    if token_count == 0:
        # print("Warning: No tokens were processed for sentiment analysis.")
        return 0  # or handle this case as needed

    final_score = pos_score - neg_score
    # print("Token Count:", token_count)
    norm_final_score = round(float(final_score) / token_count, 2)
    final_sentiment = 'positive' if norm_final_score >= 0 else 'negative'

    if verbose:

        norm_obj_score = round(float(obj_score) / token_count, 2)
        norm_pos_score = round(float(pos_score) / token_count, 2)
        norm_neg_score = round(float(neg_score) / token_count, 2)

        print('SENTIMENT STATS:')
        print('Predicted Sentiment', final_sentiment)
        print('Objectivity', norm_obj_score)
        print('Positive', norm_pos_score)
        print('Negative', norm_neg_score)
        print('Overall', norm_final_score)

    return norm_final_score

In [24]:
df["sentiment_polarity"] = [analyze_sentiment_sentiwordnet_lexicon(review) for review in df["Cleaned_Comment"]]
df["sentiments"] = ['positive' if score >= 0.1 else 'neutral' for score in df["sentiment_polarity"]]
df["sentiments"] = ['negative' if score <= -0.1 else sentiment for score,sentiment in zip(df["sentiment_polarity"],df["sentiments"])]
# print(file_name)
# df["sentiments"].value_counts()

In [25]:
df["sentiment_polarity"].describe()

count    480.000000
mean       0.031167
std        0.107623
min       -0.880000
25%       -0.010000
50%        0.020000
75%        0.070000
max        0.620000
Name: sentiment_polarity, dtype: float64

In [26]:
df = df[(df["sentiments"] == "negative") | (df["sentiments"] == "positive")]
df = df.sort_values(by=["ClusterLabel","sentiments"]).reset_index(drop=True)
print(df["sentiments"].value_counts())
total = df["sentiments"].count()
pos = (df["sentiments"] == "positive").sum()
# print(pos)
print(pos/total)
print(1-pos/total)
df.head()

sentiments
positive    95
negative    27
Name: count, dtype: int64
0.7786885245901639
0.2213114754098361


Unnamed: 0.1,Unnamed: 0,Comment ID,Comment Body,Author,Upvotes,Timestamp,Cleaned_Comment,ClusterLabel,sentiment_polarity,sentiments
0,163,m6d2j46,more of the same old boring,sheldonxp2000,1,1736490000.0,old bore,0,0.12,positive
1,14,m5v8azi,The wallpapers are beautiful,UnhappyAd7832,3,1736256000.0,wallpaper beautiful,1,0.38,positive
2,180,m6xourd,Is it only me or someone else HATE those round...,FlacFanDAC,1,1736783000.0,I someone else hate round corner,2,-0.12,negative
3,15,m5y9a9f,All I want is Qi2 🥺,Fezzicc,3,1736289000.0,I want I,2,0.25,positive
4,243,m5vsouy,I'm pretty sure the elite chip is meant to be ...,ImawhaleCR,1,1736263000.0,I pretty sure elite chip mean fair bit efficie...,2,0.12,positive


galaxy -> 78%% positive
apple -> 67% positive

from this we can tell galaxy receives movre positive feedback on their phone products

In [27]:
df.to_csv(f"../data/results/{file_name}_with_sentiments.csv")