# Loading

In [115]:
import pandas as pd
import ast

def load_bunch() -> dict[pd.DataFrame]:
    apex = pd.read_csv('processed_data/apex_ad2600_dvd_player_updated.csv')
    canon = pd.read_csv('processed_data/canon_g3_updated.csv')
    nikon = pd.read_csv('processed_data/nikon_coolpix_4300_updated.csv')
    nokia = pd.read_csv('processed_data/nokia_6610_updated.csv')
    nomad = pd.read_csv('processed_data/nomad_jukebox_zen_xtra_updated.csv')
    return {
        "apex": apex,
        "canon": canon,
        "nikon": nikon,
        "nokia": nokia,
        "nomad": nomad
    }

def get_master_df(sentiments_only: bool = True) -> pd.DataFrame:
    bunch = load_bunch()
    master_df = pd.concat(bunch.values(), ignore_index=True)
    master_df['sentiment_dict'] = master_df['sentiment_dict'].apply(ast.literal_eval)
    if sentiments_only:
        master_df = master_df[master_df['sentiment_dict'].apply(lambda x: bool(x))]
    return master_df

master_df = get_master_df(sentiments_only = False)
print(master_df.shape)
# display(master_df.head(3))


# total = 0
# bunch: dict = load_bunch()
# for name, df in bunch.items():
#     df['sentiment_dict'] = df['sentiment_dict'].apply(ast.literal_eval)
#     all_sentiment = df[df['sentiment_dict'].apply(lambda x: bool(x))]
#     print("Total sentences: ", df.shape[0])
#     print("Sentences with sentiment: ", all_sentiment.shape[0])
#     total += all_sentiment.shape[0]
# print(total)


(3943, 15)


# Analyze the sentiment distribution

In [116]:
import numpy as np
# Binning as negative neutral positive
# Define conditions
conditions = [
    master_df['sentiment_total'] > 0,  # Positive sentiment
    master_df['sentiment_total'] < 0,  # Negative sentiment
    master_df['sentiment_total'] == 0  # Neutral sentiment
]

# Define corresponding labels
labels = ['positive', 'negative', 'neutral']

# Create a new column for binned sentiment
master_df['sentiment_category'] = np.select(conditions, labels)
master_df['sentiment_category'].value_counts()

sentiment_category
neutral     2223
positive    1082
negative     638
Name: count, dtype: int64

# Tokenize and remove stopwords

In [117]:
from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess

master_df['sentence'] = master_df['sentence'].apply(lambda x: remove_stopwords(str(x)))
master_df['tokenized_sentence'] = master_df['sentence'].apply(simple_preprocess)

# Training a Word2Vec Model

In [127]:
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import time

train_df, test_df = train_test_split(master_df, test_size=0.2, random_state=1111)

start_time = time.time()

# Train the Word2Vec Model
w2v_model = Word2Vec(
    sentences=train_df['tokenized_sentence'],
    vector_size=100,
    workers=3,
    window=3,
    min_count=1,
    sg=1, # Skip Gram
)
print("Time taken to train word2vec model: " + str(time.time() - start_time))

Time taken to train word2vec model: 0.4632248878479004


Creating sentence vectors from word embeddings

In [None]:
def sentence_vector(sentence, model):
    valid_words = [word for word in sentence if word in model.wv]
    if len(valid_words) == 0:
        return np.zeros(model.vector_size)  # Return zero vector if no valid words
    return np.mean([model.wv[word] for word in valid_words], axis=0)

train_df['sentence_vector'] = train_df['tokenized_sentence'].apply(lambda x: sentence_vector(x, w2v_model))
test_df['sentence_vector'] = test_df['tokenized_sentence'].apply(lambda x: sentence_vector(x, w2v_model))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Training and testing data
X_train = np.vstack(train_df['sentence_vector'])
y_train = train_df['sentiment_category']
X_test = np.vstack(test_df['sentence_vector'])
y_test = test_df['sentiment_category']

# Class weight calculation for balanced models
class_weights = compute_class_weight(
    'balanced', classes=np.unique(y_train), y=y_train
)

# Using 3 models
models = {
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced', random_state=42),
    "Logistic Regression": LogisticRegression(class_weight='balanced', random_state=42),
    "Random Forest": RandomForestClassifier(class_weight='balanced', random_state=42)
}

# Train and evaluate models
def train_and_evaluate(models, X_train, y_train, X_test, y_test):
    for model_name, model in models.items():
        print(f"Training {model_name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(f"\n{model_name} Accuracy: {accuracy_score(y_test, y_pred):.4f}")
        print(f"{model_name} Classification Report:\n{classification_report(y_test, y_pred)}\n")

train_and_evaluate(models, X_train, y_train, X_test, y_test)

{0: 2.065487884741323, 1: 0.5893124065769806, 2: 1.221060782036392}
Training Decision Tree...

Decision Tree Accuracy: 0.4639
Decision Tree Classification Report:
              precision    recall  f1-score   support

    negative       0.20      0.18      0.19       129
     neutral       0.58      0.61      0.59       439
    positive       0.35      0.35      0.35       221

    accuracy                           0.46       789
   macro avg       0.38      0.38      0.38       789
weighted avg       0.46      0.46      0.46       789


Training Logistic Regression...

Logistic Regression Accuracy: 0.4195
Logistic Regression Classification Report:
              precision    recall  f1-score   support

    negative       0.25      0.16      0.19       129
     neutral       0.63      0.39      0.48       439
    positive       0.32      0.63      0.42       221

    accuracy                           0.42       789
   macro avg       0.40      0.39      0.37       789
weighted avg    

# Training a Doc2Vec Model

In [None]:
from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

import random

train_df, test_df = train_test_split(master_df, test_size=0.2, random_state=1337)

d2v_model = Doc2Vec(
    vector_size=100,
    workers=3,
    window=3,
    min_count=1,
    epochs=20
)

def tag_sentence(tokenized_sentence, tag):
    return TaggedDocument(tokenized_sentence, tag)
# tag_sentence(utils.simple_preprocess("this is a sentence"), [1])
tagged_sentences = []

pos_count = 0
neg_count = 0
neu_count = 0
for _, row in train_df.iterrows():
    if row['sentiment_category'] == 'positive':
        pos_count += 1
        tagged_sentence = tag_sentence(row['tokenized_sentence'], ["POS_" + str(pos_count)])
    elif row['sentiment_category'] == 'negative':
        neg_count += 1
        tagged_sentence = tag_sentence(row['tokenized_sentence'], ["NEG_" + str(neg_count)])
    elif row['sentiment_category'] == 'neutral':
        neu_count += 1
        tagged_sentence = tag_sentence(row['tokenized_sentence'], ["NEU_" + str(neu_count)])
    tagged_sentences.append(tagged_sentence)

d2v_model.build_vocab(tagged_sentences)

In [218]:
def sentences_perm(sentences):
    shuffled = list(sentences)
    random.shuffle(shuffled)
    return shuffled

for epoch in range(10):
    d2v_model.train(sentences_perm(tagged_sentences), total_examples=d2v_model.corpus_count, epochs=10)

In [219]:
print(pos_count)
print(neg_count)
print(neu_count)

d2v_model.wv.most_similar("feel")

854
514
1786


[('plastic', 0.7817115187644958),
 ('uneasy', 0.7758155465126038),
 ('solid', 0.7733110785484314),
 ('mechanically', 0.7115648984909058),
 ('comfy', 0.6967466473579407),
 ('flimsy', 0.6855295896530151),
 ('sh', 0.6738451719284058),
 ('dense', 0.6725516319274902),
 ('sounding', 0.668586790561676),
 ('substance', 0.6619100570678711)]

In [220]:
train_array = []
train_labels = []

for i in range(pos_count):
    prefix_train = 'POS_' + str(i + 1)
    train_array.append(d2v_model[prefix_train])
    train_labels.append(1)

for i in range(neg_count):
    prefix_train = 'NEG_' + str(i + 1)
    train_array.append(d2v_model[prefix_train])
    train_labels.append(-1)

for i in range(neu_count):
    prefix_train = 'NEU_' + str(i + 1)
    train_array.append(d2v_model[prefix_train])
    train_labels.append(0)

In [221]:
test_array = []
test_labels = []

for _, row in test_df.iterrows():
    test_vector = d2v_model.infer_vector(row['tokenized_sentence'])  # Infer vector for test sentence
    test_array.append(test_vector)
    if row['sentiment_category'] == 'positive':
        test_labels.append(1)
    elif row['sentiment_category'] == 'negative':
        test_labels.append(-1)
    elif row['sentiment_category'] == 'neutral':
        test_labels.append(0)


# Calculate class weights
class_weights = compute_class_weight(
    class_weight='balanced',  # Balances weights inversely proportional to class frequencies
    classes=np.unique(train_labels),  # Unique class labels
    y=train_labels                # Training labels
)

# Convert to dictionary format for use in classifiers
class_weight_dict = {label: weight for label, weight in zip(np.unique(train_labels), class_weights)}
print("Class Weights:", class_weight_dict)

# Logistic Regression
classifier = LogisticRegression(class_weight='balanced')
classifier.fit(train_array, train_labels)
y_pred = classifier.predict(test_array)
print("Logistic Regression Accuracy:", accuracy_score(test_labels, y_pred))
print("Classification Report:\n", classification_report(test_labels, y_pred))

# Decision Tree
classifier = DecisionTreeClassifier(class_weight='balanced')
classifier.fit(train_array, train_labels)
y_pred = classifier.predict(test_array)
print("Decision Tree Accuracy:", accuracy_score(test_labels, y_pred))
print("Classification Report:\n", classification_report(test_labels, y_pred))

# Random Forest
classifier = RandomForestClassifier(class_weight='balanced')
classifier.fit(train_array, train_labels)
y_pred = classifier.predict(test_array)
print("Random Forest Accuracy:", accuracy_score(test_labels, y_pred))
print("Classification Report:\n", classification_report(test_labels, y_pred))

Class Weights: {-1: 2.0453955901426717, 0: 0.5886524822695035, 1: 1.2310694769711163}
Logistic Regression Accuracy: 0.5640050697084917
Classification Report:
               precision    recall  f1-score   support

          -1       0.38      0.05      0.09       124
           0       0.56      0.98      0.72       437
           1       0.91      0.04      0.08       228

    accuracy                           0.56       789
   macro avg       0.62      0.36      0.29       789
weighted avg       0.63      0.56      0.43       789

Decision Tree Accuracy: 0.5069708491761724
Classification Report:
               precision    recall  f1-score   support

          -1       0.07      0.02      0.04       124
           0       0.57      0.80      0.67       437
           1       0.34      0.20      0.25       228

    accuracy                           0.51       789
   macro avg       0.33      0.34      0.32       789
weighted avg       0.43      0.51      0.45       789

Random Fores

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Giving up... GLOVE pretrained try

In [223]:
import gensim.downloader as api
model = api.load("glove-twitter-25")
model.most_similar(positive=['fruit', 'flower'], topn=1)



[('cherry', 0.9183273911476135)]

In [228]:
import gensim.downloader as api
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load GloVe model
# model = api.load("glove-twitter-25")

# Sample data
tokenized_sentences = master_df['tokenized_sentence']  # Your tokenized sentences
sentiment_labels = master_df['sentiment_category']     # Corresponding sentiment labels

# Generate sentence embeddings
def sentence_embedding(sentence_tokens, model):
    embeddings = [model[word] for word in sentence_tokens if word in model]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

X = [sentence_embedding(tokens, model) for tokens in tokenized_sentences]
y = sentiment_labels

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train the classifier
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy:  0.6070975918884665
              precision    recall  f1-score   support

    negative       0.29      0.04      0.07       128
     neutral       0.63      0.88      0.73       445
    positive       0.55      0.39      0.46       216

    accuracy                           0.61       789
   macro avg       0.49      0.43      0.42       789
weighted avg       0.55      0.61      0.55       789

Accuracy:  0.4752851711026616
              precision    recall  f1-score   support

    negative       0.20      0.22      0.21       128
     neutral       0.61      0.60      0.60       445
    positive       0.38      0.38      0.38       216

    accuracy                           0.48       789
   macro avg       0.40      0.40      0.40       789
weighted avg       0.48      0.48      0.48       789

Accuracy:  0.6197718631178707
              precision    recall  f1-score   support

    negative       0.21      0.02      0.04       128
     neutral       0.63      0.89      