# Loading

And some additional preprocessing:
- Binning
- Tokenization
- Resampling (Class Balancing)

In [2]:
import pandas as pd
import numpy as np
import ast

from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess

def load_bunch() -> dict[pd.DataFrame]:
    apex = pd.read_csv('processed_data/apex_ad2600_dvd_player_updated.csv')
    canon = pd.read_csv('processed_data/canon_g3_updated.csv')
    nikon = pd.read_csv('processed_data/nikon_coolpix_4300_updated.csv')
    nokia = pd.read_csv('processed_data/nokia_6610_updated.csv')
    nomad = pd.read_csv('processed_data/nomad_jukebox_zen_xtra_updated.csv')
    return {
        "apex": apex,
        "canon": canon,
        "nikon": nikon,
        "nokia": nokia,
        "nomad": nomad
    }

def get_master_df(sentiments_only: bool = True) -> pd.DataFrame:
    bunch = load_bunch()
    master_df = pd.concat(bunch.values(), ignore_index=True)
    master_df['sentiment_dict'] = master_df['sentiment_dict'].apply(ast.literal_eval)
    if sentiments_only:
        master_df = master_df[master_df['sentiment_dict'].apply(lambda x: bool(x))]
    return master_df

master_df = get_master_df(sentiments_only = False)

# Binning as negative neutral positive
# Define conditions
conditions = [
    master_df['sentiment_total'] > 0,  # Positive sentiment
    master_df['sentiment_total'] < 0,  # Negative sentiment
    master_df['sentiment_total'] == 0  # Neutral sentiment
]

# Define corresponding labels
labels = ['positive', 'negative', 'neutral']

# Create a new column for binned sentiment
master_df['sentiment_category'] = np.select(conditions, labels)
master_df['sentiment_category'].value_counts()

# Tokenization and removal of stopwords
master_df['sentence'] = master_df['sentence'].apply(lambda x: remove_stopwords(str(x)))
master_df['tokenized_sentence'] = master_df['sentence'].apply(simple_preprocess)


Balancing

In [3]:
from sklearn.utils import resample

positive_df = master_df[master_df['sentiment_category'] == 'positive']
negative_df = master_df[master_df['sentiment_category'] == 'negative']
neutral_df = master_df[master_df['sentiment_category'] == 'neutral']

max_size = 2000

# Resample each class to match the majority class size
positive_upsampled = resample(positive_df, replace=True, n_samples=max_size, random_state=42)
negative_upsampled = resample(negative_df, replace=True, n_samples=max_size, random_state=42)
neutral_upsampled = resample(neutral_df, replace=True, n_samples=max_size, random_state=42)

# Combine the upsampled dataframes
balanced_master_df = pd.concat([positive_upsampled, negative_upsampled, neutral_upsampled])

# Shuffle the dataset
balanced_master_df = balanced_master_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the new class distribution
print(balanced_master_df['sentiment_category'].value_counts())
display(balanced_master_df.head())

sentiment_category
positive    2000
negative    2000
neutral     2000
Name: count, dtype: int64


Unnamed: 0.1,Unnamed: 0,title,sentence,sentiment_dict,sentiment_total,[u],[p],[s],[cc],[cs],annotations,title_input_ids,title_attention_mask,sentence_input_ids,sentence_attention_mask,sentiment_category,tokenized_sentence
0,1526,a great player excellent sound quality hovewer...,know people software awesome,{'software': 3},3,False,False,False,False,False,software[+3],"[101, 1037, 2307, 2447, 6581, 2614, 3737, 2521...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...","[101, 1045, 2079, 2025, 2113, 2054, 2060, 2111...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",positive,"[know, people, software, awesome]"
1,55,bad service,send camera nikon service 6 week diagnose problem,{'servicing': -2},-2,False,False,False,False,False,servicing[-2],"[101, 2919, 2326, 102, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 1045, 4604, 2026, 4950, 2000, 23205, 223...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",negative,"[send, camera, nikon, service, week, diagnose,..."
2,269,so far it s sweet,s easy pop open song listen,{'case': 2},2,False,True,False,False,False,case[+2][p],"[101, 2061, 2521, 2009, 1055, 4086, 102, 0, 0,...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 2009, 1055, 3733, 2438, 2000, 3769, 2009...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",positive,"[easy, pop, open, song, listen]"
3,287,no picture and or no sound try ip button on th...,product month yesterday stop work,{'product': -3},-3,True,False,False,False,False,product[-3][u],"[101, 2053, 3861, 1998, 2030, 2053, 2614, 3046...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...","[101, 2057, 2031, 2031, 2023, 4031, 2005, 2058...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",negative,"[product, month, yesterday, stop, work]"
4,34,good phone so-so service,reason ?,{},0,False,False,False,False,False,,"[101, 2204, 3042, 2061, 1011, 2061, 2326, 102,...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[101, 3114, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",neutral,[reason]


# Training a Word2Vec Model

In [4]:
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import time

train_df, test_df = train_test_split(balanced_master_df, test_size=0.2, random_state=1111)

start_time = time.time()

# Train the Word2Vec Model
w2v_model = Word2Vec(
    sentences=train_df['tokenized_sentence'],
    vector_size=150,
    workers=3,
    window=3,
    min_count=1,
    sg=1, # Skip Gram
)
print("Time taken to train word2vec model: " + str(time.time() - start_time))

Time taken to train word2vec model: 0.5068459510803223


Creating sentence vectors from word embeddings

In [5]:
def sentence_vector(sentence, model):
    valid_words = [word for word in sentence if word in model.wv]
    if len(valid_words) == 0:
        return np.zeros(model.vector_size)  # Return zero vector if no valid words
    return np.mean([model.wv[word] for word in valid_words], axis=0) # Takes the average of all the word vectors to create a sentence vector

train_df['sentence_vector'] = train_df['tokenized_sentence'].apply(lambda x: sentence_vector(x, w2v_model))
test_df['sentence_vector'] = test_df['tokenized_sentence'].apply(lambda x: sentence_vector(x, w2v_model))

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, accuracy_score, f1_score

training_vectors = np.vstack(train_df['sentence_vector'])
training_categories = train_df['sentiment_category']

testing_vectors = np.vstack(test_df['sentence_vector'])
testing_categories = test_df['sentiment_category']

models = {
    "Decision Tree": DecisionTreeClassifier(random_state=1111),
    "Logistic Regression": LogisticRegression(random_state=1111),
    "Random Forest": RandomForestClassifier(random_state=1111)
}

# Train and evaluate models
def train_and_evaluate(models, X_train, y_train, X_test, y_test):
    for model_name, model in models.items():
        print(f"Training {model_name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        print(f"\n{model_name} Accuracy: {accuracy_score(y_test, y_pred):.4f}")
        print(f"{model_name} F1-Score: {f1_score(y_test, y_pred, average='weighted'):.4f}\n")
        print(f"{model_name} Classification Report:\n{classification_report(y_test, y_pred)}\n")

train_and_evaluate(models, training_vectors, training_categories, testing_vectors, testing_categories)

Training Decision Tree...

Decision Tree Accuracy: 0.8908
Decision Tree F1-Score: 0.8899

Decision Tree Classification Report:
              precision    recall  f1-score   support

    negative       0.91      0.96      0.94       398
     neutral       0.88      0.81      0.85       386
    positive       0.88      0.89      0.89       416

    accuracy                           0.89      1200
   macro avg       0.89      0.89      0.89      1200
weighted avg       0.89      0.89      0.89      1200


Training Logistic Regression...

Logistic Regression Accuracy: 0.4783
Logistic Regression F1-Score: 0.4802

Logistic Regression Classification Report:
              precision    recall  f1-score   support

    negative       0.48      0.50      0.49       398
     neutral       0.41      0.45      0.43       386
    positive       0.56      0.48      0.52       416

    accuracy                           0.48      1200
   macro avg       0.48      0.48      0.48      1200
weighted avg  

# Training a Doc2Vec Model

In [9]:
from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

import random

train_df, test_df = train_test_split(balanced_master_df, test_size=0.2, random_state=1111)

d2v_model = Doc2Vec(
    vector_size=100,
    workers=3,
    window=5,
    min_count=1,
    epochs=40
)

def tag_sentence(tokenized_sentence, tag):
    return TaggedDocument(tokenized_sentence, tag)
tagged_sentences = []

pos_count = 0
neg_count = 0
neu_count = 0
for _, row in train_df.iterrows():
    if row['sentiment_category'] == 'positive':
        pos_count += 1
        tagged_sentence = tag_sentence(row['tokenized_sentence'], ["POS_" + str(pos_count)])
    elif row['sentiment_category'] == 'negative':
        neg_count += 1
        tagged_sentence = tag_sentence(row['tokenized_sentence'], ["NEG_" + str(neg_count)])
    elif row['sentiment_category'] == 'neutral':
        neu_count += 1
        tagged_sentence = tag_sentence(row['tokenized_sentence'], ["NEU_" + str(neu_count)])
    tagged_sentences.append(tagged_sentence)

d2v_model.build_vocab(tagged_sentences)

In [10]:
def sentences_perm(sentences):
    shuffled = list(sentences)
    random.shuffle(shuffled)
    return shuffled

d2v_model.train(tagged_sentences, total_examples=d2v_model.corpus_count, epochs=40)

In [11]:
print(pos_count)
print(neg_count)
print(neu_count)

d2v_model.wv.most_similar("feel")

1584
1602
1614


[('solid', 0.8693687915802002),
 ('flimsy', 0.8567082285881042),
 ('plastic', 0.8540900945663452),
 ('uneasy', 0.8526702523231506),
 ('cute', 0.8403142094612122),
 ('comfy', 0.8125000596046448),
 ('tiny', 0.8041970133781433),
 ('mechanically', 0.7985464334487915),
 ('autofocus', 0.7975852489471436),
 ('awkward', 0.7911451458930969)]

In [12]:
train_array = []
train_labels = []

for i in range(pos_count):
    prefix_train = 'POS_' + str(i + 1)
    train_array.append(d2v_model[prefix_train])
    train_labels.append(1)

for i in range(neg_count):
    prefix_train = 'NEG_' + str(i + 1)
    train_array.append(d2v_model[prefix_train])
    train_labels.append(-1)

for i in range(neu_count):
    prefix_train = 'NEU_' + str(i + 1)
    train_array.append(d2v_model[prefix_train])
    train_labels.append(0)

In [13]:
test_array = []
test_labels = []

for _, row in test_df.iterrows():
    test_vector = d2v_model.infer_vector(row['tokenized_sentence'])  # Infer vector for test sentence
    test_array.append(test_vector)
    if row['sentiment_category'] == 'positive':
        test_labels.append(1)
    elif row['sentiment_category'] == 'negative':
        test_labels.append(-1)
    elif row['sentiment_category'] == 'neutral':
        test_labels.append(0)


# # Calculate class weights
# class_weights = compute_class_weight(
#     class_weight='balanced',  # Balances weights inversely proportional to class frequencies
#     classes=np.unique(train_labels),  # Unique class labels
#     y=train_labels                # Training labels
# )

# # Convert to dictionary format for use in classifiers
# class_weight_dict = {label: weight for label, weight in zip(np.unique(train_labels), class_weights)}
# print("Class Weights:", class_weight_dict)

# Decision Tree
# classifier = DecisionTreeClassifier(class_weight='balanced')

#------------------------------

# models = {
#     "Decision Tree": DecisionTreeClassifier(random_state=42),
#     "Logistic Regression": LogisticRegression(random_state=42),
#     "Random Forest": RandomForestClassifier(random_state=42)
# }

# classifier = DecisionTreeClassifier()
# classifier.fit(train_array, train_labels)
# y_pred = classifier.predict(test_array)
# print("Decision Tree Accuracy:", accuracy_score(test_labels, y_pred))
# print("Classification Report:\n", classification_report(test_labels, y_pred))

# # Logistic Regression
# # classifier = LogisticRegression(class_weight='balanced')
# classifier = LogisticRegression()
# classifier.fit(train_array, train_labels)
# y_pred = classifier.predict(test_array)
# print("Logistic Regression Accuracy:", accuracy_score(test_labels, y_pred))
# print("Classification Report:\n", classification_report(test_labels, y_pred))


# # Random Forest
# # classifier = RandomForestClassifier(class_weight='balanced')
# classifier = RandomForestClassifier()
# classifier.fit(train_array, train_labels)
# y_pred = classifier.predict(test_array)
# print("Random Forest Accuracy:", accuracy_score(test_labels, y_pred))
# print("Classification Report:\n", classification_report(test_labels, y_pred))

train_and_evaluate(models, train_array, train_labels, test_array, test_labels)

Training Decision Tree...

Decision Tree Accuracy: 0.4100
Decision Tree F1-Score: 0.4108

Decision Tree Classification Report:
              precision    recall  f1-score   support

          -1       0.46      0.43      0.44       398
           0       0.36      0.40      0.38       386
           1       0.42      0.40      0.41       416

    accuracy                           0.41      1200
   macro avg       0.41      0.41      0.41      1200
weighted avg       0.41      0.41      0.41      1200


Training Logistic Regression...

Logistic Regression Accuracy: 0.5575
Logistic Regression F1-Score: 0.5563

Logistic Regression Classification Report:
              precision    recall  f1-score   support

          -1       0.55      0.60      0.58       398
           0       0.52      0.47      0.50       386
           1       0.59      0.59      0.59       416

    accuracy                           0.56      1200
   macro avg       0.56      0.56      0.56      1200
weighted avg  

# GloVe pretrained model

In [14]:
import gensim.downloader as api
model = api.load("glove-twitter-25")
model.most_similar(positive=['fruit', 'flower'], topn=1)

[('cherry', 0.9183273911476135)]

In [16]:
import gensim.downloader as api
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Load GloVe model
# model = api.load("glove-twitter-25")

# Sample data
tokenized_sentences = balanced_master_df['tokenized_sentence']  # Your tokenized sentences
sentiment_labels = balanced_master_df['sentiment_category']     # Corresponding sentiment labels

# Generate sentence embeddings
def sentence_embedding(sentence_tokens, model):
    embeddings = [model[word] for word in sentence_tokens if word in model]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

X = [sentence_embedding(tokens, model) for tokens in tokenized_sentences]
y = sentiment_labels

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Decision Tree
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):4f}")
print("F1: ", f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))

# Train the classifier
clf = LogisticRegression()
clf.fit(X_train, y_train)


y_pred = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):4f}")
print("F1: ", f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))


clf = RandomForestClassifier()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):4f}")
print("F1: ", f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))

Accuracy: 0.855000
F1:  0.8534687625641862
              precision    recall  f1-score   support

    negative       0.87      0.93      0.90       400
     neutral       0.87      0.76      0.81       400
    positive       0.83      0.88      0.85       400

    accuracy                           0.85      1200
   macro avg       0.86      0.85      0.85      1200
weighted avg       0.86      0.85      0.85      1200

Accuracy: 0.499167
F1:  0.4981465550647136
              precision    recall  f1-score   support

    negative       0.49      0.51      0.50       400
     neutral       0.49      0.44      0.46       400
    positive       0.52      0.55      0.54       400

    accuracy                           0.50      1200
   macro avg       0.50      0.50      0.50      1200
weighted avg       0.50      0.50      0.50      1200

Accuracy: 0.895000
F1:  0.8945353476771345
              precision    recall  f1-score   support

    negative       0.93      0.94      0.94       400
