In [None]:
#Used Pytorch for execution
import re
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt_tab')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
nltk.download('stopwords')

import re
import unicodedata
from sklearn.model_selection import train_test_split
import scipy
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import MultinomialNB
import pickle
from sklearn.metrics import accuracy_score

In [None]:
data= pd.read_csv('/data.tsv',
                  sep='\t', on_bad_lines='skip', usecols=['star_rating','review_headline', 'review_body'],
                  memory_map=True)



In [3]:
data.head()

Unnamed: 0,star_rating,review_headline,review_body
0,5,Five Stars,Great product.
1,5,"Phffffffft, Phfffffft. Lots of air, and it's C...",What's to say about this commodity item except...
2,5,but I am sure I will like it.,"Haven't used yet, but I am sure I will like it."
3,1,and the shredder was dirty and the bin was par...,Although this was labeled as &#34;new&#34; the...
4,4,Four Stars,Gorgeous colors and easy to use


In [4]:
data_req = data.loc[:, ['review_body', 'star_rating']]
data_req.head()

Unnamed: 0,review_body,star_rating
0,Great product.,5
1,What's to say about this commodity item except...,5
2,"Haven't used yet, but I am sure I will like it.",5
3,Although this was labeled as &#34;new&#34; the...,1
4,Gorgeous colors and easy to use,4


In [5]:
data_req['star_rating']= pd.to_numeric(data['star_rating'], errors='coerce')
data_req['star_rating'].unique()

array([ 5.,  1.,  4.,  2.,  3., nan])

In [6]:
data_req.dropna(subset=["star_rating"], inplace=True)
data_req['star_rating'].unique()

array([5., 1., 4., 2., 3.])

In [7]:
data_req = data_req.dropna(subset=['review_body'])

In [8]:
balanced_data = data_req.groupby('star_rating').apply(lambda x: x.sample(n=50000, random_state=42)).reset_index(drop=True)
balanced_data.head()

  balanced_data = data_req.groupby('star_rating').apply(lambda x: x.sample(n=50000, random_state=42)).reset_index(drop=True)


Unnamed: 0,review_body,star_rating
0,The photo is deceiving - makes it look like a ...,1.0
1,Worst labels ever! I purchased these labels to...,1.0
2,This product broke in a very short time. It a...,1.0
3,The printer head is malfunctioning since the i...,1.0
4,When this item shipped to me I was very excite...,1.0


In [9]:
balanced_data['sentiment'] = balanced_data['star_rating'].apply(lambda x: 1 if x >= 4 else (2 if x <= 2 else 3))
print(balanced_data['star_rating'].value_counts())
print(balanced_data['sentiment'].value_counts())


star_rating
1.0    50000
2.0    50000
3.0    50000
4.0    50000
5.0    50000
Name: count, dtype: int64
sentiment
2    100000
1    100000
3     50000
Name: count, dtype: int64


In [10]:
#Expanding short forms

contractions_dict = { 'arent': 'are not', 'wont': 'will not', 'cant': 'can not', 'dont': 'do not', 'shant' : "shall not",
                     'wouldnt': 'would not', 'couldnt': 'could not', 'shouldnt': 'should not', 'isnt':'is not', 'im':'i am', 'mustnt': 'must not',
                     'didnt': 'did not', 'doesnt': 'does not',
                     'theyre': "they are"}
#character with diacritics
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c)!='Mn')
#Using the contraction function
def expand_contractions(text):
    if isinstance(text, str):
        for word in contractions_dict:
            text = text.replace(word, contractions_dict[word])
        return text



def cleaned_text(text):
    text = unicode_to_ascii(text.lower().strip())

    #URLs
    text = re.sub(r"\bhttps?:\/\/\S+|www\.\S+", " ", text)
    text = re.sub(
        r"[a-zA-Z0-9_\-\.]+@[a-zA-Z0-9_\-\.]+\.[a-zA-Z]{2,5}", " ", text
    )
    # Remove HTML tags with empty string
    text = re.sub(r"<.*?>", "", text)
    text = expand_contractions(text)
    # removes all non-alphabetical characters

    text = re.sub(r"[^a-zA-Z\s]+", "", text)
    # creating a space between a word and the punctuation following it
    text = re.sub(r"([?.!,¿])", r" \1 ", text)
    text = re.sub(r'[" "]+', " ", text)
    # remove extra spaces
    text = re.sub(" +", " ", text)

    text = text.strip()
    return text

final_clean = np.vectorize(cleaned_text)



In [11]:
balanced_data["review_body"] =balanced_data["review_body"].apply(final_clean)

In [12]:
req_stpwds = set(stopwords.words('english'))
#stop words removal and lemmatization
pattern = re.compile(r'\b('+r'|'.join(req_stpwds)+r')\b\s*')

def lem_text(text):
    lemmatizer = WordNetLemmatizer()
    words = word_tokenize(text)
    lemmatized_words= [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

def preprocess_text(text):
    text = pattern.sub('', text)
    text = lem_text(text)
    return text
preprocess_text_vect = np.vectorize(preprocess_text)

In [13]:
balanced_data["review_body"]= balanced_data["review_body"].apply(preprocess_text_vect)

In [14]:
balanced_data.head()

Unnamed: 0,review_body,star_rating,sentiment
0,photo deceiving make look like set pen fact on...,1.0,2
1,worst label ever purchased label try reading r...,1.0,2
2,product broke short ti ame also poor job getti...,1.0,2
3,printer head malfunctioning since installation...,1.0,2
4,item shipped excited outside great quality loo...,1.0,2


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Create TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(lowercase=True, max_features=1000)

# Filter positive and negative reviews
pos_reviews = balanced_data[balanced_data['sentiment'] == 1]
neg_reviews = balanced_data[balanced_data['sentiment'] == 2]
neu_reviews = balanced_data[balanced_data['sentiment'] == 3]

# Fit and transform the entire dataset
tfidf_matrix = tfidf_vectorizer.fit_transform(balanced_data['review_body'])

# Transform each subset
pos_tfidf = tfidf_vectorizer.transform(pos_reviews['review_body'])
neg_tfidf = tfidf_vectorizer.transform(neg_reviews['review_body'])
neu_tfidf = tfidf_vectorizer.transform(neu_reviews['review_body'])

# Split positive reviews
pos_x_train, pos_x_test = train_test_split(pos_tfidf, test_size=0.2, random_state=3)
# Split negative reviews
neg_x_train, neg_x_test = train_test_split(neg_tfidf, test_size=0.2, random_state=3)
# Split neutral reviews
neu_x_train, neu_x_test = train_test_split(neu_tfidf, test_size=0.2, random_state=3)

print(f"Positive reviews - Training set: {pos_x_train.shape}, Testing set: {pos_x_test.shape}")
print(f"Negative reviews - Training set: {neg_x_train.shape}, Testing set: {neg_x_test.shape}")
print(f"Neutral reviews - Training set: {neu_x_train.shape}, Testing set: {neu_x_test.shape}")


Positive reviews - Training set: (80000, 1000), Testing set: (20000, 1000)
Negative reviews - Training set: (80000, 1000), Testing set: (20000, 1000)
Neutral reviews - Training set: (40000, 1000), Testing set: (10000, 1000)


In [16]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [17]:
vec_king = wv['king']


In [18]:
pairs = [
    ('car', 'minivan'),   # a minivan is a kind of car
    ('car', 'bicycle'),   # still a wheeled vehicle
    ('car', 'airplane'),  # ok, no wheels, but still a vehicle
    ('car', 'car'),    # ... and so on
    ('car', 'communism'),
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))

'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'car'	1.00
'car'	'communism'	0.06


In [19]:
word_vector = wv["king"]  # Example: get the word embedding for "king"
print("First 10 dimensions of 'king':", word_vector[:10])  # Show a part of the vector


First 10 dimensions of 'king': [ 0.12597656  0.02978516  0.00860596  0.13964844 -0.02563477 -0.03613281
  0.11181641 -0.19824219  0.05126953  0.36328125]


In [20]:
result = wv.most_similar(positive=["king", "woman"], negative=["man"])

print(result)



[('queen', 0.7118192911148071), ('monarch', 0.6189674735069275), ('princess', 0.5902431011199951), ('crown_prince', 0.549946129322052), ('prince', 0.5377321243286133), ('kings', 0.5236843824386597), ('Queen_Consort', 0.5235944390296936), ('queens', 0.5181134343147278), ('sultan', 0.5098593235015869), ('monarchy', 0.5087411403656006)]


In [21]:
similarity = wv.similarity("excellent", "outstanding")
print(similarity)

0.5567486


In [22]:
similar_words =wv.most_similar("california")
print(similar_words[:5])

[('south_carolina', 0.6567358374595642), ('arizona', 0.6312313675880432), ('nevada', 0.6222927570343018), ('alabama', 0.6215481162071228), ('utah', 0.6204033493995667)]


In [23]:
print(wv.most_similar(positive=["brother","woman"], negative=["man"]))
print(wv.most_similar(positive=["Microsoft", "iPhone"], negative=["Apple"]))

[('sister', 0.8103213906288147), ('daughter', 0.7646753191947937), ('mother', 0.7524207830429077), ('son', 0.7238258123397827), ('niece', 0.7215942144393921), ('husband', 0.7141482830047607), ('father', 0.7066071629524231), ('aunt', 0.6844728589057922), ('cousin', 0.6844366192817688), ('eldest_daughter', 0.6790661215782166)]
[('Windows_Mobile', 0.6286555528640747), ('Windows_Vista', 0.6119047999382019), ('Windows', 0.6092869639396667), ('Windows_Phone', 0.603961169719696), ('WP7', 0.603832483291626), ('Internet_Explorer', 0.5924732685089111), ('MIcrosoft', 0.5920181274414062), ('Windows_Phone_7', 0.5891107320785522), ('Mircosoft', 0.5824356079101562), ('Microsoft_NSDQ_MSFT', 0.5787074565887451)]


In [24]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import numpy as np

# Prepare sentences for Word2Vec
sentences = [
    simple_preprocess(str(doc))
    for doc in balanced_data["review_body"]
    if isinstance(doc, str)
]

# Train the Word2Vec model
w2v_model = Word2Vec(
    sentences,
    vector_size=300,
    window=11,
    min_count=10
)

# Check semantic similarities for the same pairs
for w1, w2 in pairs:
    if w1 in w2v_model.wv.key_to_index and w2 in w2v_model.wv.key_to_index:
        print(f"{w1}\t{w2}\t{w2v_model.wv.similarity(w1, w2):.2f}")
    else:
        print(f"{w1}\t{w2}\tOne of these words is not in the vocabulary.")


car	minivan	One of these words is not in the vocabulary.
car	bicycle	0.44
car	airplane	0.64
car	car	1.00
car	communism	One of these words is not in the vocabulary.


In [25]:
# Check most similar words based on analogy
similar_result1 = w2v_model.wv.most_similar(positive=["brother", "woman"], negative=["man"])
print("Most similar words for ['brother', 'woman'] - ['man']:", similar_result1)

similar_result2 = w2v_model.wv.most_similar(positive=["king", "woman"], negative=["man"])
print("Most similar words for ['king', 'woman'] - ['man']:", similar_result2)

# Compute similarity between two words
similarity = w2v_model.wv.similarity("excellent", "outstanding")
print(f"The similarity between 'excellent' and 'outstanding' using our trained model: {similarity}")
similarity2 = w2v_model.wv.similarity("table" , "chair")
print(similarity2)

Most similar words for ['brother', 'woman'] - ['man']: [('mfc', 0.5985559821128845), ('dw', 0.5790482759475708), ('hl', 0.5758002400398254), ('tn', 0.575276792049408), ('mfcjdw', 0.5372292399406433), ('mfcdw', 0.5367903709411621), ('hldw', 0.5294637680053711), ('hldn', 0.5283377766609192), ('mfcjw', 0.5197377800941467), ('xerox', 0.5122743248939514)]
Most similar words for ['king', 'woman'] - ['man']: [('softcover', 0.45056256651878357), ('comfortably', 0.4200192987918854), ('lifestyle', 0.39905422925949097), ('men', 0.39489373564720154), ('sturdiness', 0.39269891381263733), ('allpurpose', 0.3918309807777405), ('salad', 0.38464289903640747), ('popular', 0.38433322310447693), ('variant', 0.3833228051662445), ('capless', 0.3808940052986145)]
The similarity between 'excellent' and 'outstanding' using our trained model: 0.8006961345672607
0.49567968


# Our model performs better as we are feeding only the relevent data.
# in the case of most similar word using semantic similarities the pretrained model performs better.
# for comparing excellent and outstanding our model(self trained) gives better results. 

In [26]:
pos_x_train, pos_x_test = train_test_split(pos_reviews['review_body'], test_size=0.2, random_state=3)
neg_x_train, neg_x_test = train_test_split(neg_reviews['review_body'], test_size=0.2, random_state=3)


In [None]:
# Combine positive and negative reviews for binary classification (Neu reviews can be excluded here)
binary_data = balanced_data[balanced_data['sentiment'].isin([1, 2])]  # Exclude neutral (3)

# Label 1 for Positive and 0 for Negative reviews
binary_data['label'] = binary_data['sentiment'].apply(lambda x: 1 if x == 1 else 0)

# Create a feature matrix with TF-IDF
X_tfidf = tfidf_vectorizer.transform(binary_data['review_body'])
y = binary_data['label']


In [28]:
sentences = [simple_preprocess(str(doc)) for doc in binary_data["review_body"]]
self_w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)

In [29]:
def get_avg_w2v_features(text, model, is_pretrained=False, vector_size=100):
    tokens = simple_preprocess(str(text))
    vectors = []
    
    if is_pretrained:
        for word in tokens:
            if word in model:
                vectors.append(model[word])
    else:
        for word in tokens:
            if word in model.wv:
                vectors.append(model.wv[word])
    
    return np.mean(vectors, axis=0) if vectors else np.zeros(vector_size)

In [33]:
from tqdm import tqdm
X_self_w2v = np.array([get_avg_w2v_features(text, self_w2v_model) for text in tqdm(binary_data['review_body'])])
X_pretrained_w2v = np.array([get_avg_w2v_features(text, wv, is_pretrained=True, vector_size=300) for text in tqdm(binary_data['review_body'])])

100%|██████████| 200000/200000 [00:22<00:00, 8722.79it/s] 
100%|██████████| 200000/200000 [00:23<00:00, 8463.57it/s] 


In [36]:
X_train_self, X_test_self, y_train, y_test = train_test_split(X_self_w2v, binary_data['label'], test_size=0.2, random_state=42)
X_train_pre, X_test_pre, _, _ = train_test_split(X_pretrained_w2v, binary_data['label'], test_size=0.2, random_state=42)
perceptron_self = Perceptron(max_iter=1000, random_state=42)
perceptron_self.fit(X_train_self, y_train)
accuracy_self = accuracy_score(y_test, perceptron_self.predict(X_test_self))
perceptron_pre = Perceptron(max_iter=1000, random_state=42)
perceptron_pre.fit(X_train_pre, y_train)
accuracy_pre = accuracy_score(y_test, perceptron_pre.predict(X_test_pre))
print(f"Accuracy (Self-trained Word2Vec): {accuracy_self:.4f}")
print(f"Accuracy (Pre-trained Word2Vec): {accuracy_pre:.4f}")

Accuracy (Self-trained Word2Vec): 0.7735
Accuracy (Pre-trained Word2Vec): 0.7435


In [42]:
import numpy as np
import gensim.downloader as api
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm  # Progress tracking

# Load Pre-trained Word2Vec (Google News)
pretrained_w2v = api.load('word2vec-google-news-300')

# Filter only positive and negative reviews
binary_data = balanced_data[balanced_data['sentiment'].isin([1, 2])].copy()
binary_data['label'] = binary_data['sentiment'].apply(lambda x: 1 if x == 1 else 0)

# Prepare sentences for self-trained Word2Vec
sentences = [simple_preprocess(str(doc)) for doc in binary_data["review_body"]]
self_w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)

# Function to get averaged word vectors
def get_avg_w2v_features(text, model, is_pretrained=False, vector_size=100):
    tokens = simple_preprocess(str(text))
    vectors = []
    
    if is_pretrained:
        for word in tokens:
            if word in model:
                vectors.append(model[word])
    else:
        for word in tokens:
            if word in model.wv:
                vectors.append(model.wv[word])
    
    return np.mean(vectors, axis=0) if vectors else np.zeros(vector_size)

# Split dataset
pos_reviews = binary_data[binary_data['label'] == 1]['review_body']
neg_reviews = binary_data[binary_data['label'] == 0]['review_body']

pos_x_train, pos_x_test = train_test_split(pos_reviews, test_size=0.2, random_state=3)
neg_x_train, neg_x_test = train_test_split(neg_reviews, test_size=0.2, random_state=3)

X_train_12 = np.concatenate([pos_x_train, neg_x_train])
X_test_12 = np.concatenate([pos_x_test, neg_x_test])
y_train_12 = np.concatenate([np.ones(len(pos_x_train)), np.zeros(len(neg_x_train))])
y_test_12 = np.concatenate([np.ones(len(pos_x_test)), np.zeros(len(neg_x_test))])

# Convert dataset to Word2Vec embeddings
X_train_local = np.array([get_avg_w2v_features(text, self_w2v_model) for text in tqdm(X_train_12)])
X_test_local = np.array([get_avg_w2v_features(text, self_w2v_model) for text in tqdm(X_test_12)])

X_train_google = np.array([get_avg_w2v_features(text, pretrained_w2v, is_pretrained=True, vector_size=300) for text in tqdm(X_train_12)])
X_test_google = np.array([get_avg_w2v_features(text, pretrained_w2v, is_pretrained=True, vector_size=300) for text in tqdm(X_test_12)])

# Train and evaluate an SVM using self-trained Word2Vec embeddings
svm_local = LinearSVC(random_state=3)
svm_local.fit(X_train_local, y_train_12)
pred_svm_local = svm_local.predict(X_test_local)

print("Local W2V SVM Accuracy:", accuracy_score(y_test_12, pred_svm_local))

# Train and evaluate an SVM using Google News Word2Vec embeddings
svm_google = LinearSVC(random_state=3)
svm_google.fit(X_train_google, y_train_12)
pred_svm_google = svm_google.predict(X_test_google)

print("Google W2V SVM Accuracy:", accuracy_score(y_test_12, pred_svm_google))


100%|██████████| 160000/160000 [00:18<00:00, 8844.03it/s]
100%|██████████| 40000/40000 [00:04<00:00, 8841.56it/s] 
100%|██████████| 160000/160000 [00:18<00:00, 8641.06it/s]
100%|██████████| 40000/40000 [00:04<00:00, 8903.31it/s] 


Local W2V SVM Accuracy: 0.818875
Google W2V SVM Accuracy: 0.812025


# Self trained model slightly performs better as it has more relevent knowledge of the dataset. As you can clearly see above the results on self trained model is better than the pretrained model.

In [43]:
import torch
print(torch.__version__)  # This should print the installed version of PyTorch


2.0.1


In [29]:
import numpy as np
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
import gensim.downloader as api
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# --------------------------
# Helper function to compute average Word2Vec vector
# --------------------------
def get_avg_vector(text, model):
    tokens = simple_preprocess(str(text))
    lookup = model.wv if hasattr(model, 'wv') else model
    vectors = [lookup[word] for word in tokens if word in lookup.key_to_index]
    return np.mean(vectors, axis=0) if vectors else np.zeros(300)

# Load the pretrained Google News Word2Vec model
google_model = api.load("word2vec-google-news-300")

# Prepare feature matrices for each case using average Word2Vec vectors
binary_data = balanced_data[balanced_data['sentiment'].isin([1, 2])].copy()
binary_data['label'] = binary_data['sentiment'].apply(lambda x: 0 if x == 1 else 1)

ternary_data = balanced_data.copy()
ternary_data['label'] = ternary_data['sentiment'] - 1

def generate_features(df, model):
    return np.array([get_avg_vector(review, model) for review in df['review_body']])

X_bin_pre = generate_features(binary_data, google_model)
X_bin_self = generate_features(binary_data, w2v_model)
y_bin = binary_data['label'].values

X_tern_pre = generate_features(ternary_data, google_model)
X_tern_self = generate_features(ternary_data, w2v_model)
y_tern = ternary_data['label'].values

# Split the data into training and testing sets
X_train_bin_pre, X_test_bin_pre, y_train_bin, y_test_bin = train_test_split(X_bin_pre, y_bin, test_size=0.2, random_state=42)
X_train_bin_self, X_test_bin_self, _, _ = train_test_split(X_bin_self, y_bin, test_size=0.2, random_state=42)
X_train_tern_pre, X_test_tern_pre, y_train_tern, y_test_tern = train_test_split(X_tern_pre, y_tern, test_size=0.2, random_state=42)
X_train_tern_self, X_test_tern_self, _, _ = train_test_split(X_tern_self, y_tern, test_size=0.2, random_state=42)

# Convert numpy arrays to PyTorch tensors
def to_tensor(X, y=None):
    X_tensor = torch.FloatTensor(X).to(device)
    y_tensor = torch.LongTensor(y).to(device) if y is not None else None
    return X_tensor, y_tensor

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

X_train_bin_pre, y_train_bin = to_tensor(X_train_bin_pre, y_train_bin)
X_test_bin_pre, y_test_bin = to_tensor(X_test_bin_pre, y_test_bin)
X_train_bin_self, _ = to_tensor(X_train_bin_self)
X_test_bin_self, _ = to_tensor(X_test_bin_self)
X_train_tern_pre, y_train_tern = to_tensor(X_train_tern_pre, y_train_tern)
X_test_tern_pre, y_test_tern = to_tensor(X_test_tern_pre, y_test_tern)
X_train_tern_self, _ = to_tensor(X_train_tern_self)
X_test_tern_self, _ = to_tensor(X_test_tern_self)

# Define the MLP model
class MLP(nn.Module):
    def __init__(self, input_dim, hidden1, hidden2, output_dim):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, output_dim)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Function to train the model and return its test accuracy
def train_model(model, X_train, y_train, X_test, y_test, epochs=100, lr=0.001):
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()
    
    # Evaluate on test set
    model.eval()
    with torch.no_grad():
        outputs = model(X_test)
        _, predicted = torch.max(outputs, 1)
        test_acc = accuracy_score(y_test.cpu().numpy(), predicted.cpu().numpy())
    return test_acc

# Train and evaluate models
model_bin_pre = MLP(300, 50, 10, 2)
acc_bin_pre = train_model(model_bin_pre, X_train_bin_pre, y_train_bin, X_test_bin_pre, y_test_bin)

model_bin_self = MLP(300, 50, 10, 2)
acc_bin_self = train_model(model_bin_self, X_train_bin_self, y_train_bin, X_test_bin_self, y_test_bin)

model_tern_pre = MLP(300, 50, 10, 3)
acc_tern_pre = train_model(model_tern_pre, X_train_tern_pre, y_train_tern, X_test_tern_pre, y_test_tern)

model_tern_self = MLP(300, 50, 10, 3)
acc_tern_self = train_model(model_tern_self, X_train_tern_self, y_train_tern, X_test_tern_self, y_test_tern)

# Report the test accuracy for each model
print("Model outputs (Test Accuracy):")
print("1. Binary Classification using Pretrained embeddings: {:.4f}".format(acc_bin_pre))
print("2. Binary Classification using Self-trained embeddings: {:.4f}".format(acc_bin_self))
print("3. Ternary Classification using Pretrained embeddings: {:.4f}".format(acc_tern_pre))
print("4. Ternary Classification using Self-trained embeddings: {:.4f}".format(acc_tern_self))


Model outputs (Test Accuracy):
1. Binary Classification using Pretrained embeddings: 0.8010
2. Binary Classification using Self-trained embeddings: 0.8583
3. Ternary Classification using Pretrained embeddings: 0.6407
4. Ternary Classification using Self-trained embeddings: 0.6927


# Binary model performs better than that of the ternary as we can see a significant jump in the accuracies.
# Additionally, the self trained model performs better overall than the pretained model.


In [30]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.utils import simple_preprocess
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm

# --------------------------
# 1. Feature Extraction Function:
#    Concatenate the first 10 Word2Vec vectors for each review.
# --------------------------
def get_concat_vector(text, model, max_tokens=10):
    """
    Tokenizes the text using gensim's simple_preprocess and concatenates
    the first max_tokens word vectors from the provided Word2Vec model.
    If there are fewer than max_tokens tokens, pads with zero vectors.
    Returns a feature vector of dimension (max_tokens * embedding_dim), i.e. (10 * 300 = 3000).
    """
    tokens = simple_preprocess(str(text))
    lookup = model.wv if hasattr(model, 'wv') else model
    vectors = []
    for token in tokens[:max_tokens]:
        if token in lookup.key_to_index:
            vectors.append(lookup[token])
        else:
            vectors.append(np.zeros(lookup.vector_size))
    while len(vectors) < max_tokens:
        vectors.append(np.zeros(lookup.vector_size))
    vec = np.concatenate(vectors)
    # Check the resulting dimension is as expected.
    assert vec.shape[0] == max_tokens * lookup.vector_size, (
        f"Expected dimension {max_tokens * lookup.vector_size}, got {vec.shape[0]}"
    )
    return vec

def generate_concat_features(df, model):
    """
    Generates a feature matrix (n_samples x 3000) where each row is the concatenation
    of the first 10 Word2Vec vectors from df['review_body'].
    Uses tqdm to display progress.
    """
    features = []
    for review in tqdm(df['review_body'], desc="Generating concat features"):
        features.append(get_concat_vector(review, model))
    return np.array(features)

# --------------------------
# 2. Prepare the Binary Dataset
# --------------------------
# Filter reviews to only those with sentiment 1 or 2.
binary_data = balanced_data[balanced_data['sentiment'].isin([1, 2])].copy()
# Map sentiment 1 -> label 0 and sentiment 2 -> label 1.
binary_data['label'] = binary_data['sentiment'].apply(lambda x: 0 if x == 1 else 1)

# --------------------------
# 3. Load the Pretrained Google News Word2Vec Model
# --------------------------
google_model = api.load("word2vec-google-news-300")

# --------------------------
# 4. Generate Input Features using Pretrained Embeddings
# --------------------------
X_bin_pre = generate_concat_features(binary_data, google_model)
y_bin = binary_data['label'].values

# --------------------------
# 5. Split the Data (80/20 Train/Test)
# --------------------------
X_train_pre, X_test_pre, y_train, y_test = train_test_split(
    X_bin_pre, y_bin, test_size=0.2, random_state=42
)

# --------------------------
# 6. Convert Data to PyTorch Tensors
# --------------------------
def to_tensor(X, y=None):
    X_tensor = torch.FloatTensor(X)
    if y is not None:
        y_tensor = torch.LongTensor(y)
        return X_tensor, y_tensor
    return X_tensor

X_train_pre, y_train = to_tensor(X_train_pre, y_train)
X_test_pre, y_test = to_tensor(X_test_pre, y_test)

# --------------------------
# 7. Define the MLP Model Architecture
# --------------------------
# Input dimension is 3000 (10 tokens * 300 dims), two hidden layers (50 and 10 nodes) and output (2 classes).
class MLP(nn.Module):
    def __init__(self, input_dim, hidden1=50, hidden2=10, output_dim=2):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, output_dim)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# --------------------------
# 8. Define the Training Function with Mini-Batching and tqdm
# --------------------------
def train_model(model, X_train, y_train, X_test, y_test, epochs=50, lr=0.001, batch_size=32):
    # Create a DataLoader for mini-batch training.
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    for epoch in tqdm(range(epochs), desc="Training epochs"):
        model.train()
        epoch_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs} -- Loss: {epoch_loss/len(train_loader):.4f}")
    
    # Evaluate the model on the test set.
    model.eval()
    with torch.no_grad():
        outputs = model(X_test)
        _, predicted = torch.max(outputs, 1)
        test_acc = accuracy_score(y_test.numpy(), predicted.numpy())
    return test_acc

# --------------------------
# 9. Build, Train, and Evaluate the Binary Classification Model using Pretrained embeddings.
# --------------------------
input_dim = 3000  # 10 tokens * 300 dimensions
model_bin_pre = MLP(input_dim=input_dim, hidden1=50, hidden2=10, output_dim=2)
acc_bin_pre = train_model(model_bin_pre, X_train_pre, y_train, X_test_pre, y_test,
                          epochs=50, lr=0.001, batch_size=32)

print("Binary Classification using Pretrained embeddings Accuracy: {:.4f}".format(acc_bin_pre))


Generating concat features: 100%|██████████| 200000/200000 [00:12<00:00, 16616.42it/s]
Training epochs:   2%|▏         | 1/50 [00:05<04:08,  5.06s/it]

Epoch 1/50 -- Loss: 0.4802


Training epochs:   4%|▍         | 2/50 [00:09<03:54,  4.88s/it]

Epoch 2/50 -- Loss: 0.4239


Training epochs:   6%|▌         | 3/50 [00:14<03:53,  4.96s/it]

Epoch 3/50 -- Loss: 0.3852


Training epochs:   8%|▊         | 4/50 [00:19<03:46,  4.93s/it]

Epoch 4/50 -- Loss: 0.3478


Training epochs:  10%|█         | 5/50 [00:24<03:38,  4.86s/it]

Epoch 5/50 -- Loss: 0.3126


Training epochs:  12%|█▏        | 6/50 [00:29<03:32,  4.83s/it]

Epoch 6/50 -- Loss: 0.2799


Training epochs:  14%|█▍        | 7/50 [00:33<03:26,  4.80s/it]

Epoch 7/50 -- Loss: 0.2507


Training epochs:  16%|█▌        | 8/50 [00:38<03:22,  4.82s/it]

Epoch 8/50 -- Loss: 0.2249


Training epochs:  18%|█▊        | 9/50 [00:43<03:16,  4.78s/it]

Epoch 9/50 -- Loss: 0.2027


Training epochs:  20%|██        | 10/50 [00:48<03:10,  4.76s/it]

Epoch 10/50 -- Loss: 0.1831


Training epochs:  22%|██▏       | 11/50 [00:53<03:09,  4.85s/it]

Epoch 11/50 -- Loss: 0.1683


Training epochs:  24%|██▍       | 12/50 [00:58<03:05,  4.87s/it]

Epoch 12/50 -- Loss: 0.1528


Training epochs:  26%|██▌       | 13/50 [01:02<02:58,  4.83s/it]

Epoch 13/50 -- Loss: 0.1410


Training epochs:  28%|██▊       | 14/50 [01:08<02:59,  5.00s/it]

Epoch 14/50 -- Loss: 0.1296


Training epochs:  30%|███       | 15/50 [01:13<02:59,  5.13s/it]

Epoch 15/50 -- Loss: 0.1219


Training epochs:  32%|███▏      | 16/50 [01:19<03:01,  5.35s/it]

Epoch 16/50 -- Loss: 0.1136


Training epochs:  34%|███▍      | 17/50 [01:25<02:59,  5.44s/it]

Epoch 17/50 -- Loss: 0.1063


Training epochs:  36%|███▌      | 18/50 [01:30<02:54,  5.46s/it]

Epoch 18/50 -- Loss: 0.1004


Training epochs:  38%|███▊      | 19/50 [01:36<02:49,  5.48s/it]

Epoch 19/50 -- Loss: 0.0955


Training epochs:  40%|████      | 20/50 [01:42<02:47,  5.57s/it]

Epoch 20/50 -- Loss: 0.0904


Training epochs:  42%|████▏     | 21/50 [01:47<02:42,  5.62s/it]

Epoch 21/50 -- Loss: 0.0862


Training epochs:  44%|████▍     | 22/50 [01:53<02:36,  5.60s/it]

Epoch 22/50 -- Loss: 0.0814


Training epochs:  46%|████▌     | 23/50 [01:58<02:29,  5.55s/it]

Epoch 23/50 -- Loss: 0.0786


Training epochs:  48%|████▊     | 24/50 [02:04<02:24,  5.58s/it]

Epoch 24/50 -- Loss: 0.0737


Training epochs:  50%|█████     | 25/50 [02:10<02:21,  5.67s/it]

Epoch 25/50 -- Loss: 0.0718


Training epochs:  52%|█████▏    | 26/50 [02:15<02:15,  5.64s/it]

Epoch 26/50 -- Loss: 0.0687


Training epochs:  54%|█████▍    | 27/50 [02:21<02:08,  5.60s/it]

Epoch 27/50 -- Loss: 0.0674


Training epochs:  56%|█████▌    | 28/50 [02:27<02:02,  5.59s/it]

Epoch 28/50 -- Loss: 0.0625


Training epochs:  58%|█████▊    | 29/50 [02:32<01:58,  5.64s/it]

Epoch 29/50 -- Loss: 0.0622


Training epochs:  60%|██████    | 30/50 [02:38<01:53,  5.66s/it]

Epoch 30/50 -- Loss: 0.0598


Training epochs:  62%|██████▏   | 31/50 [02:44<01:46,  5.62s/it]

Epoch 31/50 -- Loss: 0.0585


Training epochs:  64%|██████▍   | 32/50 [02:49<01:40,  5.58s/it]

Epoch 32/50 -- Loss: 0.0550


Training epochs:  66%|██████▌   | 33/50 [02:55<01:35,  5.62s/it]

Epoch 33/50 -- Loss: 0.0546


Training epochs:  68%|██████▊   | 34/50 [03:01<01:30,  5.68s/it]

Epoch 34/50 -- Loss: 0.0527


Training epochs:  70%|███████   | 35/50 [03:06<01:24,  5.63s/it]

Epoch 35/50 -- Loss: 0.0521


Training epochs:  72%|███████▏  | 36/50 [03:12<01:18,  5.60s/it]

Epoch 36/50 -- Loss: 0.0485


Training epochs:  74%|███████▍  | 37/50 [03:17<01:12,  5.60s/it]

Epoch 37/50 -- Loss: 0.0487


Training epochs:  76%|███████▌  | 38/50 [03:23<01:07,  5.66s/it]

Epoch 38/50 -- Loss: 0.0472


Training epochs:  78%|███████▊  | 39/50 [03:29<01:02,  5.67s/it]

Epoch 39/50 -- Loss: 0.0454


Training epochs:  80%|████████  | 40/50 [03:34<00:56,  5.64s/it]

Epoch 40/50 -- Loss: 0.0443


Training epochs:  82%|████████▏ | 41/50 [03:40<00:50,  5.59s/it]

Epoch 41/50 -- Loss: 0.0436


Training epochs:  84%|████████▍ | 42/50 [03:45<00:45,  5.64s/it]

Epoch 42/50 -- Loss: 0.0441


Training epochs:  86%|████████▌ | 43/50 [03:51<00:39,  5.68s/it]

Epoch 43/50 -- Loss: 0.0418


Training epochs:  88%|████████▊ | 44/50 [03:57<00:33,  5.62s/it]

Epoch 44/50 -- Loss: 0.0406


Training epochs:  90%|█████████ | 45/50 [04:02<00:27,  5.60s/it]

Epoch 45/50 -- Loss: 0.0395


Training epochs:  92%|█████████▏| 46/50 [04:08<00:22,  5.60s/it]

Epoch 46/50 -- Loss: 0.0384


Training epochs:  94%|█████████▍| 47/50 [04:14<00:16,  5.67s/it]

Epoch 47/50 -- Loss: 0.0386


Training epochs:  96%|█████████▌| 48/50 [04:19<00:11,  5.65s/it]

Epoch 48/50 -- Loss: 0.0376


Training epochs:  98%|█████████▊| 49/50 [04:25<00:05,  5.62s/it]

Epoch 49/50 -- Loss: 0.0373


Training epochs: 100%|██████████| 50/50 [04:30<00:00,  5.42s/it]

Epoch 50/50 -- Loss: 0.0364





Binary Classification using Pretrained embeddings Accuracy: 0.7523


In [33]:
# For binary classification, only consider sentiment 1 and 2.
binary_data = balanced_data[balanced_data['sentiment'].isin([1, 2])].copy()
# Map sentiment 1 to label 0 and sentiment 2 to label 1.
binary_data['label'] = binary_data['sentiment'].apply(lambda x: 0 if x == 1 else 1)

# For ternary classification, use all classes 1, 2, and 3; remap to 0, 1, 2.
ternary_data = balanced_data.copy()
ternary_data['label'] = ternary_data['sentiment'] - 1

# Using average Word2Vec vectors (300-dim)
# Pretrained features:
X_bin_pre = generate_features(binary_data, google_model)  
# Self-trained features:
X_bin_self = generate_features(binary_data, w2v_model)  
y_bin = binary_data['label'].values

# For ternary data:
X_tern_pre = generate_features(ternary_data, google_model)
X_tern_self = generate_features(ternary_data, w2v_model)
y_tern = ternary_data['label'].values

# Binary splits:
X_train_bin_pre, X_test_bin_pre, y_train_bin, y_test_bin = train_test_split(X_bin_pre, y_bin, test_size=0.2, random_state=42)
X_train_bin_self, X_test_bin_self, _, _ = train_test_split(X_bin_self, y_bin, test_size=0.2, random_state=42)

# Ternary splits:
X_train_tern_pre, X_test_tern_pre, y_train_tern, y_test_tern = train_test_split(X_tern_pre, y_tern, test_size=0.2, random_state=42)
X_train_tern_self, X_test_tern_self, _, _ = train_test_split(X_tern_self, y_tern, test_size=0.2, random_state=42)

# Convert to tensors:
X_train_bin_pre, y_train_bin = to_tensor(X_train_bin_pre, y_train_bin)
X_test_bin_pre, y_test_bin   = to_tensor(X_test_bin_pre, y_test_bin)
X_train_bin_self = to_tensor(X_train_bin_self, None)
X_test_bin_self  = to_tensor(X_test_bin_self, None)


X_train_tern_pre, y_train_tern = to_tensor(X_train_tern_pre, y_train_tern)
X_test_tern_pre, y_test_tern   = to_tensor(X_test_tern_pre, y_test_tern)
X_train_tern_self = to_tensor(X_train_tern_self, None)
X_test_tern_self  = to_tensor(X_test_tern_self, None)


# Build the MLP for binary classification (self-trained embeddings)
# Input dimension remains 300 (since we're averaging the word vectors)
model_bin_self = MLP(input_dim=300, hidden1=50, hidden2=10, output_dim=2)
acc_bin_self = train_model(model_bin_self, X_train_bin_self, y_train_bin, X_test_bin_self, y_test_bin, epochs=100, lr=0.001)
print("Binary Classification using Self-trained embeddings Accuracy: {:.4f}".format(acc_bin_self))


Training epochs:   1%|          | 1/100 [00:03<05:47,  3.51s/it]

Epoch 1/100 -- Loss: 0.3326


Training epochs:   2%|▏         | 2/100 [00:07<05:45,  3.52s/it]

Epoch 2/100 -- Loss: 0.2956


Training epochs:   3%|▎         | 3/100 [00:10<05:42,  3.53s/it]

Epoch 3/100 -- Loss: 0.2826


Training epochs:   4%|▍         | 4/100 [00:14<05:40,  3.54s/it]

Epoch 4/100 -- Loss: 0.2745


Training epochs:   5%|▌         | 5/100 [00:17<05:37,  3.55s/it]

Epoch 5/100 -- Loss: 0.2681


Training epochs:   6%|▌         | 6/100 [00:21<05:33,  3.55s/it]

Epoch 6/100 -- Loss: 0.2631


Training epochs:   7%|▋         | 7/100 [00:24<05:31,  3.57s/it]

Epoch 7/100 -- Loss: 0.2587


Training epochs:   8%|▊         | 8/100 [00:28<05:27,  3.56s/it]

Epoch 8/100 -- Loss: 0.2545


Training epochs:   9%|▉         | 9/100 [00:31<05:23,  3.56s/it]

Epoch 9/100 -- Loss: 0.2510


Training epochs:  10%|█         | 10/100 [00:35<05:19,  3.55s/it]

Epoch 10/100 -- Loss: 0.2481


Training epochs:  11%|█         | 11/100 [00:38<05:14,  3.53s/it]

Epoch 11/100 -- Loss: 0.2453


Training epochs:  12%|█▏        | 12/100 [00:42<05:12,  3.55s/it]

Epoch 12/100 -- Loss: 0.2422


Training epochs:  13%|█▎        | 13/100 [00:46<05:09,  3.56s/it]

Epoch 13/100 -- Loss: 0.2398


Training epochs:  14%|█▍        | 14/100 [00:49<05:04,  3.54s/it]

Epoch 14/100 -- Loss: 0.2373


Training epochs:  15%|█▌        | 15/100 [00:53<05:01,  3.54s/it]

Epoch 15/100 -- Loss: 0.2362


Training epochs:  16%|█▌        | 16/100 [00:57<05:05,  3.64s/it]

Epoch 16/100 -- Loss: 0.2340


Training epochs:  17%|█▋        | 17/100 [01:00<05:02,  3.64s/it]

Epoch 17/100 -- Loss: 0.2318


Training epochs:  18%|█▊        | 18/100 [01:04<04:59,  3.65s/it]

Epoch 18/100 -- Loss: 0.2307


Training epochs:  19%|█▉        | 19/100 [01:08<04:55,  3.65s/it]

Epoch 19/100 -- Loss: 0.2288


Training epochs:  20%|██        | 20/100 [01:11<04:51,  3.65s/it]

Epoch 20/100 -- Loss: 0.2272


Training epochs:  21%|██        | 21/100 [01:15<04:47,  3.63s/it]

Epoch 21/100 -- Loss: 0.2259


Training epochs:  22%|██▏       | 22/100 [01:18<04:44,  3.65s/it]

Epoch 22/100 -- Loss: 0.2242


Training epochs:  23%|██▎       | 23/100 [01:22<04:39,  3.64s/it]

Epoch 23/100 -- Loss: 0.2235


Training epochs:  24%|██▍       | 24/100 [01:26<04:37,  3.65s/it]

Epoch 24/100 -- Loss: 0.2216


Training epochs:  25%|██▌       | 25/100 [01:29<04:30,  3.60s/it]

Epoch 25/100 -- Loss: 0.2207


Training epochs:  26%|██▌       | 26/100 [01:33<04:25,  3.59s/it]

Epoch 26/100 -- Loss: 0.2196


Training epochs:  27%|██▋       | 27/100 [01:36<04:22,  3.59s/it]

Epoch 27/100 -- Loss: 0.2185


Training epochs:  28%|██▊       | 28/100 [01:40<04:17,  3.57s/it]

Epoch 28/100 -- Loss: 0.2177


Training epochs:  29%|██▉       | 29/100 [01:43<04:12,  3.56s/it]

Epoch 29/100 -- Loss: 0.2162


Training epochs:  30%|███       | 30/100 [01:47<04:09,  3.56s/it]

Epoch 30/100 -- Loss: 0.2156


Training epochs:  31%|███       | 31/100 [01:51<04:04,  3.54s/it]

Epoch 31/100 -- Loss: 0.2145


Training epochs:  32%|███▏      | 32/100 [01:54<04:01,  3.54s/it]

Epoch 32/100 -- Loss: 0.2139


Training epochs:  33%|███▎      | 33/100 [01:58<03:58,  3.56s/it]

Epoch 33/100 -- Loss: 0.2129


Training epochs:  34%|███▍      | 34/100 [02:01<03:53,  3.54s/it]

Epoch 34/100 -- Loss: 0.2123


Training epochs:  35%|███▌      | 35/100 [02:05<03:50,  3.55s/it]

Epoch 35/100 -- Loss: 0.2114


Training epochs:  36%|███▌      | 36/100 [02:08<03:46,  3.54s/it]

Epoch 36/100 -- Loss: 0.2103


Training epochs:  37%|███▋      | 37/100 [02:12<03:41,  3.52s/it]

Epoch 37/100 -- Loss: 0.2095


Training epochs:  38%|███▊      | 38/100 [02:15<03:39,  3.53s/it]

Epoch 38/100 -- Loss: 0.2090


Training epochs:  39%|███▉      | 39/100 [02:19<03:36,  3.55s/it]

Epoch 39/100 -- Loss: 0.2080


Training epochs:  40%|████      | 40/100 [02:22<03:32,  3.53s/it]

Epoch 40/100 -- Loss: 0.2069


Training epochs:  41%|████      | 41/100 [02:26<03:28,  3.54s/it]

Epoch 41/100 -- Loss: 0.2064


Training epochs:  42%|████▏     | 42/100 [02:29<03:25,  3.54s/it]

Epoch 42/100 -- Loss: 0.2053


Training epochs:  43%|████▎     | 43/100 [02:33<03:20,  3.52s/it]

Epoch 43/100 -- Loss: 0.2051


Training epochs:  44%|████▍     | 44/100 [02:37<03:18,  3.55s/it]

Epoch 44/100 -- Loss: 0.2044


Training epochs:  45%|████▌     | 45/100 [02:40<03:15,  3.55s/it]

Epoch 45/100 -- Loss: 0.2037


Training epochs:  46%|████▌     | 46/100 [02:44<03:10,  3.53s/it]

Epoch 46/100 -- Loss: 0.2028


Training epochs:  47%|████▋     | 47/100 [02:47<03:07,  3.53s/it]

Epoch 47/100 -- Loss: 0.2023


Training epochs:  48%|████▊     | 48/100 [02:51<03:03,  3.53s/it]

Epoch 48/100 -- Loss: 0.2019


Training epochs:  49%|████▉     | 49/100 [02:54<02:59,  3.53s/it]

Epoch 49/100 -- Loss: 0.2013


Training epochs:  50%|█████     | 50/100 [02:58<02:57,  3.55s/it]

Epoch 50/100 -- Loss: 0.2004


Training epochs:  51%|█████     | 51/100 [03:01<02:54,  3.56s/it]

Epoch 51/100 -- Loss: 0.2001


Training epochs:  52%|█████▏    | 52/100 [03:05<02:49,  3.54s/it]

Epoch 52/100 -- Loss: 0.1996


Training epochs:  53%|█████▎    | 53/100 [03:08<02:46,  3.54s/it]

Epoch 53/100 -- Loss: 0.1989


Training epochs:  54%|█████▍    | 54/100 [03:12<02:42,  3.54s/it]

Epoch 54/100 -- Loss: 0.1987


Training epochs:  55%|█████▌    | 55/100 [03:15<02:39,  3.54s/it]

Epoch 55/100 -- Loss: 0.1980


Training epochs:  56%|█████▌    | 56/100 [03:19<02:35,  3.54s/it]

Epoch 56/100 -- Loss: 0.1973


Training epochs:  57%|█████▋    | 57/100 [03:23<02:32,  3.54s/it]

Epoch 57/100 -- Loss: 0.1969


Training epochs:  58%|█████▊    | 58/100 [03:26<02:27,  3.52s/it]

Epoch 58/100 -- Loss: 0.1963


Training epochs:  59%|█████▉    | 59/100 [03:30<02:24,  3.53s/it]

Epoch 59/100 -- Loss: 0.1958


Training epochs:  60%|██████    | 60/100 [03:33<02:21,  3.53s/it]

Epoch 60/100 -- Loss: 0.1958


Training epochs:  61%|██████    | 61/100 [03:37<02:18,  3.54s/it]

Epoch 61/100 -- Loss: 0.1947


Training epochs:  62%|██████▏   | 62/100 [03:40<02:14,  3.54s/it]

Epoch 62/100 -- Loss: 0.1948


Training epochs:  63%|██████▎   | 63/100 [03:44<02:11,  3.54s/it]

Epoch 63/100 -- Loss: 0.1946


Training epochs:  64%|██████▍   | 64/100 [03:47<02:07,  3.53s/it]

Epoch 64/100 -- Loss: 0.1935


Training epochs:  65%|██████▌   | 65/100 [03:51<02:03,  3.53s/it]

Epoch 65/100 -- Loss: 0.1934


Training epochs:  66%|██████▌   | 66/100 [03:54<02:00,  3.55s/it]

Epoch 66/100 -- Loss: 0.1929


Training epochs:  67%|██████▋   | 67/100 [03:58<01:56,  3.54s/it]

Epoch 67/100 -- Loss: 0.1924


Training epochs:  68%|██████▊   | 68/100 [04:01<01:53,  3.55s/it]

Epoch 68/100 -- Loss: 0.1920


Training epochs:  69%|██████▉   | 69/100 [04:05<01:49,  3.55s/it]

Epoch 69/100 -- Loss: 0.1916


Training epochs:  70%|███████   | 70/100 [04:09<01:46,  3.54s/it]

Epoch 70/100 -- Loss: 0.1911


Training epochs:  71%|███████   | 71/100 [04:12<01:42,  3.54s/it]

Epoch 71/100 -- Loss: 0.1906


Training epochs:  72%|███████▏  | 72/100 [04:16<01:39,  3.55s/it]

Epoch 72/100 -- Loss: 0.1909


Training epochs:  73%|███████▎  | 73/100 [04:19<01:36,  3.56s/it]

Epoch 73/100 -- Loss: 0.1904


Training epochs:  74%|███████▍  | 74/100 [04:23<01:32,  3.56s/it]

Epoch 74/100 -- Loss: 0.1895


Training epochs:  75%|███████▌  | 75/100 [04:26<01:28,  3.55s/it]

Epoch 75/100 -- Loss: 0.1893


Training epochs:  76%|███████▌  | 76/100 [04:30<01:25,  3.55s/it]

Epoch 76/100 -- Loss: 0.1888


Training epochs:  77%|███████▋  | 77/100 [04:33<01:21,  3.55s/it]

Epoch 77/100 -- Loss: 0.1886


Training epochs:  78%|███████▊  | 78/100 [04:37<01:18,  3.56s/it]

Epoch 78/100 -- Loss: 0.1885


Training epochs:  79%|███████▉  | 79/100 [04:41<01:14,  3.56s/it]

Epoch 79/100 -- Loss: 0.1879


Training epochs:  80%|████████  | 80/100 [04:44<01:10,  3.54s/it]

Epoch 80/100 -- Loss: 0.1879


Training epochs:  81%|████████  | 81/100 [04:48<01:07,  3.55s/it]

Epoch 81/100 -- Loss: 0.1870


Training epochs:  82%|████████▏ | 82/100 [04:51<01:03,  3.55s/it]

Epoch 82/100 -- Loss: 0.1868


Training epochs:  83%|████████▎ | 83/100 [04:55<01:00,  3.54s/it]

Epoch 83/100 -- Loss: 0.1871


Training epochs:  84%|████████▍ | 84/100 [04:58<00:56,  3.54s/it]

Epoch 84/100 -- Loss: 0.1860


Training epochs:  85%|████████▌ | 85/100 [05:02<00:53,  3.55s/it]

Epoch 85/100 -- Loss: 0.1858


Training epochs:  86%|████████▌ | 86/100 [05:05<00:49,  3.53s/it]

Epoch 86/100 -- Loss: 0.1854


Training epochs:  87%|████████▋ | 87/100 [05:09<00:46,  3.54s/it]

Epoch 87/100 -- Loss: 0.1851


Training epochs:  88%|████████▊ | 88/100 [05:12<00:42,  3.56s/it]

Epoch 88/100 -- Loss: 0.1845


Training epochs:  89%|████████▉ | 89/100 [05:16<00:39,  3.55s/it]

Epoch 89/100 -- Loss: 0.1845


Training epochs:  90%|█████████ | 90/100 [05:20<00:35,  3.55s/it]

Epoch 90/100 -- Loss: 0.1844


Training epochs:  91%|█████████ | 91/100 [05:23<00:31,  3.55s/it]

Epoch 91/100 -- Loss: 0.1841


Training epochs:  92%|█████████▏| 92/100 [05:27<00:28,  3.54s/it]

Epoch 92/100 -- Loss: 0.1840


Training epochs:  93%|█████████▎| 93/100 [05:30<00:24,  3.56s/it]

Epoch 93/100 -- Loss: 0.1830


Training epochs:  94%|█████████▍| 94/100 [05:34<00:21,  3.57s/it]

Epoch 94/100 -- Loss: 0.1831


Training epochs:  95%|█████████▌| 95/100 [05:37<00:17,  3.55s/it]

Epoch 95/100 -- Loss: 0.1834


Training epochs:  96%|█████████▌| 96/100 [05:41<00:14,  3.55s/it]

Epoch 96/100 -- Loss: 0.1825


Training epochs:  97%|█████████▋| 97/100 [05:44<00:10,  3.56s/it]

Epoch 97/100 -- Loss: 0.1823


Training epochs:  98%|█████████▊| 98/100 [05:48<00:07,  3.54s/it]

Epoch 98/100 -- Loss: 0.1823


Training epochs:  99%|█████████▉| 99/100 [05:52<00:03,  3.56s/it]

Epoch 99/100 -- Loss: 0.1818


Training epochs: 100%|██████████| 100/100 [05:55<00:00,  3.56s/it]

Epoch 100/100 -- Loss: 0.1814
Binary Classification using Self-trained embeddings Accuracy: 0.8758





In [34]:
# Build the MLP for ternary classification (pretrained embeddings)
model_tern_pre = MLP(input_dim=300, hidden1=50, hidden2=10, output_dim=3)
acc_tern_pre = train_model(model_tern_pre, X_train_tern_pre, y_train_tern, X_test_tern_pre, y_test_tern, epochs=100, lr=0.001)
print("Ternary Classification using Pretrained embeddings Accuracy: {:.4f}".format(acc_tern_pre))


Training epochs:   1%|          | 1/100 [00:04<07:27,  4.52s/it]

Epoch 1/100 -- Loss: 0.7735


Training epochs:   2%|▏         | 2/100 [00:08<07:16,  4.45s/it]

Epoch 2/100 -- Loss: 0.7318


Training epochs:   3%|▎         | 3/100 [00:13<07:13,  4.47s/it]

Epoch 3/100 -- Loss: 0.7190


Training epochs:   4%|▍         | 4/100 [00:17<07:09,  4.47s/it]

Epoch 4/100 -- Loss: 0.7098


Training epochs:   5%|▌         | 5/100 [00:22<07:02,  4.45s/it]

Epoch 5/100 -- Loss: 0.7035


Training epochs:   6%|▌         | 6/100 [00:26<06:58,  4.46s/it]

Epoch 6/100 -- Loss: 0.6981


Training epochs:   7%|▋         | 7/100 [00:31<06:53,  4.44s/it]

Epoch 7/100 -- Loss: 0.6928


Training epochs:   8%|▊         | 8/100 [00:35<06:49,  4.45s/it]

Epoch 8/100 -- Loss: 0.6889


Training epochs:   9%|▉         | 9/100 [00:40<06:44,  4.44s/it]

Epoch 9/100 -- Loss: 0.6855


Training epochs:  10%|█         | 10/100 [00:44<06:38,  4.43s/it]

Epoch 10/100 -- Loss: 0.6818


Training epochs:  11%|█         | 11/100 [00:48<06:35,  4.44s/it]

Epoch 11/100 -- Loss: 0.6791


Training epochs:  12%|█▏        | 12/100 [00:53<06:33,  4.48s/it]

Epoch 12/100 -- Loss: 0.6767


Training epochs:  13%|█▎        | 13/100 [00:58<06:30,  4.49s/it]

Epoch 13/100 -- Loss: 0.6733


Training epochs:  14%|█▍        | 14/100 [01:02<06:26,  4.49s/it]

Epoch 14/100 -- Loss: 0.6710


Training epochs:  15%|█▌        | 15/100 [01:07<06:23,  4.51s/it]

Epoch 15/100 -- Loss: 0.6691


Training epochs:  16%|█▌        | 16/100 [01:11<06:19,  4.51s/it]

Epoch 16/100 -- Loss: 0.6677


Training epochs:  17%|█▋        | 17/100 [01:16<06:16,  4.53s/it]

Epoch 17/100 -- Loss: 0.6658


Training epochs:  18%|█▊        | 18/100 [01:20<06:13,  4.55s/it]

Epoch 18/100 -- Loss: 0.6638


Training epochs:  19%|█▉        | 19/100 [01:25<06:08,  4.55s/it]

Epoch 19/100 -- Loss: 0.6618


Training epochs:  20%|██        | 20/100 [01:29<06:02,  4.54s/it]

Epoch 20/100 -- Loss: 0.6607


Training epochs:  21%|██        | 21/100 [01:34<06:00,  4.56s/it]

Epoch 21/100 -- Loss: 0.6590


Training epochs:  22%|██▏       | 22/100 [01:38<05:55,  4.56s/it]

Epoch 22/100 -- Loss: 0.6578


Training epochs:  23%|██▎       | 23/100 [01:43<05:50,  4.55s/it]

Epoch 23/100 -- Loss: 0.6557


Training epochs:  24%|██▍       | 24/100 [01:48<05:45,  4.54s/it]

Epoch 24/100 -- Loss: 0.6550


Training epochs:  25%|██▌       | 25/100 [01:52<05:39,  4.53s/it]

Epoch 25/100 -- Loss: 0.6534


Training epochs:  26%|██▌       | 26/100 [01:57<05:36,  4.55s/it]

Epoch 26/100 -- Loss: 0.6520


Training epochs:  27%|██▋       | 27/100 [02:01<05:31,  4.54s/it]

Epoch 27/100 -- Loss: 0.6510


Training epochs:  28%|██▊       | 28/100 [02:06<05:27,  4.55s/it]

Epoch 28/100 -- Loss: 0.6500


Training epochs:  29%|██▉       | 29/100 [02:10<05:22,  4.54s/it]

Epoch 29/100 -- Loss: 0.6487


Training epochs:  30%|███       | 30/100 [02:15<05:16,  4.52s/it]

Epoch 30/100 -- Loss: 0.6481


Training epochs:  31%|███       | 31/100 [02:19<05:12,  4.53s/it]

Epoch 31/100 -- Loss: 0.6471


Training epochs:  32%|███▏      | 32/100 [02:24<05:07,  4.52s/it]

Epoch 32/100 -- Loss: 0.6458


Training epochs:  33%|███▎      | 33/100 [02:28<05:03,  4.52s/it]

Epoch 33/100 -- Loss: 0.6454


Training epochs:  34%|███▍      | 34/100 [02:33<04:57,  4.51s/it]

Epoch 34/100 -- Loss: 0.6447


Training epochs:  35%|███▌      | 35/100 [02:37<04:55,  4.55s/it]

Epoch 35/100 -- Loss: 0.6436


Training epochs:  36%|███▌      | 36/100 [02:42<04:50,  4.54s/it]

Epoch 36/100 -- Loss: 0.6426


Training epochs:  37%|███▋      | 37/100 [02:46<04:45,  4.53s/it]

Epoch 37/100 -- Loss: 0.6417


Training epochs:  38%|███▊      | 38/100 [02:51<04:40,  4.53s/it]

Epoch 38/100 -- Loss: 0.6415


Training epochs:  39%|███▉      | 39/100 [02:56<04:36,  4.53s/it]

Epoch 39/100 -- Loss: 0.6404


Training epochs:  40%|████      | 40/100 [03:00<04:32,  4.54s/it]

Epoch 40/100 -- Loss: 0.6395


Training epochs:  41%|████      | 41/100 [03:05<04:27,  4.53s/it]

Epoch 41/100 -- Loss: 0.6386


Training epochs:  42%|████▏     | 42/100 [03:09<04:23,  4.54s/it]

Epoch 42/100 -- Loss: 0.6382


Training epochs:  43%|████▎     | 43/100 [03:14<04:18,  4.53s/it]

Epoch 43/100 -- Loss: 0.6376


Training epochs:  44%|████▍     | 44/100 [03:18<04:13,  4.52s/it]

Epoch 44/100 -- Loss: 0.6366


Training epochs:  45%|████▌     | 45/100 [03:23<04:08,  4.53s/it]

Epoch 45/100 -- Loss: 0.6362


Training epochs:  46%|████▌     | 46/100 [03:27<04:04,  4.53s/it]

Epoch 46/100 -- Loss: 0.6354


Training epochs:  47%|████▋     | 47/100 [03:32<03:59,  4.52s/it]

Epoch 47/100 -- Loss: 0.6350


Training epochs:  48%|████▊     | 48/100 [03:36<03:55,  4.52s/it]

Epoch 48/100 -- Loss: 0.6349


Training epochs:  49%|████▉     | 49/100 [03:41<03:51,  4.54s/it]

Epoch 49/100 -- Loss: 0.6338


Training epochs:  50%|█████     | 50/100 [03:45<03:46,  4.54s/it]

Epoch 50/100 -- Loss: 0.6337


Training epochs:  51%|█████     | 51/100 [03:50<03:41,  4.52s/it]

Epoch 51/100 -- Loss: 0.6328


Training epochs:  52%|█████▏    | 52/100 [03:54<03:36,  4.52s/it]

Epoch 52/100 -- Loss: 0.6326


Training epochs:  53%|█████▎    | 53/100 [03:59<03:32,  4.51s/it]

Epoch 53/100 -- Loss: 0.6317


Training epochs:  54%|█████▍    | 54/100 [04:03<03:27,  4.52s/it]

Epoch 54/100 -- Loss: 0.6314


Training epochs:  55%|█████▌    | 55/100 [04:08<03:23,  4.53s/it]

Epoch 55/100 -- Loss: 0.6312


Training epochs:  56%|█████▌    | 56/100 [04:13<03:20,  4.55s/it]

Epoch 56/100 -- Loss: 0.6302


Training epochs:  57%|█████▋    | 57/100 [04:17<03:14,  4.53s/it]

Epoch 57/100 -- Loss: 0.6298


Training epochs:  58%|█████▊    | 58/100 [04:22<03:10,  4.53s/it]

Epoch 58/100 -- Loss: 0.6289


Training epochs:  59%|█████▉    | 59/100 [04:26<03:05,  4.52s/it]

Epoch 59/100 -- Loss: 0.6287


Training epochs:  60%|██████    | 60/100 [04:31<03:01,  4.53s/it]

Epoch 60/100 -- Loss: 0.6282


Training epochs:  61%|██████    | 61/100 [04:35<02:56,  4.52s/it]

Epoch 61/100 -- Loss: 0.6283


Training epochs:  62%|██████▏   | 62/100 [04:40<02:51,  4.51s/it]

Epoch 62/100 -- Loss: 0.6274


Training epochs:  63%|██████▎   | 63/100 [04:44<02:47,  4.52s/it]

Epoch 63/100 -- Loss: 0.6266


Training epochs:  64%|██████▍   | 64/100 [04:49<02:43,  4.53s/it]

Epoch 64/100 -- Loss: 0.6265


Training epochs:  65%|██████▌   | 65/100 [04:53<02:38,  4.53s/it]

Epoch 65/100 -- Loss: 0.6259


Training epochs:  66%|██████▌   | 66/100 [04:58<02:33,  4.52s/it]

Epoch 66/100 -- Loss: 0.6262


Training epochs:  67%|██████▋   | 67/100 [05:02<02:28,  4.51s/it]

Epoch 67/100 -- Loss: 0.6255


Training epochs:  68%|██████▊   | 68/100 [05:07<02:24,  4.51s/it]

Epoch 68/100 -- Loss: 0.6251


Training epochs:  69%|██████▉   | 69/100 [05:11<02:19,  4.50s/it]

Epoch 69/100 -- Loss: 0.6250


Training epochs:  70%|███████   | 70/100 [05:16<02:15,  4.51s/it]

Epoch 70/100 -- Loss: 0.6243


Training epochs:  71%|███████   | 71/100 [05:20<02:11,  4.53s/it]

Epoch 71/100 -- Loss: 0.6240


Training epochs:  72%|███████▏  | 72/100 [05:25<02:06,  4.52s/it]

Epoch 72/100 -- Loss: 0.6237


Training epochs:  73%|███████▎  | 73/100 [05:29<02:01,  4.51s/it]

Epoch 73/100 -- Loss: 0.6234


Training epochs:  74%|███████▍  | 74/100 [05:34<01:57,  4.50s/it]

Epoch 74/100 -- Loss: 0.6231


Training epochs:  75%|███████▌  | 75/100 [05:38<01:52,  4.52s/it]

Epoch 75/100 -- Loss: 0.6225


Training epochs:  76%|███████▌  | 76/100 [05:43<01:48,  4.51s/it]

Epoch 76/100 -- Loss: 0.6226


Training epochs:  77%|███████▋  | 77/100 [05:47<01:43,  4.51s/it]

Epoch 77/100 -- Loss: 0.6220


Training epochs:  78%|███████▊  | 78/100 [05:52<01:39,  4.53s/it]

Epoch 78/100 -- Loss: 0.6218


Training epochs:  79%|███████▉  | 79/100 [05:56<01:34,  4.52s/it]

Epoch 79/100 -- Loss: 0.6213


Training epochs:  80%|████████  | 80/100 [06:01<01:30,  4.52s/it]

Epoch 80/100 -- Loss: 0.6212


Training epochs:  81%|████████  | 81/100 [06:05<01:25,  4.52s/it]

Epoch 81/100 -- Loss: 0.6209


Training epochs:  82%|████████▏ | 82/100 [06:10<01:21,  4.52s/it]

Epoch 82/100 -- Loss: 0.6208


Training epochs:  83%|████████▎ | 83/100 [06:14<01:16,  4.51s/it]

Epoch 83/100 -- Loss: 0.6202


Training epochs:  84%|████████▍ | 84/100 [06:19<01:12,  4.51s/it]

Epoch 84/100 -- Loss: 0.6197


Training epochs:  85%|████████▌ | 85/100 [06:24<01:07,  4.53s/it]

Epoch 85/100 -- Loss: 0.6193


Training epochs:  86%|████████▌ | 86/100 [06:28<01:03,  4.52s/it]

Epoch 86/100 -- Loss: 0.6196


Training epochs:  87%|████████▋ | 87/100 [06:33<00:58,  4.52s/it]

Epoch 87/100 -- Loss: 0.6189


Training epochs:  88%|████████▊ | 88/100 [06:37<00:54,  4.51s/it]

Epoch 88/100 -- Loss: 0.6189


Training epochs:  89%|████████▉ | 89/100 [06:42<00:49,  4.51s/it]

Epoch 89/100 -- Loss: 0.6182


Training epochs:  90%|█████████ | 90/100 [06:46<00:45,  4.51s/it]

Epoch 90/100 -- Loss: 0.6181


Training epochs:  91%|█████████ | 91/100 [06:51<00:40,  4.52s/it]

Epoch 91/100 -- Loss: 0.6180


Training epochs:  92%|█████████▏| 92/100 [06:55<00:36,  4.51s/it]

Epoch 92/100 -- Loss: 0.6178


Training epochs:  93%|█████████▎| 93/100 [07:00<00:31,  4.52s/it]

Epoch 93/100 -- Loss: 0.6174


Training epochs:  94%|█████████▍| 94/100 [07:04<00:27,  4.52s/it]

Epoch 94/100 -- Loss: 0.6168


Training epochs:  95%|█████████▌| 95/100 [07:09<00:22,  4.52s/it]

Epoch 95/100 -- Loss: 0.6168


Training epochs:  96%|█████████▌| 96/100 [07:13<00:18,  4.51s/it]

Epoch 96/100 -- Loss: 0.6161


Training epochs:  97%|█████████▋| 97/100 [07:18<00:13,  4.51s/it]

Epoch 97/100 -- Loss: 0.6163


Training epochs:  98%|█████████▊| 98/100 [07:22<00:09,  4.52s/it]

Epoch 98/100 -- Loss: 0.6160


Training epochs:  99%|█████████▉| 99/100 [07:27<00:04,  4.51s/it]

Epoch 99/100 -- Loss: 0.6156


Training epochs: 100%|██████████| 100/100 [07:31<00:00,  4.52s/it]

Epoch 100/100 -- Loss: 0.6156
Ternary Classification using Pretrained embeddings Accuracy: 0.7003





In [35]:
# Build the MLP for ternary classification (self-trained embeddings)
model_tern_self = MLP(input_dim=300, hidden1=50, hidden2=10, output_dim=3)
acc_tern_self = train_model(model_tern_self, X_train_tern_self, y_train_tern, X_test_tern_self, y_test_tern, epochs=100, lr=0.001)
print("Ternary Classification using Self-trained embeddings Accuracy: {:.4f}".format(acc_tern_self))


Training epochs:   1%|          | 1/100 [00:04<07:22,  4.47s/it]

Epoch 1/100 -- Loss: 0.7083


Training epochs:   2%|▏         | 2/100 [00:08<07:16,  4.45s/it]

Epoch 2/100 -- Loss: 0.6722


Training epochs:   3%|▎         | 3/100 [00:13<07:11,  4.45s/it]

Epoch 3/100 -- Loss: 0.6579


Training epochs:   4%|▍         | 4/100 [00:17<07:06,  4.45s/it]

Epoch 4/100 -- Loss: 0.6497


Training epochs:   5%|▌         | 5/100 [00:22<07:02,  4.45s/it]

Epoch 5/100 -- Loss: 0.6423


Training epochs:   6%|▌         | 6/100 [00:26<06:57,  4.44s/it]

Epoch 6/100 -- Loss: 0.6372


Training epochs:   7%|▋         | 7/100 [00:31<06:53,  4.44s/it]

Epoch 7/100 -- Loss: 0.6327


Training epochs:   8%|▊         | 8/100 [00:35<06:48,  4.44s/it]

Epoch 8/100 -- Loss: 0.6294


Training epochs:   9%|▉         | 9/100 [00:40<06:45,  4.46s/it]

Epoch 9/100 -- Loss: 0.6254


Training epochs:  10%|█         | 10/100 [00:44<06:41,  4.46s/it]

Epoch 10/100 -- Loss: 0.6226


Training epochs:  11%|█         | 11/100 [00:48<06:36,  4.46s/it]

Epoch 11/100 -- Loss: 0.6200


Training epochs:  12%|█▏        | 12/100 [00:53<06:31,  4.45s/it]

Epoch 12/100 -- Loss: 0.6176


Training epochs:  13%|█▎        | 13/100 [00:57<06:28,  4.47s/it]

Epoch 13/100 -- Loss: 0.6157


Training epochs:  14%|█▍        | 14/100 [01:02<06:23,  4.46s/it]

Epoch 14/100 -- Loss: 0.6138


Training epochs:  15%|█▌        | 15/100 [01:06<06:18,  4.45s/it]

Epoch 15/100 -- Loss: 0.6116


Training epochs:  16%|█▌        | 16/100 [01:11<06:13,  4.44s/it]

Epoch 16/100 -- Loss: 0.6097


Training epochs:  17%|█▋        | 17/100 [01:15<06:08,  4.44s/it]

Epoch 17/100 -- Loss: 0.6077


Training epochs:  18%|█▊        | 18/100 [01:20<06:04,  4.44s/it]

Epoch 18/100 -- Loss: 0.6066


Training epochs:  19%|█▉        | 19/100 [01:24<06:00,  4.45s/it]

Epoch 19/100 -- Loss: 0.6052


Training epochs:  20%|██        | 20/100 [01:28<05:55,  4.44s/it]

Epoch 20/100 -- Loss: 0.6038


Training epochs:  21%|██        | 21/100 [01:33<05:51,  4.45s/it]

Epoch 21/100 -- Loss: 0.6023


Training epochs:  22%|██▏       | 22/100 [01:37<05:47,  4.46s/it]

Epoch 22/100 -- Loss: 0.6009


Training epochs:  23%|██▎       | 23/100 [01:42<05:43,  4.46s/it]

Epoch 23/100 -- Loss: 0.5999


Training epochs:  24%|██▍       | 24/100 [01:46<05:38,  4.45s/it]

Epoch 24/100 -- Loss: 0.5987


Training epochs:  25%|██▌       | 25/100 [01:51<05:33,  4.45s/it]

Epoch 25/100 -- Loss: 0.5974


Training epochs:  26%|██▌       | 26/100 [01:55<05:29,  4.45s/it]

Epoch 26/100 -- Loss: 0.5968


Training epochs:  27%|██▋       | 27/100 [02:00<05:24,  4.45s/it]

Epoch 27/100 -- Loss: 0.5961


Training epochs:  28%|██▊       | 28/100 [02:04<05:19,  4.44s/it]

Epoch 28/100 -- Loss: 0.5950


Training epochs:  29%|██▉       | 29/100 [02:09<05:16,  4.45s/it]

Epoch 29/100 -- Loss: 0.5939


Training epochs:  30%|███       | 30/100 [02:13<05:11,  4.45s/it]

Epoch 30/100 -- Loss: 0.5935


Training epochs:  31%|███       | 31/100 [02:17<05:06,  4.44s/it]

Epoch 31/100 -- Loss: 0.5921


Training epochs:  32%|███▏      | 32/100 [02:22<05:02,  4.44s/it]

Epoch 32/100 -- Loss: 0.5918


Training epochs:  33%|███▎      | 33/100 [02:26<04:57,  4.44s/it]

Epoch 33/100 -- Loss: 0.5914


Training epochs:  34%|███▍      | 34/100 [02:31<04:54,  4.46s/it]

Epoch 34/100 -- Loss: 0.5902


Training epochs:  35%|███▌      | 35/100 [02:35<04:49,  4.46s/it]

Epoch 35/100 -- Loss: 0.5898


Training epochs:  36%|███▌      | 36/100 [02:40<04:44,  4.45s/it]

Epoch 36/100 -- Loss: 0.5890


Training epochs:  37%|███▋      | 37/100 [02:44<04:41,  4.46s/it]

Epoch 37/100 -- Loss: 0.5883


Training epochs:  38%|███▊      | 38/100 [02:49<04:36,  4.45s/it]

Epoch 38/100 -- Loss: 0.5879


Training epochs:  39%|███▉      | 39/100 [02:53<04:31,  4.45s/it]

Epoch 39/100 -- Loss: 0.5870


Training epochs:  40%|████      | 40/100 [02:57<04:26,  4.44s/it]

Epoch 40/100 -- Loss: 0.5870


Training epochs:  41%|████      | 41/100 [03:02<04:21,  4.43s/it]

Epoch 41/100 -- Loss: 0.5864


Training epochs:  42%|████▏     | 42/100 [03:06<04:16,  4.43s/it]

Epoch 42/100 -- Loss: 0.5853


Training epochs:  43%|████▎     | 43/100 [03:11<04:12,  4.43s/it]

Epoch 43/100 -- Loss: 0.5856


Training epochs:  44%|████▍     | 44/100 [03:15<04:07,  4.42s/it]

Epoch 44/100 -- Loss: 0.5844


Training epochs:  45%|████▌     | 45/100 [03:20<04:03,  4.42s/it]

Epoch 45/100 -- Loss: 0.5843


Training epochs:  46%|████▌     | 46/100 [03:24<03:59,  4.43s/it]

Epoch 46/100 -- Loss: 0.5835


Training epochs:  47%|████▋     | 47/100 [03:28<03:54,  4.43s/it]

Epoch 47/100 -- Loss: 0.5831


Training epochs:  48%|████▊     | 48/100 [03:33<03:50,  4.43s/it]

Epoch 48/100 -- Loss: 0.5828


Training epochs:  49%|████▉     | 49/100 [03:37<03:46,  4.45s/it]

Epoch 49/100 -- Loss: 0.5821


Training epochs:  50%|█████     | 50/100 [03:42<03:42,  4.45s/it]

Epoch 50/100 -- Loss: 0.5818


Training epochs:  51%|█████     | 51/100 [03:46<03:37,  4.44s/it]

Epoch 51/100 -- Loss: 0.5812


Training epochs:  52%|█████▏    | 52/100 [03:51<03:32,  4.43s/it]

Epoch 52/100 -- Loss: 0.5812


Training epochs:  53%|█████▎    | 53/100 [03:55<03:27,  4.43s/it]

Epoch 53/100 -- Loss: 0.5806


Training epochs:  54%|█████▍    | 54/100 [03:59<03:23,  4.42s/it]

Epoch 54/100 -- Loss: 0.5801


Training epochs:  55%|█████▌    | 55/100 [04:04<03:19,  4.44s/it]

Epoch 55/100 -- Loss: 0.5800


Training epochs:  56%|█████▌    | 56/100 [04:08<03:15,  4.44s/it]

Epoch 56/100 -- Loss: 0.5791


Training epochs:  57%|█████▋    | 57/100 [04:13<03:10,  4.44s/it]

Epoch 57/100 -- Loss: 0.5789


Training epochs:  58%|█████▊    | 58/100 [04:17<03:06,  4.43s/it]

Epoch 58/100 -- Loss: 0.5783


Training epochs:  59%|█████▉    | 59/100 [04:22<03:02,  4.45s/it]

Epoch 59/100 -- Loss: 0.5781


Training epochs:  60%|██████    | 60/100 [04:26<02:57,  4.44s/it]

Epoch 60/100 -- Loss: 0.5780


Training epochs:  61%|██████    | 61/100 [04:31<02:53,  4.44s/it]

Epoch 61/100 -- Loss: 0.5778


Training epochs:  62%|██████▏   | 62/100 [04:35<02:48,  4.43s/it]

Epoch 62/100 -- Loss: 0.5774


Training epochs:  63%|██████▎   | 63/100 [04:39<02:43,  4.43s/it]

Epoch 63/100 -- Loss: 0.5773


Training epochs:  64%|██████▍   | 64/100 [04:44<02:39,  4.42s/it]

Epoch 64/100 -- Loss: 0.5766


Training epochs:  65%|██████▌   | 65/100 [04:48<02:34,  4.42s/it]

Epoch 65/100 -- Loss: 0.5767


Training epochs:  66%|██████▌   | 66/100 [04:53<02:30,  4.42s/it]

Epoch 66/100 -- Loss: 0.5759


Training epochs:  67%|██████▋   | 67/100 [04:57<02:26,  4.42s/it]

Epoch 67/100 -- Loss: 0.5756


Training epochs:  68%|██████▊   | 68/100 [05:02<02:21,  4.44s/it]

Epoch 68/100 -- Loss: 0.5755


Training epochs:  69%|██████▉   | 69/100 [05:06<02:18,  4.46s/it]

Epoch 69/100 -- Loss: 0.5751


Training epochs:  70%|███████   | 70/100 [05:11<02:13,  4.46s/it]

Epoch 70/100 -- Loss: 0.5753


Training epochs:  71%|███████   | 71/100 [05:15<02:09,  4.46s/it]

Epoch 71/100 -- Loss: 0.5748


Training epochs:  72%|███████▏  | 72/100 [05:19<02:04,  4.44s/it]

Epoch 72/100 -- Loss: 0.5743


Training epochs:  73%|███████▎  | 73/100 [05:24<01:59,  4.43s/it]

Epoch 73/100 -- Loss: 0.5744


Training epochs:  74%|███████▍  | 74/100 [05:28<01:55,  4.42s/it]

Epoch 74/100 -- Loss: 0.5737


Training epochs:  75%|███████▌  | 75/100 [05:33<01:50,  4.43s/it]

Epoch 75/100 -- Loss: 0.5737


Training epochs:  76%|███████▌  | 76/100 [05:37<01:46,  4.43s/it]

Epoch 76/100 -- Loss: 0.5735


Training epochs:  77%|███████▋  | 77/100 [05:42<01:41,  4.43s/it]

Epoch 77/100 -- Loss: 0.5729


Training epochs:  78%|███████▊  | 78/100 [05:46<01:37,  4.43s/it]

Epoch 78/100 -- Loss: 0.5732


Training epochs:  79%|███████▉  | 79/100 [05:50<01:33,  4.43s/it]

Epoch 79/100 -- Loss: 0.5728


Training epochs:  80%|████████  | 80/100 [05:55<01:28,  4.44s/it]

Epoch 80/100 -- Loss: 0.5726


Training epochs:  81%|████████  | 81/100 [05:59<01:24,  4.43s/it]

Epoch 81/100 -- Loss: 0.5720


Training epochs:  82%|████████▏ | 82/100 [06:04<01:19,  4.42s/it]

Epoch 82/100 -- Loss: 0.5716


Training epochs:  83%|████████▎ | 83/100 [06:08<01:15,  4.43s/it]

Epoch 83/100 -- Loss: 0.5715


Training epochs:  84%|████████▍ | 84/100 [06:13<01:10,  4.43s/it]

Epoch 84/100 -- Loss: 0.5717


Training epochs:  85%|████████▌ | 85/100 [06:17<01:06,  4.42s/it]

Epoch 85/100 -- Loss: 0.5711


Training epochs:  86%|████████▌ | 86/100 [06:21<01:01,  4.42s/it]

Epoch 86/100 -- Loss: 0.5710


Training epochs:  87%|████████▋ | 87/100 [06:26<00:57,  4.42s/it]

Epoch 87/100 -- Loss: 0.5709


Training epochs:  88%|████████▊ | 88/100 [06:30<00:52,  4.41s/it]

Epoch 88/100 -- Loss: 0.5704


Training epochs:  89%|████████▉ | 89/100 [06:35<00:48,  4.41s/it]

Epoch 89/100 -- Loss: 0.5704


Training epochs:  90%|█████████ | 90/100 [06:39<00:44,  4.41s/it]

Epoch 90/100 -- Loss: 0.5703


Training epochs:  91%|█████████ | 91/100 [06:43<00:39,  4.42s/it]

Epoch 91/100 -- Loss: 0.5703


Training epochs:  92%|█████████▏| 92/100 [06:48<00:35,  4.43s/it]

Epoch 92/100 -- Loss: 0.5698


Training epochs:  93%|█████████▎| 93/100 [06:52<00:30,  4.43s/it]

Epoch 93/100 -- Loss: 0.5697


Training epochs:  94%|█████████▍| 94/100 [06:57<00:26,  4.43s/it]

Epoch 94/100 -- Loss: 0.5694


Training epochs:  95%|█████████▌| 95/100 [07:01<00:22,  4.44s/it]

Epoch 95/100 -- Loss: 0.5689


Training epochs:  96%|█████████▌| 96/100 [07:06<00:17,  4.44s/it]

Epoch 96/100 -- Loss: 0.5693


Training epochs:  97%|█████████▋| 97/100 [07:10<00:13,  4.47s/it]

Epoch 97/100 -- Loss: 0.5690


Training epochs:  98%|█████████▊| 98/100 [07:15<00:08,  4.47s/it]

Epoch 98/100 -- Loss: 0.5691


Training epochs:  99%|█████████▉| 99/100 [07:19<00:04,  4.46s/it]

Epoch 99/100 -- Loss: 0.5684


Training epochs: 100%|██████████| 100/100 [07:24<00:00,  4.44s/it]

Epoch 100/100 -- Loss: 0.5684
Ternary Classification using Self-trained embeddings Accuracy: 0.7173





# Concating 10 vecters gave better output than the average vector.
# we can see a significant gap between the binary and ternary vectors as the binary classification performs well 



In [36]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.utils import simple_preprocess
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm

# ----- 1. Function to obtain a fixed-length review embedding matrix -----
def get_review_embedding(review, model, max_length=50):
    """
    Tokenize the review using simple_preprocess.
    For each token found in the Word2Vec model, get its 300-dim vector.
    Truncate if the review is longer than max_length tokens; pad with zero vectors if shorter.
    
    Returns a matrix of shape (max_length, 300).
    """
    tokens = simple_preprocess(str(review))
    lookup = model.wv if hasattr(model, 'wv') else model
    embeddings = []
    for token in tokens[:max_length]:
        if token in lookup.key_to_index:
            embeddings.append(lookup[token])
        else:
            embeddings.append(np.zeros(lookup.vector_size))
    # Pad with zeros if needed.
    while len(embeddings) < max_length:
        embeddings.append(np.zeros(lookup.vector_size))
    return np.array(embeddings)

# ----- 2. Prepare the binary dataset (only sentiments 1 and 2) -----
# Use your existing DataFrame (balanced_data) which has 'review_body' and 'sentiment'
binary_data = balanced_data[balanced_data['sentiment'].isin([1, 2])].copy()
# Map sentiment 1 -> 0 and sentiment 2 -> 1
binary_data['label'] = binary_data['sentiment'].apply(lambda x: 0 if x == 1 else 1)
# (Optional) To reduce memory load during development, limit the dataset to a smaller sample:
binary_data = binary_data.sample(n=2000, random_state=42)

# ----- 3. Load the pretrained Google News Word2Vec Model -----
google_model = api.load("word2vec-google-news-300")

# ----- 4. Generate the review embedding matrices for each review -----
# Each review will be represented as a (50,300) matrix.
features = []
for review in tqdm(binary_data['review_body'], desc="Generating review embeddings"):
    features.append(get_review_embedding(review, google_model, max_length=50))
features = np.array(features)  # shape: (n_samples, 50, 300)
labels = binary_data['label'].values

# ----- 5. Split into Training and Testing Sets -----
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Convert to PyTorch tensors.
# For CNN, we want input shape: (batch, channels, height, width)
# Here, height=max_length (50 tokens) and width=300 (embedding size). We set channel=1.
X_train = torch.FloatTensor(X_train)  # shape: (N,50,300)
X_test = torch.FloatTensor(X_test)
y_train = torch.LongTensor(y_train)
y_test = torch.LongTensor(y_test)

# Add channel dimension: convert (N,50,300) -> (N,1,50,300)
X_train = X_train.unsqueeze(1)
X_test = X_test.unsqueeze(1)

# ----- 6. Define a Simple CNN Model -----
class TextCNN(nn.Module):
    def __init__(self, num_classes=2):
        super(TextCNN, self).__init__()
        # First conv: use a kernel that spans the entire embedding dimension.
        # Input shape: (batch, 1, 50, 300) -> Output: (batch, 50, 48, 1)
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=50, kernel_size=(3, 300))
        # Second conv: will work on the sequence dimension.
        # Input shape: (batch, 50, 48, 1) -> Output: (batch, 10, 46, 1) if kernel_size=(3,1) and out_channels=10.
        self.conv2 = nn.Conv2d(in_channels=50, out_channels=10, kernel_size=(3, 1))
        self.relu = nn.ReLU()
        # Calculate the flattened feature size: after conv2, height = 48-3+1 = 46 and width = 1.
        self.fc = nn.Linear(10 * 46, num_classes)
    
    def forward(self, x):
        x = self.relu(self.conv1(x))  # (batch,50,48,1)
        x = self.relu(self.conv2(x))  # (batch,10,46,1)
        x = x.view(x.size(0), -1)       # flatten: (batch, 10*46)
        x = self.fc(x)
        return x

# ----- 7. Training Function using mini-batching on CUDA if available -----
def train_cnn(model, X_train, y_train, X_test, y_test, epochs=10, lr=0.001, batch_size=16):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    X_train = X_train.to(device)
    y_train = y_train.to(device)
    X_test = X_test.to(device)
    y_test = y_test.to(device)
    
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs} complete. Loss: {epoch_loss/len(train_loader):.4f}")
    
    model.eval()
    with torch.no_grad():
        outputs = model(X_test)
        _, predicted = torch.max(outputs, 1)
        test_acc = accuracy_score(y_test.cpu().numpy(), predicted.cpu().numpy())
    return test_acc

# ----- 8. Instantiate, Train, and Evaluate the CNN Model -----
cnn_model = TextCNN(num_classes=2)
accuracy_cnn = train_cnn(cnn_model, X_train, y_train, X_test, y_test, epochs=10, lr=0.001, batch_size=16)
print("CNN Accuracy (Binary Classification): {:.4f}".format(accuracy_cnn))


Generating review embeddings: 100%|██████████| 2000/2000 [00:00<00:00, 9384.75it/s]


Epoch 1/10 complete. Loss: 0.6665
Epoch 2/10 complete. Loss: 0.5176
Epoch 3/10 complete. Loss: 0.4361
Epoch 4/10 complete. Loss: 0.3752
Epoch 5/10 complete. Loss: 0.2762
Epoch 6/10 complete. Loss: 0.2116
Epoch 7/10 complete. Loss: 0.1438
Epoch 8/10 complete. Loss: 0.1038
Epoch 9/10 complete. Loss: 0.0604
Epoch 10/10 complete. Loss: 0.0429
CNN Accuracy (Binary Classification): 0.7900


In [37]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.utils import simple_preprocess
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm

#############################################
# 1. Helper function: Fixed-length Review Embed.
#############################################
def get_review_embedding(review, model, max_length=50):
    """
    Tokenizes the review using gensim's simple_preprocess.
    For each token (up to max_length), retrieves its 300-dim vector
    from the given Word2Vec key–vector mapping.
    
    If the token is not found, a 300-dim zero vector is used.
    The result is a (max_length, 300) NumPy array.
    """
    tokens = simple_preprocess(str(review))
    # If the model has attribute 'wv' use that (for a full Word2Vec model),
    # otherwise use the model directly (if it is a KeyedVectors object).
    lookup = model.wv if hasattr(model, 'wv') else model
    embeddings = []
    for token in tokens[:max_length]:
        if token in lookup.key_to_index:
            embeddings.append(lookup[token])
        else:
            embeddings.append(np.zeros(lookup.vector_size))
    # Pad reviews shorter than max_length with zero vectors.
    while len(embeddings) < max_length:
        embeddings.append(np.zeros(lookup.vector_size))
    return np.array(embeddings)

#############################################
# 2. Prepare Binary Dataset (Sentiments 1 and 2)
#############################################
binary_data = balanced_data[balanced_data['sentiment'].isin([1, 2])].copy()
# Map sentiment 1 -> label 0 and sentiment 2 -> label 1.
binary_data['label'] = binary_data['sentiment'].apply(lambda x: 0 if x == 1 else 1)
# (Optional) To reduce memory usage during testing, sample a subset:
binary_data = binary_data.sample(n=2000, random_state=42)

#############################################
# 3. Load Word2Vec Models
#############################################
# Pre-trained model: Google News vectors.
google_model = api.load("word2vec-google-news-300")
# Self-trained model: assume it’s already built and available as w2v_model.
# (If w2v_model was saved using KeyedVectors, then it does not have a .wv attribute.)

#############################################
# 4. Generate Review Embeddings for each review.
#############################################
# Each review is represented as a matrix of shape (50, 300)
def generate_review_embeddings(df, model, max_length=50):
    features = []
    for review in tqdm(df['review_body'], desc="Embedding reviews"):
        features.append(get_review_embedding(review, model, max_length))
    return np.array(features)

# Create features using the pre-trained model:
X_pre = generate_review_embeddings(binary_data, google_model, max_length=50)
# Create features using the self-trained model:
X_self = generate_review_embeddings(binary_data, w2v_model, max_length=50)
y = binary_data['label'].values

#############################################
# 5. Split the Data (80% Train, 20% Test)
#############################################
X_train_pre, X_test_pre, y_train, y_test = train_test_split(X_pre, y, test_size=0.2, random_state=42)
X_train_self, X_test_self, _, _ = train_test_split(X_self, y, test_size=0.2, random_state=42)

#############################################
# 6. Convert Data to PyTorch Tensors and Reshape
#############################################
# For a CNN, input shape should be (N, channels, height, width).
# Here, height = max_length (50 tokens) and width = 300 (embedding size); we set channels = 1.
def to_tensor(X, y=None):
    X_tensor = torch.FloatTensor(X)
    if y is not None:
        y_tensor = torch.LongTensor(y)
        return X_tensor, y_tensor
    return X_tensor

X_train_pre, y_train = to_tensor(X_train_pre, y_train)
X_test_pre, y_test   = to_tensor(X_test_pre, y_test)
X_train_self = to_tensor(X_train_self)
X_test_self  = to_tensor(X_test_self)

# Add a channel dimension: (N, 50, 300) -> (N, 1, 50, 300)
X_train_pre = X_train_pre.unsqueeze(1)
X_test_pre  = X_test_pre.unsqueeze(1)
X_train_self = X_train_self.unsqueeze(1)
X_test_self  = X_test_self.unsqueeze(1)

#############################################
# 7. Define a Simple CNN for Sentiment Classification
#############################################
class TextCNN(nn.Module):
    def __init__(self, num_classes=2):
        super(TextCNN, self).__init__()
        # First convolution: kernel size covers 3 tokens and entire embedding.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=50, kernel_size=(3, 300))
        # Second convolution: further extract features from the sequence dimension.
        self.conv2 = nn.Conv2d(in_channels=50, out_channels=10, kernel_size=(3, 1))
        self.relu = nn.ReLU()
        # Calculate flattened feature size:
        # After conv1: if input height=50, kernel_height=3, output height = 50-3+1 = 48.
        # After conv2: output height = 48-3+1 = 46, width remains 1.
        self.fc = nn.Linear(10 * 46, num_classes)
    
    def forward(self, x):
        x = self.relu(self.conv1(x))  # shape: (N, 50, 48, 1)
        x = self.relu(self.conv2(x))  # shape: (N, 10, 46, 1)
        x = x.view(x.size(0), -1)      # flatten to (N, 10*46)
        x = self.fc(x)
        return x

#############################################
# 8. Define the Training Procedure Using Mini-Batches and CUDA
#############################################
def train_cnn(model, X_train, y_train, X_test, y_test, epochs=10, lr=0.001, batch_size=16):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    X_train = X_train.to(device)
    y_train = y_train.to(device)
    X_test = X_test.to(device)
    y_test = y_test.to(device)
    
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs} -- Loss: {epoch_loss/len(train_loader):.4f}")
    
    model.eval()
    with torch.no_grad():
        outputs = model(X_test)
        _, predicted = torch.max(outputs, 1)
        test_acc = accuracy_score(y_test.cpu().numpy(), predicted.cpu().numpy())
    return test_acc

#############################################
# 9. Train and Evaluate the CNN Models
#############################################
# (A) Using Pre-trained Embeddings:
cnn_model_pre = TextCNN(num_classes=2)
acc_cnn_pre = train_cnn(cnn_model_pre, X_train_pre, y_train, X_test_pre, y_test,
                        epochs=10, lr=0.001, batch_size=16)
print("CNN Accuracy (Pre-trained): {:.4f}".format(acc_cnn_pre))

# (B) Using Self-trained Embeddings:
cnn_model_self = TextCNN(num_classes=2)
acc_cnn_self = train_cnn(cnn_model_self, X_train_self, y_train, X_test_self, y_test,
                         epochs=10, lr=0.001, batch_size=16)
print("CNN Accuracy (Self-trained): {:.4f}".format(acc_cnn_self))


Embedding reviews: 100%|██████████| 2000/2000 [00:00<00:00, 8717.12it/s]
Embedding reviews: 100%|██████████| 2000/2000 [00:00<00:00, 10572.20it/s]


Epoch 1/10 -- Loss: 0.6498
Epoch 2/10 -- Loss: 0.5380
Epoch 3/10 -- Loss: 0.4480
Epoch 4/10 -- Loss: 0.3896
Epoch 5/10 -- Loss: 0.3277
Epoch 6/10 -- Loss: 0.2472
Epoch 7/10 -- Loss: 0.1773
Epoch 8/10 -- Loss: 0.1120
Epoch 9/10 -- Loss: 0.0777
Epoch 10/10 -- Loss: 0.0365
CNN Accuracy (Pre-trained): 0.8000
Epoch 1/10 -- Loss: 0.6444
Epoch 2/10 -- Loss: 0.4004
Epoch 3/10 -- Loss: 0.2281
Epoch 4/10 -- Loss: 0.1000
Epoch 5/10 -- Loss: 0.0333
Epoch 6/10 -- Loss: 0.0151
Epoch 7/10 -- Loss: 0.0080
Epoch 8/10 -- Loss: 0.0059
Epoch 9/10 -- Loss: 0.0049
Epoch 10/10 -- Loss: 0.0041
CNN Accuracy (Self-trained): 0.7875


In [27]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.utils import simple_preprocess
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm

#############################################
# 1. Helper function to generate a fixed-length review embedding matrix
#############################################
def get_review_embedding(review, model, max_length=50):
    """
    Tokenizes the review using gensim's simple_preprocess.
    Retrieves a 300-dimensional vector for each token from the given model.
    If the token is not found, uses a zero vector.
    Truncates to max_length tokens or pads with zeros if needed.
    Returns a numpy array of shape (max_length, 300).
    """
    tokens = simple_preprocess(str(review))
    # Use model.wv if available (for a full Word2Vec model), otherwise use model directly.
    lookup = model.wv if hasattr(model, 'wv') else model
    embeddings = []
    for token in tokens[:max_length]:
        if token in lookup.key_to_index:
            embeddings.append(lookup[token])
        else:
            embeddings.append(np.zeros(lookup.vector_size))
    while len(embeddings) < max_length:
        embeddings.append(np.zeros(lookup.vector_size))
    return np.array(embeddings)

#############################################
# 2. Prepare the Ternary Dataset
#############################################
# Map sentiment 1,2,3 to labels 0,1,2.
ternary_data = balanced_data.copy()
ternary_data['label'] = ternary_data['sentiment'] - 1
# To limit resource usage, sample a subset (adjust n as needed).
ternary_data = ternary_data.sample(n=2000, random_state=42)

#############################################
# 3. Load Word2Vec Models
#############################################
# Pre-trained model (Google News):
google_model = api.load("word2vec-google-news-300")
# Self-trained model is assumed available as w2v_model.

#############################################
# 4. Generate fixed-length review embeddings for each review
#############################################
def generate_review_embeddings(df, model, max_length=50):
    features = []
    for review in tqdm(df['review_body'], desc="Generating review embeddings"):
        features.append(get_review_embedding(review, model, max_length))
    return np.array(features)

# Generate features for both variants:
X_tern_pre = generate_review_embeddings(ternary_data, google_model, max_length=50)
X_tern_self = generate_review_embeddings(ternary_data, w2v_model, max_length=50)
y_tern = ternary_data['label'].values

#############################################
# 5. Split Data into Training and Testing Sets (80/20 Split)
#############################################
X_train_pre, X_test_pre, y_train, y_test = train_test_split(X_tern_pre, y_tern, test_size=0.2, random_state=3)
X_train_self, X_test_self, _, _ = train_test_split(X_tern_self, y_tern, test_size=0.2, random_state=3)

#############################################
# 6. Convert to PyTorch Tensors and reshape for CNN input
#############################################
def to_tensor(X, y=None):
    X_tensor = torch.FloatTensor(X)
    if y is not None:
        y_tensor = torch.LongTensor(y)
    else:
        y_tensor = None
    return X_tensor, y_tensor

X_train_pre, y_train = to_tensor(X_train_pre, y_train)
X_test_pre, y_test   = to_tensor(X_test_pre, y_test)
X_train_self, _ = to_tensor(X_train_self, None)
X_test_self, _  = to_tensor(X_test_self, None)

# For CNN, add a channel dimension:
# Current shape: (N, 50, 300) -> Desired shape: (N, 1, 50, 300)
X_train_pre = X_train_pre.unsqueeze(1)
X_test_pre  = X_test_pre.unsqueeze(1)
X_train_self = X_train_self.unsqueeze(1)
X_test_self  = X_test_self.unsqueeze(1)

#############################################
# 7. Define the CNN Model Architecture for Ternary Classification
#############################################
class TextCNN(nn.Module):
    def __init__(self, num_classes=3):
        super(TextCNN, self).__init__()
        # First convolution: kernel spans 3 tokens and entire embedding dimension.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=50, kernel_size=(3, 300))
        # Second convolution: further processes the sequence (height) dimension.
        self.conv2 = nn.Conv2d(in_channels=50, out_channels=10, kernel_size=(3, 1))
        self.relu = nn.ReLU()
        # After conv1: height = 50-3+1 = 48.
        # After conv2: height = 48-3+1 = 46; width remains 1.
        self.fc = nn.Linear(10 * 46, num_classes)
    
    def forward(self, x):
        x = self.relu(self.conv1(x))  # (N, 50, 48, 1)
        x = self.relu(self.conv2(x))  # (N, 10, 46, 1)
        x = x.view(x.size(0), -1)       # Flatten to (N, 10*46)
        x = self.fc(x)
        return x

#############################################
# 8. Define the Training Function (with mini-batching and CUDA)
#############################################
def train_cnn(model, X_train, y_train, X_test, y_test, epochs=100, lr=0.00001, batch_size=8):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    X_train = X_train.to(device)
    y_train = y_train.to(device)
    X_test  = X_test.to(device)
    y_test  = y_test.to(device)
    
    train_dataset = TensorDataset(X_train, y_train)
    train_loader  = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs} -- Loss: {epoch_loss/len(train_loader):.4f}")
    
    model.eval()
    with torch.no_grad():
        outputs = model(X_test)
        _, predicted = torch.max(outputs, 1)
        test_acc = accuracy_score(y_test.cpu().numpy(), predicted.cpu().numpy())
    return test_acc

#############################################
# 9. Train and Evaluate the CNN for Ternary Classification
#############################################
# (A) Using Pre-trained Embeddings:
cnn_model_tern_pre = TextCNN(num_classes=3)
acc_tern_pre = train_cnn(cnn_model_tern_pre, X_train_pre, y_train, X_test_pre, y_test, epochs=100, lr=0.00001, batch_size=8)
print("Ternary Classification using Pre-trained Embeddings Accuracy: {:.4f}".format(acc_tern_pre))

# (B) Using Self-trained Embeddings:
cnn_model_tern_self = TextCNN(num_classes=3)
acc_tern_self = train_cnn(cnn_model_tern_self, X_train_self, y_train, X_test_self, y_test, epochs=100, lr=0.00001, batch_size=8)
print("Ternary Classification using Self-trained Embeddings Accuracy: {:.4f}".format(acc_tern_self))


Generating review embeddings: 100%|██████████| 2000/2000 [00:00<00:00, 6932.16it/s]
Generating review embeddings: 100%|██████████| 2000/2000 [00:00<00:00, 7359.99it/s]


Epoch 1/100 -- Loss: 1.0929
Epoch 2/100 -- Loss: 1.0856
Epoch 3/100 -- Loss: 1.0774
Epoch 4/100 -- Loss: 1.0686
Epoch 5/100 -- Loss: 1.0596
Epoch 6/100 -- Loss: 1.0518
Epoch 7/100 -- Loss: 1.0460
Epoch 8/100 -- Loss: 1.0419
Epoch 9/100 -- Loss: 1.0390
Epoch 10/100 -- Loss: 1.0368
Epoch 11/100 -- Loss: 1.0347
Epoch 12/100 -- Loss: 1.0328
Epoch 13/100 -- Loss: 1.0311
Epoch 14/100 -- Loss: 1.0291
Epoch 15/100 -- Loss: 1.0272
Epoch 16/100 -- Loss: 1.0252
Epoch 17/100 -- Loss: 1.0232
Epoch 18/100 -- Loss: 1.0209
Epoch 19/100 -- Loss: 1.0187
Epoch 20/100 -- Loss: 1.0163
Epoch 21/100 -- Loss: 1.0139
Epoch 22/100 -- Loss: 1.0112
Epoch 23/100 -- Loss: 1.0087
Epoch 24/100 -- Loss: 1.0058
Epoch 25/100 -- Loss: 1.0029
Epoch 26/100 -- Loss: 0.9999
Epoch 27/100 -- Loss: 0.9969
Epoch 28/100 -- Loss: 0.9938
Epoch 29/100 -- Loss: 0.9905
Epoch 30/100 -- Loss: 0.9871
Epoch 31/100 -- Loss: 0.9836
Epoch 32/100 -- Loss: 0.9800
Epoch 33/100 -- Loss: 0.9766
Epoch 34/100 -- Loss: 0.9727
Epoch 35/100 -- Loss: 0

# The best accuracy for the above code was obtained when I computed on CARC A100 gpus but they were not available during final execution, so here is the best i could get for the above code.
