In [1]:
!pip install scikit-learn gensim

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

from gensim.models import Word2Vec


Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m75.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [3]:
# 2. Load the Kaggle dataset (10k-row sample)
import pandas as pd

df = pd.read_csv('/content/train-processed-sample.csv', encoding='utf-8')

# Only the last expression of a cell is rendered automatically, so the preview
# is displayed explicitly; the shape is then shown as the cell's result.
display(df.head())
df.shape


(10000, 3)

In [6]:
# Keep only the two columns used below (assumed to be named `text` and
# `sentiment`) and drop every row in which either of them is missing.
df = df[['text', 'sentiment']].dropna()

# Shuffle reproducibly, keeping at most 10,000 rows. The sample size is clamped
# to len(df) because DataFrame.sample (without replacement) raises ValueError
# when n exceeds the number of rows, which happens as soon as dropna() above
# removes a single row from the 10,000-row file.
df = df.sample(n=min(10000, len(df)), random_state=42)
df.head()


Unnamed: 0,text,sentiment
7217,@machfairy hahahahha. i knew you wanted to be ...,1
8291,@JOBROLOVER718 yea... It gets annoying when th...,1
4607,I want smores. Also the Samsung Alias looks re...,0
5114,@tanushreebaruah no start sports at work We n...,0
1859,you're all a bunch of revolting cocks,1


In [8]:
# Train–test split: hold out 20% of the rows, stratified on the sentiment
# label, with a fixed seed so the partition is repeatable.
texts, labels = df['text'], df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)


In [9]:
# 3. CountVectorizer with ngram_range=(1,3)
# Raw counts over unigrams, bigrams and trigrams, English stop words removed.
cv = CountVectorizer(stop_words='english', ngram_range=(1, 3))

# The vocabulary is learned from the training texts only; the test texts are
# then mapped onto that same vocabulary.
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

(X_train_cv.shape, X_test_cv.shape)


((8000, 101090), (2000, 101090))

In [10]:
# 3.1 Logistic Regression + CountVectorizer
# fit() returns the estimator itself, so construction and training are chained.
log_reg_cv = LogisticRegression(max_iter=1000).fit(X_train_cv, y_train)
pred_cv_lr = log_reg_cv.predict(X_test_cv)

# Held-out accuracy followed by the per-class precision / recall / F1 report.
acc_cv_lr = accuracy_score(y_test, pred_cv_lr)
print("CountVectorizer (1,3) + LR accuracy:", acc_cv_lr)
print(classification_report(y_test, pred_cv_lr))


CountVectorizer (1,3) + LR accuracy: 0.717
              precision    recall  f1-score   support

           0       0.72      0.70      0.71       999
           1       0.71      0.73      0.72      1001

    accuracy                           0.72      2000
   macro avg       0.72      0.72      0.72      2000
weighted avg       0.72      0.72      0.72      2000



In [11]:
# 3.2 MultinomialNB + CountVectorizer
nb_cv = MultinomialNB().fit(X_train_cv, y_train)
pred_cv_nb = nb_cv.predict(X_test_cv)

# Held-out accuracy followed by the per-class report.
print(
    "CountVectorizer (1,3) + MultinomialNB accuracy:",
    accuracy_score(y_test, pred_cv_nb),
)
print(classification_report(y_test, pred_cv_nb))


CountVectorizer (1,3) + MultinomialNB accuracy: 0.7205
              precision    recall  f1-score   support

           0       0.71      0.75      0.73       999
           1       0.74      0.69      0.71      1001

    accuracy                           0.72      2000
   macro avg       0.72      0.72      0.72      2000
weighted avg       0.72      0.72      0.72      2000



In [12]:
# 4. TfidfVectorizer with ngram_range=(1,3)
# Same 1- to 3-gram range and English stop-word list as the count features,
# but with TF-IDF weighting.
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))

X_train_tf = tfidf.fit_transform(X_train)  # fitted on the training texts only
X_test_tf = tfidf.transform(X_test)        # reuses the fitted vectorizer

(X_train_tf.shape, X_test_tf.shape)

((8000, 101090), (2000, 101090))

In [13]:
# 4.1 Logistic Regression + TF-IDF
log_reg_tf = LogisticRegression(max_iter=1000).fit(X_train_tf, y_train)
pred_tf_lr = log_reg_tf.predict(X_test_tf)

# Held-out accuracy followed by the per-class report.
acc_tf_lr = accuracy_score(y_test, pred_tf_lr)
print("TFIDF (1,3) + LR accuracy:", acc_tf_lr)
print(classification_report(y_test, pred_tf_lr))

TFIDF (1,3) + LR accuracy: 0.709
              precision    recall  f1-score   support

           0       0.71      0.70      0.71       999
           1       0.71      0.71      0.71      1001

    accuracy                           0.71      2000
   macro avg       0.71      0.71      0.71      2000
weighted avg       0.71      0.71      0.71      2000



In [14]:
# 4.2 MultinomialNB + TF-IDF
nb_tf = MultinomialNB().fit(X_train_tf, y_train)
pred_tf_nb = nb_tf.predict(X_test_tf)

# Held-out accuracy followed by the per-class report.
print(
    "TFIDF (1,3) + MultinomialNB accuracy:",
    accuracy_score(y_test, pred_tf_nb),
)
print(classification_report(y_test, pred_tf_nb))

TFIDF (1,3) + MultinomialNB accuracy: 0.725
              precision    recall  f1-score   support

           0       0.71      0.75      0.73       999
           1       0.74      0.70      0.72      1001

    accuracy                           0.72      2000
   macro avg       0.73      0.73      0.72      2000
weighted avg       0.73      0.72      0.72      2000



In [16]:
# 5. Word2Vec averaged embeddings
# Each document is represented by the average of its word vectors.

# Tokenize on whitespace.
train_tokens = [t.split() for t in X_train]
test_tokens  = [t.split() for t in X_test]

# Train the embeddings on the training split only.
# The seed is fixed and training is restricted to one worker thread: per the
# gensim documentation a seed alone does not make multi-threaded training
# deterministic. (Reproducibility across interpreter launches additionally
# requires a fixed PYTHONHASHSEED.)
w2v_model = Word2Vec(
    sentences=train_tokens,
    vector_size=100,
    window=5,
    min_count=2,
    workers=1,
    seed=42,
    epochs=10
)

def doc_vector(tokens, model=None):
    """Return the mean word vector of a tokenized document.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document. Tokens missing from the model's vocabulary
        are ignored.
    model : optional
        Object exposing a ``wv`` attribute that supports ``in``, item lookup
        and ``vector_size``. Defaults to the notebook-level ``w2v_model``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``wv.vector_size``: the element-wise mean of the
        known tokens' vectors, or all zeros when no token is in the vocabulary.
    """
    wv = (w2v_model if model is None else model).wv
    vecs = [wv[w] for w in tokens if w in wv]
    if not vecs:
        # Size the fallback from the model rather than a hard-coded 100 so the
        # function stays correct if the embedding dimension is changed.
        return np.zeros(wv.vector_size)
    return np.mean(vecs, axis=0)

# Build one averaged vector per document; rows follow the order of the token lists.
X_train_w2v = np.vstack([doc_vector(toks) for toks in train_tokens])
X_test_w2v  = np.vstack([doc_vector(toks) for toks in test_tokens])

# A bare expression in the middle of a cell is evaluated and discarded, so the
# shapes are printed explicitly.
print("Word2Vec feature shapes:", X_train_w2v.shape, X_test_w2v.shape)

# Logistic Regression on the averaged embeddings.
log_reg_w2v = LogisticRegression(max_iter=1000)
log_reg_w2v.fit(X_train_w2v, y_train)
pred_w2v_lr = log_reg_w2v.predict(X_test_w2v)

print("Word2Vec avg + LR accuracy:", accuracy_score(y_test, pred_w2v_lr))
print(classification_report(y_test, pred_w2v_lr))

Word2Vec avg + LR accuracy: 0.634
              precision    recall  f1-score   support

           0       0.64      0.62      0.63       999
           1       0.63      0.65      0.64      1001

    accuracy                           0.63      2000
   macro avg       0.63      0.63      0.63      2000
weighted avg       0.63      0.63      0.63      2000



In [28]:
# 6. Hyperparameter tuning with GridSearchCV

from sklearn.model_selection import GridSearchCV

# 6.1 Tune C for Logistic Regression (TF-IDF features)
# NOTE(review): X_train_tf comes from a vectorizer fitted on all of X_train, so
# every validation fold has already contributed to the vocabulary / IDF
# weights. If unbiased CV scores matter, consider searching over a Pipeline
# that contains the vectorizer.
param_grid_lr = {'C': [0.01, 0.1, 1, 10]}

grid_lr = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid_lr,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid_lr.fit(X_train_tf, y_train)

print("Best C for LR:", grid_lr.best_params_['C'])
print("Best CV score (LR):", grid_lr.best_score_)

# Score the winning configuration on the held-out test split.
best_lr = grid_lr.best_estimator_
pred_best_lr = best_lr.predict(X_test_tf)
print("Test accuracy (best LR + TFIDF):", accuracy_score(y_test, pred_best_lr))


# 6.2 Tune alpha for MultinomialNB (CountVectorizer features)
# NOTE(review): X_train_cv comes from a vectorizer fitted on all of X_train, so
# the validation folds share the vocabulary learned from the whole training
# split. Consider searching over a Pipeline if unbiased CV scores matter.
param_grid_nb = {'alpha': [0.01, 0.1, 1, 5, 10]}

grid_nb = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=param_grid_nb,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid_nb.fit(X_train_cv, y_train)

print("Best alpha for NB:", grid_nb.best_params_['alpha'])
print("Best CV score (NB):", grid_nb.best_score_)

# Score the winning configuration on the held-out test split.
best_nb = grid_nb.best_estimator_
pred_best_nb = best_nb.predict(X_test_cv)
print("Test accuracy (best NB + CountVectorizer):", accuracy_score(y_test, pred_best_nb))


Best C for LR: 10
Best CV score (LR): 0.7127505535926538
Test accuracy (best LR + TFIDF): 0.718
Best alpha for NB: 5
Best CV score (NB): 0.7068758190672527
Test accuracy (best NB + CountVectorizer): 0.7225


In [29]:
# Gather the test accuracy of every (vectorizer, model) combination evaluated
# above into one comparison table. Each entry pairs the row labels with the
# predictions that are scored against y_test.
_runs = [
    ('Count (1,3)', 'LogReg', pred_cv_lr),
    ('Count (1,3)', 'MultinomialNB', pred_cv_nb),
    ('TFIDF (1,3)', 'LogReg', pred_tf_lr),
    ('TFIDF (1,3)', 'MultinomialNB', pred_tf_nb),
    ('Word2Vec avg', 'LogReg', pred_w2v_lr),
    ('TFIDF (1,3)', 'LogReg (GridSearch best)', pred_best_lr),
    ('Count (1,3)', 'MultinomialNB (GridSearch best)', pred_best_nb),
]

results = pd.DataFrame(
    [[vec, model, accuracy_score(y_test, preds)] for vec, model, preds in _runs],
    columns=['Vectorizer', 'Model', 'Test accuracy'],
)

results


Unnamed: 0,Vectorizer,Model,Test accuracy
0,"Count (1,3)",LogReg,0.717
1,"Count (1,3)",MultinomialNB,0.7205
2,"TFIDF (1,3)",LogReg,0.709
3,"TFIDF (1,3)",MultinomialNB,0.725
4,Word2Vec avg,LogReg,0.634
5,"TFIDF (1,3)",LogReg (GridSearch best),0.718
6,"Count (1,3)",MultinomialNB (GridSearch best),0.7225
