In [50]:
# Cell 1: Imports + NLTK



import pandas as pd
import numpy as np
import re, string, time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# ---- Alternative Classifier: Logistic Regression ----
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time

# ---- Alternative Classifier: SVC ----
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time

# ---- Alternative Classifier: K-Nearest Neighbors ----
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time

# ---- Alternative Classifier: Neural Network (MLP) ----
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time

# Optional (sirf agar evaluation chahiye ho to)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK data (run once)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

print("✅ Libraries ready")


✅ Libraries ready


In [51]:
# from google.colab import drive
# drive.mount('/content/drive')

In [52]:
# Cell 2: Load train and test data

# OPTION A: If you’re in Google Colab, uncomment to upload files manually:
# from google.colab import files
# files.upload()   # upload en_train.csv and Comments_Text.xlsx

# OPTION B: If files are already in the runtime / working directory, just read:
df_train = pd.read_csv('en_train.csv')              # expects columns: text, binary
df_test  = pd.read_excel('Comments_Text.xlsx')      # expects column: comments_text

print("Train columns:", df_train.columns.tolist())
print("Test columns :", df_test.columns.tolist())

# # Basic sanity
# assert 'text' in df_train.columns, "df_train must have a 'text' column"
# assert 'binary' in df_train.columns, "df_train must have a 'binary' column (Hope/Not Hope)"
# assert 'comments_text' in df_test.columns, "df_test must have 'comments_text' column"

# # Drop NaNs for safety
# df_train['text'] = df_train['text'].fillna('')
# df_train['binary'] = df_train['binary'].fillna('')
# df_test['comments_text'] = df_test['comments_text'].fillna('')

print("✅ Data loaded")
df_train.head()
df_test.head()

Train columns: ['text', 'binary', 'multiclass']
Test columns : ['comments_text']
✅ Data loaded


Unnamed: 0,comments_text
0,"So, when you ask what the two illnesses are......"
1,"In addition, people with BPD quite often have ..."
2,"Borderline Personality Disorder, like all othe..."
3,LONG ANSWER: Bipolar disease is caused by a ch...
4,Well think of bipolar as a rollar coaster you ...


In [53]:
df_train.head()

Unnamed: 0,text,binary,multiclass
0,#USER# #USER# #USER# #USER# You expect a man t...,Not Hope,Not Hope
1,#USER# #USER# #USER# #USER# Tinubu is actually...,Not Hope,Not Hope
2,it'd be nice if missguided actually had stock ...,Hope,Sarcasm
3,#USER# Anyway love u bubbly i know i can count...,Hope,Generalized Hope
4,“you have a lot of people rooting for you whet...,Not Hope,Not Hope


In [54]:
# Cell 3: Preprocessing
def pre_process(tweet):
    if not isinstance(tweet, str):
        return ""

    tweet = tweet.lower()

    STOPWORDS = stopwords.words("english") + [
        'u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure'  # Social media specific
    ]
    # Remove extra whitespace

    nopunc = ''.join([char for char in tweet if char not in string.punctuation])  # Changed 'text' to 'tweet'
    words = [word for word in nopunc.split() if word.lower() not in STOPWORDS]  # Changed 'STOPWORD' to 'STOPWORDS' and 'word' to 'words'

      # Optional: Lemmatization (uncomment if you want to use it)
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]


    return ' '.join(words)  # Changed 'word' to 'words'

# Apply
df_train['clean_tweets'] = df_train['text'].astype(str).apply(pre_process)
df_test['clean_tweets']  = df_test['comments_text'].astype(str).apply(pre_process)

print("✅ Preprocessing done")
df_train.head()
#df_train[['text','clean_tweets','binary']].head()


✅ Preprocessing done


Unnamed: 0,text,binary,multiclass,clean_tweets
0,#USER# #USER# #USER# #USER# You expect a man t...,Not Hope,Not Hope,user user user user expect man literally refer...
1,#USER# #USER# #USER# #USER# Tinubu is actually...,Not Hope,Not Hope,user user user user tinubu actually bonus lols...
2,it'd be nice if missguided actually had stock ...,Hope,Sarcasm,itd nice missguided actually stock once�������...
3,#USER# Anyway love u bubbly i know i can count...,Hope,Generalized Hope,user anyway love bubbly know count fairy tail😌🤚
4,“you have a lot of people rooting for you whet...,Not Hope,Not Hope,“you lot people rooting whether believe should...


In [55]:
df_test.head()

Unnamed: 0,comments_text,clean_tweets
0,"So, when you ask what the two illnesses are......",ask two illness aretheyre similar tend moodine...
1,"In addition, people with BPD quite often have ...",addition people bpd quite often weak self conc...
2,"Borderline Personality Disorder, like all othe...",borderline personality disorder like personali...
3,LONG ANSWER: Bipolar disease is caused by a ch...,long answer bipolar disease caused chemical im...
4,Well think of bipolar as a rollar coaster you ...,well think bipolar rollar coaster high low pea...


In [56]:
# Cell 4: Labels mapping — ensure only expected classes

binary_mapping = {"Hope": 1, "Not Hope": 0}
df_train = df_train[df_train['binary'].isin(binary_mapping .keys())].copy()
df_train['label'] = df_train['binary'].map(binary_mapping )

X_train_text = df_train['clean_tweets']
y_train      = df_train['label']  # numeric 0/1

print("Class counts:", df_train['label'].value_counts().to_dict())
print("✅ Labels ready")



Class counts: {0: 2807, 1: 2426}
✅ Labels ready


In [57]:
# Cell 5: Vectorize + Train
# MultinomialNB works best with counts (binary=False). If you want binary features, use BernoulliNB.
vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,2), min_df=2, max_df=0.95) # keep default counts
X_train = vectorizer.fit_transform(X_train_text)

start = time.time()
nb = MultinomialNB()
nb.fit(X_train, y_train)
print("⏱ Training Time: {:.3f} sec".format(time.time() - start))
print("✅ Model trained")



⏱ Training Time: 0.003 sec
✅ Model trained


In [58]:
# Cell 6: Predict on df_test (unlabeled)

X_test = vectorizer.transform(df_test['clean_tweets'])
pred_ids = nb.predict(X_test)

id2label = {0: 'Not Hope', 1: 'Hope'}
df_test['PredLabel'] = pd.Series(pred_ids).map(id2label ).fillna(pd.Series(pred_ids))

print("✅ Predictions ready")
df_test[['comments_text','PredLabel',]].head(10)


✅ Predictions ready


Unnamed: 0,comments_text,PredLabel
0,"So, when you ask what the two illnesses are......",Not Hope
1,"In addition, people with BPD quite often have ...",Not Hope
2,"Borderline Personality Disorder, like all othe...",Not Hope
3,LONG ANSWER: Bipolar disease is caused by a ch...,Not Hope
4,Well think of bipolar as a rollar coaster you ...,Not Hope
5,"Believe me, I show you much more than anyone. ...",Hope
6,"Yes, I feel the exact same way. My family avoi...",Not Hope
7,I was hospitalized in May for the same reason....,Not Hope
8,I feel like that and im hearing you loud okay ...,Hope
9,I had 9 rounds over 3.5 weeks two years ago. I...,Hope


In [59]:
# Cell 7 (Optional): Quick internal validation from training set (holdout)
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

nb_val = MultinomialNB()
nb_val.fit(X_tr, y_tr)
val_pred = nb_val.predict(X_val)

print("Validation accuracy:", accuracy_score(y_val, val_pred))
print(classification_report(y_val, val_pred, target_names=['Not Hope','Hope']))


Validation accuracy: 0.7640878701050621
              precision    recall  f1-score   support

    Not Hope       0.75      0.85      0.79       562
        Hope       0.79      0.67      0.72       485

    accuracy                           0.76      1047
   macro avg       0.77      0.76      0.76      1047
weighted avg       0.77      0.76      0.76      1047



In [60]:
# (A) Train SVC on full training data for test predictions
start = time.time()
svc = LinearSVC(max_iter=5000)  # LinearSVC TF-IDF data pe best hota hai
svc.fit(X_train, y_train)
print("⏱ SVC Training Time: {:.3f} sec".format(time.time() - start))
print("✅ SVC model trained")


⏱ SVC Training Time: 0.031 sec
✅ SVC model trained


In [61]:
# Predict for df_test (unlabeled), parallel to your NB flow
X_test = vectorizer.transform(df_test['clean_tweets'])
pred_ids_svc = svc.predict(X_test)

id2label = {0: 'Not Hope', 1: 'Hope'}
df_test['PredLabel_SVC'] = pd.Series(pred_ids_svc).map(id2label).fillna(pd.Series(pred_ids_svc))

print("✅ SVC Predictions ready")
print(df_test[['comments_text','PredLabel_SVC']].head(10))



✅ SVC Predictions ready
                                       comments_text PredLabel_SVC
0  So, when you ask what the two illnesses are......      Not Hope
1  In addition, people with BPD quite often have ...      Not Hope
2  Borderline Personality Disorder, like all othe...      Not Hope
3  LONG ANSWER: Bipolar disease is caused by a ch...      Not Hope
4  Well think of bipolar as a rollar coaster you ...      Not Hope
5  Believe me, I show you much more than anyone. ...          Hope
6  Yes, I feel the exact same way. My family avoi...      Not Hope
7  I was hospitalized in May for the same reason....      Not Hope
8  I feel like that and im hearing you loud okay ...          Hope
9  I had 9 rounds over 3.5 weeks two years ago. I...          Hope


In [62]:
# (B) Internal validation (same style as your Cell 7)
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

svc_val = LinearSVC(max_iter=5000)
svc_val.fit(X_tr, y_tr)
val_pred_svc = svc_val.predict(X_val)

print("\n🎯 Validation accuracy (SVC):", accuracy_score(y_val, val_pred_svc))
print("📊 Classification Report (SVC)")
print(classification_report(y_val, val_pred_svc, target_names=['Not Hope','Hope']))

print("🧩 Confusion Matrix (SVC)")
print(confusion_matrix(y_val, val_pred_svc))


🎯 Validation accuracy (SVC): 0.775549188156638
📊 Classification Report (SVC)
              precision    recall  f1-score   support

    Not Hope       0.78      0.81      0.79       562
        Hope       0.77      0.74      0.75       485

    accuracy                           0.78      1047
   macro avg       0.77      0.77      0.77      1047
weighted avg       0.78      0.78      0.78      1047

🧩 Confusion Matrix (SVC)
[[453 109]
 [126 359]]


In [63]:
# (A) Train KNN on full training data for test predictions
start = time.time()
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)  # default k=5
knn.fit(X_train, y_train)
print("⏱ KNN Training Time: {:.3f} sec".format(time.time() - start))
print("✅ KNN model trained")


⏱ KNN Training Time: 0.003 sec
✅ KNN model trained


In [64]:
# Predict for df_test (unlabeled), parallel to your NB flow
X_test = vectorizer.transform(df_test['clean_tweets'])
pred_ids_knn = knn.predict(X_test)

id2label = {0: 'Not Hope', 1: 'Hope'}
df_test['PredLabel_KNN'] = pd.Series(pred_ids_knn).map(id2label).fillna(pd.Series(pred_ids_knn))

print("✅ KNN Predictions ready")
print(df_test[['comments_text','PredLabel_KNN']].head(10))


✅ KNN Predictions ready
                                       comments_text PredLabel_KNN
0  So, when you ask what the two illnesses are......      Not Hope
1  In addition, people with BPD quite often have ...      Not Hope
2  Borderline Personality Disorder, like all othe...      Not Hope
3  LONG ANSWER: Bipolar disease is caused by a ch...      Not Hope
4  Well think of bipolar as a rollar coaster you ...      Not Hope
5  Believe me, I show you much more than anyone. ...          Hope
6  Yes, I feel the exact same way. My family avoi...      Not Hope
7  I was hospitalized in May for the same reason....          Hope
8  I feel like that and im hearing you loud okay ...      Not Hope
9  I had 9 rounds over 3.5 weeks two years ago. I...          Hope


In [65]:
# (B) Internal validation (same style as your Cell 7)
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

knn_val = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
knn_val.fit(X_tr, y_tr)
val_pred_knn = knn_val.predict(X_val)

print("\n🎯 Validation accuracy (KNN):", accuracy_score(y_val, val_pred_knn))
print("📊 Classification Report (KNN)")
print(classification_report(y_val, val_pred_knn, target_names=['Not Hope','Hope']))

print("🧩 Confusion Matrix (KNN)")
print(confusion_matrix(y_val, val_pred_knn))


🎯 Validation accuracy (KNN): 0.6370582617000955
📊 Classification Report (KNN)
              precision    recall  f1-score   support

    Not Hope       0.65      0.70      0.67       562
        Hope       0.62      0.56      0.59       485

    accuracy                           0.64      1047
   macro avg       0.63      0.63      0.63      1047
weighted avg       0.64      0.64      0.64      1047

🧩 Confusion Matrix (KNN)
[[394 168]
 [212 273]]


In [66]:
start = time.time()
lr = LogisticRegression(solver='saga', max_iter=5000)  # 'saga' works well for sparse TF-IDF
lr.fit(X_train, y_train)
print("⏱ LR Training Time: {:.3f} sec".format(time.time() - start))
print("✅ Logistic Regression model trained")


⏱ LR Training Time: 0.080 sec
✅ Logistic Regression model trained


In [67]:
# Predict for df_test (unlabeled), parallel to your NB flow
X_test = vectorizer.transform(df_test['clean_tweets'])
pred_ids_lr = lr.predict(X_test)

id2label = {0: 'Not Hope', 1: 'Hope'}
df_test['PredLabel_LR'] = pd.Series(pred_ids_lr).map(id2label).fillna(pd.Series(pred_ids_lr))

print("✅ LR Predictions ready")
print(df_test[['comments_text','PredLabel_LR']].head(10))


✅ LR Predictions ready
                                       comments_text PredLabel_LR
0  So, when you ask what the two illnesses are......     Not Hope
1  In addition, people with BPD quite often have ...     Not Hope
2  Borderline Personality Disorder, like all othe...     Not Hope
3  LONG ANSWER: Bipolar disease is caused by a ch...     Not Hope
4  Well think of bipolar as a rollar coaster you ...     Not Hope
5  Believe me, I show you much more than anyone. ...         Hope
6  Yes, I feel the exact same way. My family avoi...     Not Hope
7  I was hospitalized in May for the same reason....     Not Hope
8  I feel like that and im hearing you loud okay ...         Hope
9  I had 9 rounds over 3.5 weeks two years ago. I...         Hope


In [68]:
# (B) Internal validation (same style as your Cell 7)
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

lr_val = LogisticRegression(solver='saga', max_iter=5000)
lr_val.fit(X_tr, y_tr)
val_pred_lr = lr_val.predict(X_val)

print("\n🎯 Validation accuracy (Logistic Regression):", accuracy_score(y_val, val_pred_lr))
print("📊 Classification Report (LR)")
print(classification_report(y_val, val_pred_lr, target_names=['Not Hope','Hope']))

print("🧩 Confusion Matrix (LR)")
print(confusion_matrix(y_val, val_pred_lr))


🎯 Validation accuracy (Logistic Regression): 0.7812798471824259
📊 Classification Report (LR)
              precision    recall  f1-score   support

    Not Hope       0.77      0.84      0.80       562
        Hope       0.79      0.72      0.75       485

    accuracy                           0.78      1047
   macro avg       0.78      0.78      0.78      1047
weighted avg       0.78      0.78      0.78      1047

🧩 Confusion Matrix (LR)
[[470  92]
 [137 348]]


In [69]:
# Cell 8: Save outputs
out_cols = ['comments_text', 'clean_tweets',  'PredLabel']
# if 'Confidence' in df_test.columns:
#     out_cols.append('Confidence')

df_test[out_cols].to_excel('Comments_Text_with_Predictions.xlsx', index=False)
df_test[out_cols].to_csv('Comments_Text_with_Predictions.csv', index=False, encoding='utf-8-sig')

print("💾 Saved:")
print(" - Comments_Text_with_Predictions.xlsx")
print(" - Comments_Text_with_Predictions.csv")


💾 Saved:
 - Comments_Text_with_Predictions.xlsx
 - Comments_Text_with_Predictions.csv


In [71]:
from google.colab import files

# Download Excel
# files.download('Comments_Text_with_Predictions.xlsx')

# Or download CSV
files.download('Comments_Text_with_Predictions.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>