In [5]:
# Import necessary libraries for data manipulation, visualization, and machine learning
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import math
import time

"""

pandas → Structured data (jaise CSV files) ko read, manipulate aur analyze karne ke liye use hoti hai. Mostly DataFrame objects mein kaam hota hai.

Example: pd.read_csv() se CSV file load karte hain.

numpy → Mathematical operations ke liye use hoti hai. Arrays, statistics, etc. handle karta hai.

Example: np.mean(), np.array()

matplotlib.pyplot → Graphs aur charts (jaise bar chart, pie chart) banane ke liye basic library.

seaborn → matplotlib ka advanced version hai. Stylish graphs banata hai, especially for DataFrames.

re → Regular Expressions: Text cleaning jaise unwanted characters nikaalne mein kaam aata hai.

string → English alphabets, punctuation marks jaise cheezon ka access deta hai.

math → Mathematical functions like square root, factorial, etc.

time → Time tracking ke liye useful hota hai (e.g., kisi function ko run karne mein kitna time laga).



"""

# Sklearn libraries for machine learning and text processing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

"""
CountVectorizer, TfidfVectorizer → Text ko numbers mein convert karne ke liye (Bag of Words ya TF-IDF model)

MultinomialNB → Naive Bayes classifier (mostly spam detection ya text classification ke liye)

LogisticRegression → Ek simple aur effective classifier model (binary/multi-class classification ke liye)

train_test_split → Dataset ko training aur testing mein split karne ke liye

metrics → Model ki performance evaluate karne ke liye (accuracy, precision, recall, etc.)



"""

# NLTK libraries for text processing (lemmatization, stemming, stopwords, POS tagging)
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer

"""
nltk → Natural language processing ke liye ek standard library hai

stopwords → Common words like "is", "the", "and" jo usually analysis mein kaam ke nahi hote

wordnet → Word meanings aur relationships ke liye

WordNetLemmatizer → Word ko uske original form mein le aata hai (e.g., running → run)

PorterStemmer → Similar to lemmatization, but thoda aggressive hota hai (e.g., jumping → jump)



"""

# Set up visualization
%matplotlib inline
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

"""
%matplotlib inline → Jupyter Notebook mein graphs ko same cell ke andar display karne ke liye

sns.set_style("whitegrid") → Seaborn ke graphs ka style set karna (white grid background)

plt.style.use("fivethirtyeight") → Matplotlib style jo readable aur clean visuals provide karta hai
"""

# Download necessary NLTK resources for text processing
nltk.download('wordnet')  # WordNet for lemmatization
nltk.download('omw-1.4')  # Open Multilingual Wordnet
nltk.download('punkt')  # Tokenizer
nltk.download('stopwords')  # Stopwords for text cleaning
nltk.download('averaged_perceptron_tagger')  # POS tagger for part-of-speech tagging
nltk.download('tagsets_json')  # Tagset resource


"""
wordnet → Lemmatizer ke liye dictionary

omw-1.4 → Word meanings ka multilingual version

punkt → Tokenizer (text ko words/sentences mein todta hai)

stopwords → Common non-useful words ki list

averaged_perceptron_tagger → POS (Part-of-Speech) tagging ke liye

tagsets_json → POS tag ke explanations ke liye

"""

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package tagsets_json to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets_json.zip.


'\nwordnet → Lemmatizer ke liye dictionary\n\nomw-1.4 → Word meanings ka multilingual version\n\npunkt → Tokenizer (text ko words/sentences mein todta hai)\n\nstopwords → Common non-useful words ki list\n\naveraged_perceptron_tagger → POS (Part-of-Speech) tagging ke liye\n\ntagsets_json → POS tag ke explanations ke liye\n\n'

In [6]:
from google.colab import files
uploaded = files.upload()

Saving en_train.csv to en_train.csv


In [7]:
df_train = pd.read_csv('en_train.csv')
print(df_train.head())

                                                text    binary  \
0  #USER# #USER# #USER# #USER# You expect a man t...  Not Hope   
1  #USER# #USER# #USER# #USER# Tinubu is actually...  Not Hope   
2  it'd be nice if missguided actually had stock ...      Hope   
3  #USER# Anyway love u bubbly i know i can count...      Hope   
4  “you have a lot of people rooting for you whet...  Not Hope   

         multiclass  
0          Not Hope  
1          Not Hope  
2           Sarcasm  
3  Generalized Hope  
4          Not Hope  


In [8]:

from google.colab import files
uploaded = files.upload()

Saving en_dev.csv to en_dev.csv


In [9]:
df_test = pd.read_csv('en_dev.csv')  # It will be in the same directory after upload

print(df_test.head(10))

                                                text    binary  \
0  And you’d come to the obvious logical solution...      Hope   
1  #USER# I’m hoping his type helps quiet our wor...      Hope   
2  I aspire to be his sugar mommy...I know chance...      Hope   
3  also that jacket he wore yesterday i���m gonna...      Hope   
4  How can they visually express themselves witho...  Not Hope   
5  Here's Dave Roberts: “This is probably as much...  Not Hope   
6  Brownie smoked his first joint and got laid, b...  Not Hope   
7  #USER# #USER# Sure it would, why wouldn't it? ...  Not Hope   
8  #USER# #USER# I make over $100k &amp; have no ...  Not Hope   
9  hey guys! i���m so happy to announce that i���...  Not Hope   

         multiclass  
0  Generalized Hope  
1    Realistic Hope  
2  Unrealistic Hope  
3           Sarcasm  
4           Sarcasm  
5          Not Hope  
6          Not Hope  
7          Not Hope  
8          Not Hope  
9           Sarcasm  


In [10]:
df_test.columns

Index(['text', 'binary', 'multiclass'], dtype='object')

In [11]:

df_test['text']

Unnamed: 0,text
0,And you’d come to the obvious logical solution...
1,#USER# I’m hoping his type helps quiet our wor...
2,I aspire to be his sugar mommy...I know chance...
3,also that jacket he wore yesterday i���m gonna...
4,How can they visually express themselves witho...
...,...
1897,#USER# #USER# #USER# #USER# #USER# #USER# #USE...
1898,"I wish I was white , then probably somebody wo..."
1899,#USER# #USER# #USER# #USER# banking system shi...
1900,The secret of health for both mind and body is...


In [13]:
import re
import string

def text_process(test):
    # 1. Lowercase
    text = text.lower()

    # 2. Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)

    # 3. Remove user mentions and hashtags
    text = re.sub(r'\@\w+|\#','', text)

    # 4. Remove numbers
    text = re.sub(r'\d+', '', text)

    # 5. Remove hexadecimal or addresses
    text = re.sub(r'0x[a-fA-F0-9]+', '', text)

    # 6. Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # 7. Remove extra spaces
    text = re.sub('\s+', ' ', text).strip()

    return text


In [14]:
df_train['clean_tweets'] = df_train['text'].apply(clean_text)
df_test['clean_tweets'] = df_test['text'].apply(clean_text)

In [15]:
df_train.head()

Unnamed: 0,text,binary,multiclass,clean_tweets
0,#USER# #USER# #USER# #USER# You expect a man t...,Not Hope,Not Hope,user user user user you expect a man that lite...
1,#USER# #USER# #USER# #USER# Tinubu is actually...,Not Hope,Not Hope,user user user user tinubu is actually a bonus...
2,it'd be nice if missguided actually had stock ...,Hope,Sarcasm,itd be nice if missguided actually had stock f...
3,#USER# Anyway love u bubbly i know i can count...,Hope,Generalized Hope,user anyway love u bubbly i know i can count o...
4,“you have a lot of people rooting for you whet...,Not Hope,Not Hope,“you have a lot of people rooting for you whet...


In [16]:
binary_mapping = {'Not Hope': 0, 'Hope': 1}


In [17]:
df_train= df_train[df_train['binary'].isin(binary_mapping.keys())].copy() # -> warning dany say bachata ha .copy() function
df_train


Unnamed: 0,text,binary,multiclass,clean_tweets
0,#USER# #USER# #USER# #USER# You expect a man t...,Not Hope,Not Hope,user user user user you expect a man that lite...
1,#USER# #USER# #USER# #USER# Tinubu is actually...,Not Hope,Not Hope,user user user user tinubu is actually a bonus...
2,it'd be nice if missguided actually had stock ...,Hope,Sarcasm,itd be nice if missguided actually had stock f...
3,#USER# Anyway love u bubbly i know i can count...,Hope,Generalized Hope,user anyway love u bubbly i know i can count o...
4,“you have a lot of people rooting for you whet...,Not Hope,Not Hope,“you have a lot of people rooting for you whet...
...,...,...,...,...
5228,naming my son onickolas in honor of my love fo...,Hope,Sarcasm,naming my son onickolas in honor of my love for
5229,The Hanging Chad of the CIA Crowley Bush Famil...,Not Hope,Not Hope,the hanging chad of the cia crowley bush famil...
5230,the entire history of human desire takes about...,Not Hope,Not Hope,the entire history of human desire takes about...
5231,#USER# #USER# Not true. I just donated to one ...,Not Hope,Not Hope,user user not true i just donated to one and d...


In [18]:
df_train['clean_tweets_new'] = df_train['binary'].map(binary_mapping)

In [19]:
df_train
df_test

Unnamed: 0,text,binary,multiclass,clean_tweets
0,And you’d come to the obvious logical solution...,Hope,Generalized Hope,and you’d come to the obvious logical solution...
1,#USER# I’m hoping his type helps quiet our wor...,Hope,Realistic Hope,user i’m hoping his type helps quiet our world...
2,I aspire to be his sugar mommy...I know chance...,Hope,Unrealistic Hope,i aspire to be his sugar mommyi know chance of...
3,also that jacket he wore yesterday i���m gonna...,Hope,Sarcasm,also that jacket he wore yesterday i���m gonna...
4,How can they visually express themselves witho...,Not Hope,Sarcasm,how can they visually express themselves witho...
...,...,...,...,...
1897,#USER# #USER# #USER# #USER# #USER# #USER# #USE...,Hope,Generalized Hope,user user user user user user user but thats t...
1898,"I wish I was white , then probably somebody wo...",Hope,Unrealistic Hope,i wish i was white then probably somebody woul...
1899,#USER# #USER# #USER# #USER# banking system shi...,Not Hope,Not Hope,user user user user banking system shiver bad ...
1900,The secret of health for both mind and body is...,Hope,Generalized Hope,the secret of health for both mind and body is...


In [20]:
train_vocab = set(' '.join(df_train['clean_tweets']).split())
train_vocab


{'cuba',
 'refuses',
 'scrubs',
 'bold',
 'recognized',
 'calculated',
 '🙌🧡',
 'arab',
 'setting',
 'flung',
 'safeguarding',
 'outright',
 'regulate',
 'delulu',
 'youu',
 'baby',
 'antigop',
 '🤗',
 'e',
 'cloth',
 'injuries',
 'californians',
 'spokane',
 'scrumptious',
 'untrue',
 'corpse',
 'jamie',
 'manufactured',
 'thrower',
 '🥳🥳',
 '“control',
 'thoughts',
 'pressuring',
 'anthology',
 'belonged',
 'drown',
 'hazel',
 'consultation',
 'solid',
 'contribute',
 'aggressiveness',
 'fearless',
 'maby',
 'murray',
 'asininity',
 'enjoyed',
 'apch',
 'academies',
 'lastinf',
 'outro',
 'weve',
 'shillbut',
 'fueled',
 'yetthis',
 'overreaching',
 'demanding',
 '“clarifying”',
 'anyones',
 'hopping',
 'rub',
 'tokens',
 'palate',
 'magic',
 'minceno',
 'bitcoin',
 'proteins',
 'governmentrelated',
 'whymaybe',
 'rejecting',
 'flow',
 'kaepernick',
 '���unwantable���',
 'forgotten',
 'agency',
 'dinosaur',
 'footballplayer',
 'allowances',
 'pain',
 'welders',
 'simultaneously',
 'spon

In [21]:
test_vocab = set(''.join(df_test['clean_tweets']).split())
test_vocab

{'scrubs',
 'steering',
 'bold',
 '😭😭😭fake',
 'setting',
 'arab',
 'outright',
 'headers',
 'michiganvpc',
 'brent',
 'ada',
 'baby',
 'e',
 'case”',
 'neurological',
 'chengdu',
 'sindhlocalgovtelections',
 'jamie',
 'urlhs',
 'thoughts',
 'clubbed',
 'solid',
 'contribute',
 'eventstrips',
 'wtic',
 'fearless',
 'expression',
 'murray',
 'enjoyed',
 '白越光',
 'weve',
 '不属光夜只属于你user',
 'demanding',
 'anyones',
 'rub',
 'rama',
 'checkedthen',
 'magic',
 '“aspire”',
 '🤣might',
 'rejecting',
 'flow',
 'eagerness',
 '—three',
 'agency',
 'tan',
 'transitiondemocrats',
 'pain',
 'cost',
 'networks',
 'suit',
 'earned',
 'sheep',
 'junk',
 'topple',
 'usfs',
 'tazwellbrown',
 'babies',
 'seamlessly',
 'forever',
 'tooto',
 'crossing',
 'daigwok',
 'cheapexpensive',
 'skins',
 'dec',
 'asaww',
 'disasteruser',
 'debates',
 'lane',
 'chivalry',
 'rawalpindi',
 'filmthe',
 'homophobic',
 'amounts',
 'intentionally',
 'profile',
 'journey',
 'urlstop',
 'urleven',
 'perpetual',
 'asslamo',
 'ele

In [22]:
print("*** Vocabulary statistics ***")
print(f"Train unique words : {len(train_vocab):,}")
print(f"Test  unique words : {len(test_vocab):,}")
print(f"Overlap             : {len(train_vocab & test_vocab):,}")  # common words
print(f"Test OOV            : {len(test_vocab - train_vocab):,}")  # out-of-vocab

*** Vocabulary statistics ***
Train unique words : 18,176
Test  unique words : 10,785
Overlap             : 5,723
Test OOV            : 5,062


In [23]:
print("Sample train words :", sorted(list(train_vocab))[:25], "...\n")
print("Sample test  words :", sorted(list(test_vocab))[:25], "...\n")

Sample train words : ['a', 'aa', 'aaa', 'aaaa', 'aaaaaaaa', 'aaaaah', 'aahhhhh', 'aami', 'aaron', 'aas', 'aau', 'aave', 'ab', 'abamfuni😂😂😂😂😂', 'abandon', 'abandoned', 'abberations', 'abbottcruzmassacre', 'abbott’s', 'abbreviation', 'abby', 'abcnl', 'abdi’s', 'aberration', 'abetting'] ...

Sample test  words : ['a', 'aaaa', 'aakhir', 'aal', 'aameen', 'aameenshe', 'aave', 'ab', 'abandon', 'abandoned', 'abbotts', 'abdirahman', 'abdul', 'abeg', 'abekministriesinternationaluser', 'abi', 'ability', 'able', 'abomination', 'abominationsmy', 'abominationsthis', 'aborted', 'aborting', 'abortion', 'abortioni'] ...



In [24]:
x = df_train['clean_tweets']
y = df_train['binary']

In [25]:
x

Unnamed: 0,clean_tweets
0,user user user user you expect a man that lite...
1,user user user user tinubu is actually a bonus...
2,itd be nice if missguided actually had stock f...
3,user anyway love u bubbly i know i can count o...
4,“you have a lot of people rooting for you whet...
...,...
5228,naming my son onickolas in honor of my love for
5229,the hanging chad of the cia crowley bush famil...
5230,the entire history of human desire takes about...
5231,user user not true i just donated to one and d...


In [26]:
y
# -> if the train column and lable are not euqual then occur the error


Unnamed: 0,binary
0,Not Hope
1,Not Hope
2,Hope
3,Hope
4,Not Hope
...,...
5228,Hope
5229,Not Hope
5230,Not Hope
5231,Not Hope


In [27]:
print("X shape:", x.shape)
print("Y shape:", y.shape)

X shape: (5233,)
Y shape: (5233,)


In [28]:
from sklearn.model_selection import train_test_split

"""
train_test_split  ->   Ye function aap ke data ko 2 parts mein divide karta hai:
Training data (model ko train karne ke liye)
Validation data (model ko test karne ke liye)
"""

x_train, x_test, y_train, y_test = train_test_split(

  x, y, test_size=0.2, random_state=42, stratify=y

)

In [29]:
print("X shape:", x.shape)
print("Y shape:", y.shape)

X shape: (5233,)
Y shape: (5233,)


In [30]:
print("X shape:", x_train.shape)
print("X shape:", y_train.shape)
print("Y shape:", x_test.shape)
print("Y shape:", y_test.shape)

X shape: (4186,)
X shape: (4186,)
Y shape: (1047,)
Y shape: (1047,)


In [31]:

# Machine learning training and testing

# 1. Countvectorization



In [39]:
vectorizer = CountVectorizer(binary=True)

"""
Yahan hum CountVectorizer bana rahay hain —
Ye Bag of Words (BoW) technique ka tool hai jo har word ko 0 ya 1 mein convert karta hai:

binary=True ka matlab:
Agar word present hai → 1
Agar word nahi hai → 0
(Frequency count nahi karta, bas presence check karta hai)

"""

x_train_dtm = vectorizer.fit_transform(x_train)

"""

fit_transform function sabse pehle unique words dhoondta hai.
X_train = [
    "I love AI",
    "AI is powerful",
    "I love machine learning"
]
['ai', 'is', 'learning', 'love', 'machine', 'powerful']

🔹 Step 2: Har sentence ko vector mein convert karna
Ab har sentence ke liye ye check karta hai ke kaun se words usmein hain, aur unka count kitna hai.

['ai' 'is' 'learning' 'love' 'machine' 'powerful']

[[1 0 0 1 0 0]    # "I love AI"
 [1 1 0 0 0 1]    # "AI is powerful"
 [0 0 1 1 1 0]]   # "I love machine learning"


"""
x_test_dtm = vectorizer.transform(x_test)
x_test_dtm.shape

(1047, 14626)

In [35]:
x_test_dtm = vectorizer.fit_transform(x_test)
x_test_dtm.shape

(1047, 6409)

In [37]:
x_test_dtm = vectorizer.transform(x_test)
x_test_dtm.shape


(1047, 6409)

In [38]:
x_train_dtm.shape

#4186	Total number of training examples (i.e., tweets ya sentences)
#14626 total number of unique words / vocablory


(4186, 14626)

In [None]:
print("\nVectorizer vocabulary size:", len(vectorizer.get_feature_names_out()))
print("Sample features:", vectorizer.get_feature_names_out()[:20])


Vectorizer vocabulary size: 14626
Sample features: ['aa' 'aaa' 'aaaa' 'aaaaah' 'aahhhhh' 'aami' 'aaron' 'aas' 'aau' 'aave'
 'abamfuni' 'abandon' 'abandoned' 'abberations' 'abbott'
 'abbottcruzmassacre' 'abbreviation' 'abby' 'abcnl' 'abdi']


In [42]:
start_time = time.time()
nb = MultinomialNB()
nb.fit(x_train_dtm, y_train)
end_time = time.time()

"""

start_time aur end_time → model training ka time measure kar rahe hain

MultinomialNB() → yeh Naive Bayes classifier hai, jo mostly text classification mein use hota hai

.fit(X_train_dtm, y_train) → model ko train karo using training tweets (vector form) and labels



"""

'\n\nstart_time aur end_time → model training ka time measure kar rahe hain\n\nMultinomialNB() → yeh Naive Bayes classifier hai, jo mostly text classification mein use hota hai\n\n.fit(X_train_dtm, y_train) → model ko train karo using training tweets (vector form) and labels\n\n\n\n'

In [43]:
print("\nTraining Time: {:.3f} seconds".format(end_time - start_time))



Training Time: 0.021 seconds


In [44]:
y_pred_class = nb.predict(x_test_dtm)
print ("accuracy score")
y_pred_class

accuracy score


array(['Hope', 'Hope', 'Not Hope', ..., 'Not Hope', 'Not Hope', 'Hope'],
      dtype='<U8')

In [45]:
y_test

Unnamed: 0,binary
1145,Not Hope
4228,Not Hope
2894,Not Hope
3570,Not Hope
4091,Hope
...,...
2358,Not Hope
3684,Not Hope
2959,Not Hope
2196,Not Hope


In [49]:

print( "======= confusion matrix ======="

)

acc = metrics.accuracy_score(y_test, y_pred_class)

# Step 3: Print result
print("Accuracy Score:", acc)
print(f"MultinomialNB Accuracy: {acc * 100:.2f}%")

Accuracy Score: 0.766953199617956
MultinomialNB Accuracy: 76.70%


In [53]:
print( "======= classification reports =======")

print(metrics.classification_report(y_test, y_pred_class ,target_names=['Hope' , 'Note Hope']))

              precision    recall  f1-score   support

        Hope       0.75      0.74      0.75       485
   Note Hope       0.78      0.79      0.78       562

    accuracy                           0.77      1047
   macro avg       0.77      0.77      0.77      1047
weighted avg       0.77      0.77      0.77      1047

