In [58]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data gathering

In [59]:
# Load the training file as a ';'-separated table without a header row and
# name the two columns explicitly.
url = (
    "https://raw.githubusercontent.com/003shrey/"
    "NLP--Sentimental-Analysis/refs/heads/main/train.txt"
)
df = pd.read_csv(url, sep=";", header=None, names=["text", "emotions"])

In [60]:
# Preview the first rows to confirm both columns were parsed.
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


# Data cleaning

In [61]:
# Count missing values in each column.
df.isnull().sum()

Unnamed: 0,0
text,0
emotions,0


In [62]:
# Give every emotion label an integer code in order of first appearance
# (Series.unique() preserves that order), then replace the labels by their codes.
# emo_num is the only record of which code means which emotion, so keep it
# around for decoding predictions.
# NOTE: this cell overwrites df['emotions']; re-running it without reloading df
# rebuilds emo_num from the integer codes instead of the label names.
uniemo = df['emotions'].unique()
emo_num = {emo: code for code, emo in enumerate(uniemo)}
df['emotions'] = df['emotions'].map(emo_num)

In [63]:
# Confirm the emotions column now holds integer codes.
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


Converting text to lowercase

In [64]:
# Lowercase with the vectorized .str accessor instead of a per-row lambda.
df['text'] = df['text'].str.lower()

removing punctuation

In [65]:
import string

# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call to remove_punc.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(txt):
  """Return ``txt`` with all ASCII punctuation characters removed.

  Only the characters listed in ``string.punctuation`` are dropped; every
  other character (letters, digits, whitespace, non-ASCII symbols) is kept.
  """
  return txt.translate(_PUNCT_TABLE)

# Strip punctuation from every row of the text column.
df['text'] = df['text'].apply(remove_punc)


remove numbers

In [66]:

def remove_numbers(txt):
    """Return ``txt`` with every digit character removed.

    A character is dropped when ``str.isdigit()`` is true for it, which
    covers ASCII ``0``-``9`` as well as other Unicode digits such as
    superscripts. All remaining characters keep their original order.
    """
    # str.join builds the result in one pass instead of re-allocating the
    # string on every kept character.
    return "".join(ch for ch in txt if not ch.isdigit())

# Drop digit characters from every row of the text column.
df['text'] = df['text'].apply(remove_numbers)


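Remove emojis and other special characters

Emojis are encoded as non-ASCII Unicode characters, so keeping only ASCII characters strips them out (along with any other non-ASCII symbols, such as accented letters).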

In [67]:

def remove_emojis(txt):
    """Return ``txt`` with every non-ASCII character removed.

    Emojis are dropped because they are non-ASCII, but so is any other
    non-ASCII character (accented letters, typographic quotes, ...).
    """
    # str.join builds the result in one pass instead of re-allocating the
    # string on every kept character.
    return "".join(ch for ch in txt if ch.isascii())

# Keep only ASCII characters in every row of the text column.
df['text'] = df['text'].apply(remove_emojis)

# Remove stopwords

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [93]:
# Fetch the NLTK resources used below: the stopword lists (stopwords.words)
# and the Punkt tokenizer data (word_tokenize).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [94]:
# English stopwords as a set, used for the per-token membership test in remove().
# NOTE(review): the NLTK English list includes negations such as "not" --
# confirm that dropping them is acceptable for emotion classification.
stop_words = set(stopwords.words('english'))
len(stop_words)


198

In [95]:
# Look at one sample text (row label 1) before stopword removal.
df.loc[1, 'text']

'go feeling hopeless damned hopeful around someone cares awake'

Tokenize the text
`str.split()` is not used because it splits only on whitespace and would leave punctuation attached to the neighbouring words; `word_tokenize` separates punctuation into its own tokens.


In [96]:
def remove(text):
  """Return ``text`` with stopwords dropped.

  The text is split into tokens with ``word_tokenize``; tokens found in the
  module-level ``stop_words`` set are discarded and the remaining tokens are
  joined back together with single spaces.
  """
  kept = [token for token in word_tokenize(text) if token not in stop_words]
  return " ".join(kept)

In [97]:
# Remove stopwords from every row of the text column.
df['text']= df['text'].apply(remove)

In [99]:
# Look at the same sample text (row label 1) after stopword removal.
df.loc[1, 'text']

'go feeling hopeless damned hopeful around someone cares awake'

#Feature extraction/ Vectorization

**What is max_features in CountVectorizer?**

max_features caps the vocabulary size: only the max_features terms with the highest frequency across the corpus are kept when converting text into numerical vectors.

👉 In simple terms:
It limits vocabulary size to the top N words.

**[Bag of words]-**
*Bag of Words is a text representation technique in NLP where text is converted into numerical features based on word frequency, while ignoring grammar, order, and context.*

In [107]:
from sklearn.feature_extraction.text import CountVectorizer
#example
# Toy corpus: four short sentences that share most of their words.
document = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
# Learn the vocabulary from the corpus and count each word per sentence.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(document)
# Vocabulary: one entry per column of the count matrix.
print(vectorizer.get_feature_names_out())
# Dense view of the counts: one row per sentence, one column per vocabulary word.
print(X.toarray())


['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


# Bag of words and TF-IDF implementation

In [109]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
# NOTE(review): the split is not stratified -- if the emotion classes are
# imbalanced, consider stratify=df['emotions'].
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['emotions'], test_size=0.20, random_state=42)


# Naive Bayes and logistic regression
**Here we use Multinomial Naive Bayes.
Multinomial Naive Bayes is a variant of the
Naive Bayes classifier specifically suited for
classification tasks where the features or input
data are discrete such as word counts or
frequencies in text classification.**

In [120]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Bag-of-words features: the vocabulary is learned on the training texts only
# and then reused to encode the test texts.
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# Multinomial Naive Bayes baseline on the raw word counts.
nb_model = MultinomialNB()
nb_model.fit(X_train_bow, y_train)

# Test-set accuracy of the bag-of-words baseline.
pred_bow = nb_model.predict(X_test_bow)
bow_accuracy = accuracy_score(y_test, pred_bow)
print(bow_accuracy)


0.7678125


#TF-IDF

In [123]:
# TF-IDF features with default settings: fit on the training texts only, then
# reuse the fitted vectorizer to encode the test texts.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [124]:
# Second Naive Bayes model, trained on the TF-IDF features instead of raw counts.
nb2_model = MultinomialNB()
nb2_model.fit(X_train_tfidf,y_train)


In [125]:
# Test-set accuracy of Naive Bayes on TF-IDF features.
y_pred = nb2_model.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred))


0.6609375


# Logistic regression

In [127]:
from sklearn.linear_model import LogisticRegression

In [138]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Tuned TF-IDF features for logistic regression.
# NOTE: this rebinds tfidf_vectorizer, X_train_tfidf and X_test_tfidf, replacing
# the default-settings versions created for the Naive Bayes model above.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),   # unigram + bigram
    max_df=0.95,          # ignore terms found in more than 95% of documents
    min_df=5,             # ignore terms found in fewer than 5 documents
    sublinear_tf=True     # use 1 + log(tf) instead of the raw term frequency
)

# Fit on the training texts only; the test texts are encoded with the same vocabulary.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [139]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Hyper-parameter grid for LogisticRegression: four values of C crossed with
# unweighted vs. 'balanced' class weights (8 combinations).
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'class_weight': [None, 'balanced']
}


In [None]:
# 5-fold cross-validated grid search over param_grid, selecting by macro-averaged F1.
# NOTE(review): X_train_tfidf was produced by a vectorizer fit on the whole
# training set, so each validation fold shares vocabulary/IDF statistics with
# its training folds. Wrapping the vectorizer and classifier in a Pipeline
# would refit the vectorizer inside every fold.
grid = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1  # use all available CPU cores
)

grid.fit(X_train_tfidf, y_train)


In [None]:
# Keep the estimator from the best-scoring parameter combination and show
# which parameters won.
best_model = grid.best_estimator_
print("Best Params:", grid.best_params_)

In [145]:
# Evaluate the tuned logistic regression on the held-out test set.
# Only accuracy is reported here, although the grid search selected on macro F1.
y_pred_best = best_model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred_best))


Accuracy: 0.901875


In [146]:
import pickle

# Persist the fitted vectorizer and the tuned classifier for later inference.
# New texts must go through the same cleaning steps as the training data before
# being passed to the vectorizer.
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf_vectorizer, f)

with open("sentiment_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

# The model predicts the integer codes assigned in emo_num; save that mapping
# (emotion name -> code) as well so predictions can be decoded back to emotion
# names after the notebook session is gone.
with open("label_mapping.pkl", "wb") as f:
    pickle.dump(emo_num, f)

print("Model and vectorizer saved successfully")

Model and vectorizer saved successfully
