In [63]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [64]:
df = pd.read_csv('train.txt',sep = ';',header = None,names = ['text','emotion'])

In [65]:
df.head()
df.shape

(16720, 2)

In [66]:
import pandas as pd

df = df.rename(columns={'emotion': 'label'})

In [67]:
df.head()


Unnamed: 0,text,label
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,I absolutely hate everything about this situation,anger
3,I hate it when people are deliberately ignorant,anger
4,This is my most hated chore of all time,anger


In [68]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [69]:
unique_emotions=df['label'].unique()

In [70]:
unique_emotions

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

# Label encoding

In [71]:
emotion_numbers={}
i=0
for emo in unique_emotions:
    emotion_numbers[emo]=i
    i+=1
df['label']=df['label'].map(emotion_numbers)

In [72]:
df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,I absolutely hate everything about this situation,1
3,I hate it when people are deliberately ignorant,1
4,This is my most hated chore of all time,1


# lower case

In [73]:
df['text']=df['text'].apply(lambda x:x.lower())

# Remove Punctuation

In [74]:
import string
def remove_punc(txt):
    return txt.translate(str.maketrans('','',string.punctuation))
df['text']=df['text'].apply(remove_punc)

In [75]:
df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,i absolutely hate everything about this situation,1
3,i hate it when people are deliberately ignorant,1
4,this is my most hated chore of all time,1


# Removing Numbers

In [76]:
def remove_numbers(txt):
    new=''
    for i in txt:
      if not i.isdigit():
          new=new+i
    return new
df['text']=df['text'].apply(remove_numbers)

In [77]:
df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,i absolutely hate everything about this situation,1
3,i hate it when people are deliberately ignorant,1
4,this is my most hated chore of all time,1


In [78]:
def remove_emojis(txt):
    new = '' 
    for i in txt: 
        if i.isascii(): 
            new = new + i 
    
    return new

df['text'] = df['text'].apply(remove_emojis) 

In [79]:
df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,i absolutely hate everything about this situation,1
3,i hate it when people are deliberately ignorant,1
4,this is my most hated chore of all time,1


In [80]:
import nltk

In [81]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [82]:
nltk.download("punkt")
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arade\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\arade\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [83]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arade\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [84]:
stop_words=set(stopwords.words('english'))


# Removing Stopwords

In [85]:
def remove(txt):
    words=word_tokenize(txt)
    cleaned=[]
    for i in words:
        if not i in stop_words:
            cleaned.append(i)
    return ' '.join(cleaned)
df['text']=df['text'].apply(remove)

In [86]:
df.head()

Unnamed: 0,text,label
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,absolutely hate everything situation,1
3,hate people deliberately ignorant,1
4,hated chore time,1


In [87]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.33, random_state=42)

In [88]:
bow_vectorizer=CountVectorizer()

In [89]:
X_train_bow=bow_vectorizer.fit_transform(X_train)
X_test_bow=bow_vectorizer.transform(X_test)

In [90]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [91]:
nb_model=MultinomialNB()


In [92]:
nb_model.fit(X_train_bow,y_train)

In [93]:
pred_bow=nb_model.predict(X_test_bow)
print(accuracy_score(y_test,pred_bow))

0.7627763682493657


In [94]:


tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  
    max_features=10000,
    min_df=2,
    max_df=0.95,
    stop_words='english'
)

X_train_tfidf=tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf=tfidf_vectorizer.transform(X_test)
nb2_model=MultinomialNB()
nb2_model.fit(X_train_tfidf,y_train)
pred_tfidf=nb2_model.predict(X_test_tfidf)
print(accuracy_score(y_test,pred_tfidf))

0.7426603841971728


In [95]:
from sklearn.linear_model import LogisticRegression

In [96]:
from sklearn.utils import class_weight
import numpy as np

class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = dict(enumerate(class_weights))

logistic_model = LogisticRegression(
    random_state=42, 
    class_weight=class_weight_dict,
    max_iter=1000
)

In [97]:
logistic_model.fit(X_train_bow,y_train)

In [98]:
log_pred=logistic_model.predict(X_test_bow)
print(accuracy_score(y_test,log_pred))

0.8811163465023559


In [99]:
logistic_model2 = LogisticRegression(max_iter=1000)
logistic_model2.fit(X_train_tfidf,y_train)


In [100]:
log_pred_tfidf=logistic_model2.predict(X_test_tfidf)
print(accuracy_score(y_test,log_pred_tfidf))

0.8664371148967017


In [101]:
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV

svm_model = svm_model = SVC(
    kernel='linear', 
    random_state=42, 
    class_weight=class_weight_dict,
    probability=True
)
svm_model.fit(X_train_tfidf, y_train)

pred_svm = svm_model.predict(X_test_tfidf)
print("SVM Accuracy:", accuracy_score(y_test, pred_svm))

SVM Accuracy: 0.881841246828561


In [102]:
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(
    estimators=[
        ('nb', MultinomialNB()),
        ('svm', SVC(kernel='linear', probability=True, random_state=42)),
        ('lr', LogisticRegression(random_state=42, max_iter=1000))
    ],
    voting='soft'  
)

voting_clf.fit(X_train_tfidf, y_train)

pred_voting = voting_clf.predict(X_test_tfidf)
print("Voting Classifier Accuracy:", accuracy_score(y_test, pred_voting))

Voting Classifier Accuracy: 0.8845596230518303


In [103]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)

pred_rf = rf_model.predict(X_test_tfidf)
print("Random Forest Accuracy:", accuracy_score(y_test, pred_rf))

Random Forest Accuracy: 0.8673432403044581


In [104]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

grid_search = GridSearchCV(
    LogisticRegression(random_state=42, max_iter=1000),
    param_grid,
    cv=5,
    scoring='accuracy'
)
grid_search.fit(X_train_tfidf, y_train)

print("Best parameters:", grid_search.best_params_)
best_lr = grid_search.best_estimator_
pred_best_lr = best_lr.predict(X_test_tfidf)
print("Tuned Logistic Regression Accuracy:", accuracy_score(y_test, pred_best_lr))



Best parameters: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
Tuned Logistic Regression Accuracy: 0.8968829285973179


In [105]:
from sklearn.metrics import classification_report, confusion_matrix

# For the best model, get detailed metrics
best_pred = best_lr.predict(X_test_tfidf)  # Replace with your best model
print("Classification Report:")
print(classification_report(y_test, best_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, best_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.92      0.93      1534
           1       0.91      0.90      0.91       900
           2       0.83      0.79      0.81       441
           3       0.80      0.79      0.79       243
           4       0.89      0.84      0.86       651
           5       0.89      0.94      0.92      1749

    accuracy                           0.90      5518
   macro avg       0.87      0.86      0.87      5518
weighted avg       0.90      0.90      0.90      5518

Confusion Matrix:
[[1412   34   12    4   22   50]
 [  38  811    3    3   21   24]
 [   7    2  349    1    2   80]
 [   1    8    0  191   18   25]
 [  28   26    5   31  545   16]
 [  29   10   54    9    6 1641]]


In [106]:
from sklearn.linear_model import LogisticRegression

final_model = LogisticRegression(
    C=10, 
    penalty='l1', 
    solver='liblinear',
    class_weight='balanced',  
    max_iter=1000,
    random_state=42
)

# Train and test
final_model.fit(X_train_tfidf, y_train)
pred_final = final_model.predict(X_test_tfidf)

print("Final Model Accuracy:", accuracy_score(y_test, pred_final))
print(classification_report(y_test, pred_final))

Final Model Accuracy: 0.897607828923523
              precision    recall  f1-score   support

           0       0.94      0.92      0.93      1534
           1       0.91      0.91      0.91       900
           2       0.80      0.81      0.81       441
           3       0.78      0.81      0.79       243
           4       0.89      0.85      0.87       651
           5       0.90      0.93      0.91      1749

    accuracy                           0.90      5518
   macro avg       0.87      0.87      0.87      5518
weighted avg       0.90      0.90      0.90      5518



In [108]:
import joblib

joblib.dump(final_model, 'final_text_classifier.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

print("Final model saved successfully!")

Final model saved successfully!


In [62]:
df['label'].value_counts()

label
5    5362
0    4666
1    2675
4    1937
2    1304
3     572
Name: count, dtype: int64