In [224]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [225]:
df= pd.read_csv("train.txt",sep=';',header= None,names=['text','emotions'])

In [272]:
df.head()
df.shape


(16000, 2)

In [227]:
df.isnull().sum()

text        0
emotions    0
dtype: int64

In [228]:
unique_emotions=df['emotions'].unique()

In [229]:
unique_emotions

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

# Label encoding

In [230]:
emotion_numbers={}
i=0
for emo in unique_emotions:
    emotion_numbers[emo]=i
    i+=1
df['emotions']=df['emotions'].map(emotion_numbers)

In [231]:
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


# lower case

In [232]:
df['text']=df['text'].apply(lambda x:x.lower())

# Remove Punctuation

In [233]:
import string
def remove_punc(txt):
    return txt.translate(str.maketrans('','',string.punctuation))
df['text']=df['text'].apply(remove_punc)

In [234]:
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


# Removing Numbers

In [235]:
def remove_numbers(txt):
    new=''
    for i in txt:
      if not i.isdigit():
          new=new+i
    return new
df['text']=df['text'].apply(remove_numbers)

In [236]:
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [237]:
def remove_emojis(txt):
    new = '' 
    for i in txt: 
        if i.isascii(): 
            new = new + i 
    
    return new

df['text'] = df['text'].apply(remove_emojis) 

In [238]:
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [239]:
import nltk

In [240]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [241]:
nltk.download("punkt")
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arade\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\arade\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [242]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arade\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [243]:
stop_words=set(stopwords.words('english'))


# Removing Stopwords

In [244]:
def remove(txt):
    words=word_tokenize(txt)
    cleaned=[]
    for i in words:
        if not i in stop_words:
            cleaned.append(i)
    return ' '.join(cleaned)
df['text']=df['text'].apply(remove)

In [245]:
df.head()

Unnamed: 0,text,emotions
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1


In [246]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['emotions'], test_size=0.33, random_state=42)

In [247]:
bow_vectorizer=CountVectorizer()

In [248]:
X_train_bow=bow_vectorizer.fit_transform(X_train)
X_test_bow=bow_vectorizer.transform(X_test)

In [249]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [250]:
nb_model=MultinomialNB()


In [251]:
nb_model.fit(X_train_bow,y_train)

In [252]:
pred_bow=nb_model.predict(X_test_bow)
print(accuracy_score(y_test,pred_bow))

0.7649621212121213


In [253]:
tfidf_vectorizer=TfidfVectorizer()
X_train_tfidf=tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf=tfidf_vectorizer.transform(X_test)
nb2_model=MultinomialNB()
nb2_model.fit(X_train_tfidf,y_train)
pred_tfidf=nb2_model.predict(X_test_tfidf)
print(accuracy_score(y_test,pred_tfidf))

0.6609848484848485


In [254]:
from sklearn.linear_model import LogisticRegression

In [268]:
from sklearn.utils import class_weight
import numpy as np

class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = dict(enumerate(class_weights))

logistic_model = LogisticRegression(
    random_state=42, 
    class_weight=class_weight_dict,
    max_iter=1000
)

In [269]:
logistic_model.fit(X_train_bow,y_train)

In [270]:
log_pred=logistic_model.predict(X_test_bow)
print(accuracy_score(y_test,log_pred))

0.8878787878787879


In [258]:
logistic_model2 = LogisticRegression(max_iter=1000)
logistic_model2.fit(X_train_tfidf,y_train)


In [259]:
log_pred_tfidf=logistic_model2.predict(X_test_tfidf)
print(accuracy_score(y_test,log_pred_tfidf))

0.8473484848484848


In [271]:
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV

svm_model = svm_model = SVC(
    kernel='linear', 
    random_state=42, 
    class_weight=class_weight_dict,
    probability=True
)
svm_model.fit(X_train_tfidf, y_train)

pred_svm = svm_model.predict(X_test_tfidf)
print("SVM Accuracy:", accuracy_score(y_test, pred_svm))

SVM Accuracy: 0.8804924242424242


In [261]:
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(
    estimators=[
        ('nb', MultinomialNB()),
        ('svm', SVC(kernel='linear', probability=True, random_state=42)),
        ('lr', LogisticRegression(random_state=42, max_iter=1000))
    ],
    voting='soft'  
)

voting_clf.fit(X_train_tfidf, y_train)

pred_voting = voting_clf.predict(X_test_tfidf)
print("Voting Classifier Accuracy:", accuracy_score(y_test, pred_voting))

Voting Classifier Accuracy: 0.8765151515151515


In [262]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)

pred_rf = rf_model.predict(X_test_tfidf)
print("Random Forest Accuracy:", accuracy_score(y_test, pred_rf))

Random Forest Accuracy: 0.8791666666666667


In [263]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

grid_search = GridSearchCV(
    LogisticRegression(random_state=42, max_iter=1000),
    param_grid,
    cv=5,
    scoring='accuracy'
)
grid_search.fit(X_train_tfidf, y_train)

print("Best parameters:", grid_search.best_params_)
best_lr = grid_search.best_estimator_
pred_best_lr = best_lr.predict(X_test_tfidf)
print("Tuned Logistic Regression Accuracy:", accuracy_score(y_test, pred_best_lr))



Best parameters: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
Tuned Logistic Regression Accuracy: 0.8977272727272727


In [265]:
models = {
    'Naive Bayes': nb2_model,
    'Naive Bayes': nb_model,
    'Logistic Regression': logistic_model2,

    'SVM': svm_model,
    'Logistic Regression': logistic_model,
    'Voting Classifier': voting_clf,
    'Random Forest': rf_model,
    'Tuned Logistic Regression': best_lr
}

for name, model in models.items():
    pred = model.predict(X_test_tfidf)
    acc = accuracy_score(y_test, pred)
    print(f"{name} Accuracy: {acc:.4f}")

Naive Bayes Accuracy: 0.7233
Logistic Regression Accuracy: 0.8080
SVM Accuracy: 0.8801
Voting Classifier Accuracy: 0.8765
Random Forest Accuracy: 0.8792
Tuned Logistic Regression Accuracy: 0.8977


In [266]:
from sklearn.metrics import classification_report, confusion_matrix

# For the best model, get detailed metrics
best_pred = best_lr.predict(X_test_tfidf)  # Replace with your best model
print("Classification Report:")
print(classification_report(y_test, best_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, best_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      1559
           1       0.87      0.89      0.88       665
           2       0.83      0.78      0.80       477
           3       0.83      0.77      0.80       193
           4       0.89      0.83      0.86       635
           5       0.90      0.94      0.92      1751

    accuracy                           0.90      5280
   macro avg       0.88      0.86      0.87      5280
weighted avg       0.90      0.90      0.90      5280

Confusion Matrix:
[[1460   40    9    5   17   28]
 [  33  594    4    2   15   17]
 [   7    5  370    0    3   92]
 [   5    1    1  148   24   14]
 [  29   30    3   15  526   32]
 [  28    9   60    8    4 1642]]


In [274]:
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


estimators = [
    ('lr', LogisticRegression(random_state=42, max_iter=1000)),
    ('svm', SVC(kernel='linear', probability=True, random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
]

stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5
)

xgb_model = XGBClassifier(
    random_state=42,
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
    use_label_encoder=False
)

print("Training Stacking Classifier...")
stacking_clf.fit(X_train_tfidf, y_train)
pred_stacking = stacking_clf.predict(X_test_tfidf)
acc_stacking = accuracy_score(y_test, pred_stacking)
print(f"Stacking Classifier Accuracy: {acc_stacking:.4f}")

print("Training XGBoost Classifier...")
xgb_model.fit(X_train_tfidf, y_train)
pred_xgb = xgb_model.predict(X_test_tfidf)
acc_xgb = accuracy_score(y_test, pred_xgb)
print(f"XGBoost Accuracy: {acc_xgb:.4f}")

print(f"\nComparison with previous best:")
print(f"Tuned Logistic Regression Accuracy: 0.8977")
print(f"Stacking Classifier Accuracy: {acc_stacking:.4f}")
print(f"XGBoost Accuracy: {acc_xgb:.4f}")

if acc_stacking > 0.8977 or acc_xgb > 0.8977:
    best_model = stacking_clf if acc_stacking > acc_xgb else xgb_model
    best_pred = pred_stacking if acc_stacking > acc_xgb else pred_xgb
    best_name = "Stacking Classifier" if acc_stacking > acc_xgb else "XGBoost"
    
    print(f"\nDetailed metrics for {best_name}:")
    print("Classification Report:")
    print(classification_report(y_test, best_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, best_pred))

Training Stacking Classifier...
Stacking Classifier Accuracy: 0.8903
Training XGBoost Classifier...


Parameters: { "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Accuracy: 0.8792

Comparison with previous best:
Tuned Logistic Regression Accuracy: 0.8977
Stacking Classifier Accuracy: 0.8903
XGBoost Accuracy: 0.8792
