In [18]:
%cd /content/drive/MyDrive/MLProjects/Emotion_Detection_from_text

import pandas as pd
import joblib
# import spacy
import sklearn

df = pd.read_csv("Emotion_classify_Data.csv")
df.head()



/content/drive/MyDrive/MLProjects/Emotion_Detection_from_text


Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [None]:
df["Emotion"].value_counts()

Emotion
anger    2000
joy      2000
fear     1937
Name: count, dtype: int64

In [4]:
from sklearn.preprocessing import LabelEncoder

encoder= LabelEncoder()

y = encoder.fit_transform(df["Emotion"])
y

array([1, 0, 1, ..., 2, 1, 0])

In [None]:
encoder.classes_

array(['anger', 'fear', 'joy'], dtype=object)

In [None]:
labels = {l:i for i,l in enumerate(encoder.classes_)}
labels

{'anger': 0, 'fear': 1, 'joy': 2}

In [None]:

df["labeled_taget"] = y
df.head()

Unnamed: 0,Comment,Emotion,processed_comment,labeled_taget
0,i seriously hate one subject to death but now ...,fear,,1
1,im so full of life i feel appalled,anger,,0
2,i sit here to write i start to dig out my feel...,fear,,1
3,ive been really angry with r and i feel like a...,joy,,2
4,i feel suspicious if there is no one outside l...,fear,,1


In [None]:
from sklearn.model_selection import train_test_split

x_train,x_test, y_train,y_test = train_test_split(df["Comment"], df["labeled_taget"],
                                                  test_size = 0.25,
                                                  stratify = df['labeled_taget'],
                                                  random_state = 2)

print(x_train.shape, x_test.shape)

(4452,) (1485,)


In [None]:
y_train.value_counts()

labeled_taget
0    1500
2    1500
1    1452
Name: count, dtype: int64

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


models = [
    {
        "name": "logistic_regression",
        "model": LogisticRegression(max_iter = 10000),
        "params":{
            "logistic_regression__C": [1,0.1,0.01]
        }

    },

    {
        "name": "random_forest",
        "model": RandomForestClassifier(),
        "params":{
            "random_forest__n_estimators" : [100,150,250]
        }
    },

    {
        "name": "knn",
        "model": KNeighborsClassifier(),
        "params":{
            "knn__n_neighbors" : [10, 20, 40]

        }
    },

     {
        "name": "naive_bayes",
        "model": MultinomialNB(),
        "params":{}

    }




]



In [None]:
results = {}
best_models ={}
for obj in models:
  name= obj["name"]
  model = obj["model"]
  params = obj["params"]
  pipe = Pipeline([
      ("vectorizer", TfidfVectorizer()),
      (name, model)
  ])

  clf = GridSearchCV(pipe, params, cv=5)
  clf.fit(x_train,y_train)
  results[name] = {"best_score":clf.best_score_, "best_params": clf.best_params_}
  best_models[name] = clf.best_estimator_
  print(clf.cv_results_)


{'mean_fit_time': array([1.13851533, 0.54922147, 0.26865358]), 'std_fit_time': array([0.36988439, 0.17437718, 0.0837905 ]), 'mean_score_time': array([0.08431978, 0.05951347, 0.0593255 ]), 'std_score_time': array([0.031696  , 0.01657442, 0.0192481 ]), 'param_logistic_regression__C': masked_array(data=[1, 0.1, 0.01],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'logistic_regression__C': 1}, {'logistic_regression__C': 0.1}, {'logistic_regression__C': 0.01}], 'split0_test_score': array([0.88664422, 0.79461279, 0.68574635]), 'split1_test_score': array([0.90572391, 0.78563412, 0.66666667]), 'split2_test_score': array([0.9011236 , 0.81460674, 0.6988764 ]), 'split3_test_score': array([0.89775281, 0.79662921, 0.69662921]), 'split4_test_score': array([0.89213483, 0.80786517, 0.69101124]), 'mean_test_score': array([0.89667587, 0.79986961, 0.68778597]), 'std_test_score': array([0.00669424, 0.01021661, 0.01150468]), 'rank_test_score': array(

In [None]:
results = pd.DataFrame(results)
results

Unnamed: 0,logistic_regression,random_forest,knn,naive_bayes
best_score,0.896676,0.901617,0.791334,0.881627
best_params,{'logistic_regression__C': 1},{'random_forest__n_estimators': 250},{'knn__n_neighbors': 40},{}


In [None]:
from sklearn.metrics import classification_report
for name,model in best_models.items():
  y_pred = model.predict(x_test)
  print(name)
  print(classification_report(y_test, y_pred))


logistic_regression
              precision    recall  f1-score   support

           0       0.91      0.91      0.91       500
           1       0.94      0.88      0.91       485
           2       0.89      0.95      0.92       500

    accuracy                           0.91      1485
   macro avg       0.91      0.91      0.91      1485
weighted avg       0.91      0.91      0.91      1485

random_forest
              precision    recall  f1-score   support

           0       0.95      0.89      0.92       500
           1       0.93      0.92      0.93       485
           2       0.89      0.96      0.92       500

    accuracy                           0.92      1485
   macro avg       0.92      0.92      0.92      1485
weighted avg       0.92      0.92      0.92      1485

knn
              precision    recall  f1-score   support

           0       0.76      0.87      0.81       500
           1       0.80      0.81      0.81       485
           2       0.86      0.72    

In [7]:
import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess(text):
  doc =  nlp(text)
  result = []

  for token in doc:
    if token.is_punct or token.is_stop:
      continue

    result.append(token.lemma_)

  return " ".join(result)



In [None]:
df["processed_comment"] = df["Comment"].apply(preprocess)


In [None]:
df.to_csv("processed_dataset.csv", index = False)

In [None]:
df.head()

Unnamed: 0,Comment,Emotion,processed_comment,labeled_taget
0,i seriously hate one subject to death but now ...,fear,seriously hate subject death feel reluctant drop,1
1,im so full of life i feel appalled,anger,m life feel appalled,0
2,i sit here to write i start to dig out my feel...,fear,sit write start dig feeling think afraid accep...,1
3,ive been really angry with r and i feel like a...,joy,ve angry r feel like idiot trust place,2
4,i feel suspicious if there is no one outside l...,fear,feel suspicious outside like rapture happen,1


In [None]:
from sklearn.model_selection import train_test_split

x_train,x_test, y_train,y_test = train_test_split(df["processed_comment"], df["labeled_taget"],
                                                  test_size = 0.25,
                                                  stratify = df['labeled_taget'],
                                                  random_state = 2)


In [None]:
results = {}
best_models ={}
cv_results =[]
for obj in models:
  name= obj["name"]
  model = obj["model"]
  params = obj["params"]
  pipe = Pipeline([
      ("vectorizer", TfidfVectorizer()),
      (name, model)
  ])

  clf = GridSearchCV(pipe, params, cv=5)
  clf.fit(x_train,y_train)
  results[name] = {"best_score":clf.best_score_, "best_params": clf.best_params_}
  best_models[name] = clf.best_estimator_
  cv_results.append(clf.cv_results_)


In [None]:
from sklearn.metrics import classification_report
for name,model in best_models.items():
  y_pred = model.predict(x_test)
  print(name)
  print(classification_report(y_test, y_pred))

logistic_regression
              precision    recall  f1-score   support

           0       0.92      0.91      0.91       500
           1       0.95      0.89      0.92       485
           2       0.90      0.96      0.93       500

    accuracy                           0.92      1485
   macro avg       0.92      0.92      0.92      1485
weighted avg       0.92      0.92      0.92      1485

random_forest
              precision    recall  f1-score   support

           0       0.93      0.90      0.92       500
           1       0.94      0.92      0.93       485
           2       0.91      0.95      0.93       500

    accuracy                           0.93      1485
   macro avg       0.93      0.93      0.93      1485
weighted avg       0.93      0.93      0.93      1485

knn
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       500
           1       0.84      0.88      0.86       485
           2       0.94      0.80    

In [None]:
rdf = pd.DataFrame({})
for item in cv_results:
  x = pd.DataFrame(item)
  rdf = pd.concat([rdf,x], axis=0)



rdf


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_logistic_regression__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,param_random_forest__n_estimators,param_knn__n_neighbors
0,0.680657,0.092821,0.040916,0.005558,1.0,{'logistic_regression__C': 1},0.914703,0.927048,0.917978,0.907865,0.913483,0.916215,0.006324,1,,
1,0.311978,0.04937,0.034642,0.008196,0.1,{'logistic_regression__C': 0.1},0.872054,0.881033,0.883146,0.869663,0.869663,0.875112,0.005802,2,,
2,0.22162,0.112047,0.019485,0.001841,0.01,{'logistic_regression__C': 0.01},0.776655,0.780022,0.792135,0.791011,0.776404,0.783246,0.006927,3,,
0,2.726078,1.108227,0.077553,0.002353,,{'random_forest__n_estimators': 100},0.921437,0.91807,0.922472,0.91573,0.904494,0.916441,0.006437,2,100.0,
1,5.064076,1.458428,0.117204,0.017949,,{'random_forest__n_estimators': 150},0.923681,0.922559,0.921348,0.913483,0.897753,0.915765,0.009692,3,150.0,
2,8.220712,2.721538,0.200672,0.029082,,{'random_forest__n_estimators': 250},0.919192,0.929293,0.925843,0.913483,0.905618,0.918686,0.008507,1,250.0,
0,0.070194,0.031699,0.490948,0.025978,,{'knn__n_neighbors': 10},0.836139,0.832772,0.850562,0.849438,0.834831,0.840749,0.007638,3,,10.0
1,0.071949,0.017339,0.592924,0.089563,,{'knn__n_neighbors': 20},0.855219,0.845118,0.847191,0.838202,0.84382,0.84591,0.005526,1,,20.0
2,0.104893,0.0252,0.640934,0.059425,,{'knn__n_neighbors': 40},0.859708,0.855219,0.839326,0.82809,0.839326,0.844334,0.011566,2,,40.0
0,0.052044,0.001744,0.011632,0.00029,,{},0.89899,0.882155,0.895506,0.888764,0.891011,0.891285,0.005779,1,,


In [None]:
# import json


# with open("emotion_labels.json","w") as f:
#   f.write(json.dumps(labels))

In [None]:


joblib.dump(best_models["random_forest"], "emotion_detection_model.pkl")

['emotion_detection_model.pkl']

In [7]:
import spacy
