In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import nltk


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



In [2]:
df = pd .read_csv('emotion_sentimen_dataset.csv', encoding='utf-8')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,Emotion
0,0,i seriously hate one subject to death but now ...,hate
1,1,im so full of life i feel appalled,neutral
2,2,i sit here to write i start to dig out my feel...,neutral
3,3,ive been really angry with r and i feel like a...,anger
4,4,i feel suspicious if there is no one outside l...,neutral


In [4]:
from ydata_profiling import ProfileReport

In [5]:
profile = ProfileReport(df, title="Emotion Recognition")
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:15<00:00,  5.06s/it]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
plt.pie(df.Emotion.value_counts(), labels = df.Emotion.value_counts().index)

([<matplotlib.patches.Wedge at 0x1cb16888e50>,
  <matplotlib.patches.Wedge at 0x1cb1686a490>,
  <matplotlib.patches.Wedge at 0x1cb16868c50>,
  <matplotlib.patches.Wedge at 0x1cb1686b3d0>,
  <matplotlib.patches.Wedge at 0x1cb168e9010>,
  <matplotlib.patches.Wedge at 0x1cb1685fe10>,
  <matplotlib.patches.Wedge at 0x1cb1685ec90>,
  <matplotlib.patches.Wedge at 0x1cb1685de90>,
  <matplotlib.patches.Wedge at 0x1cb1685cd10>,
  <matplotlib.patches.Wedge at 0x1cb168eb190>,
  <matplotlib.patches.Wedge at 0x1cb16852dd0>,
  <matplotlib.patches.Wedge at 0x1cb16851e50>,
  <matplotlib.patches.Wedge at 0x1cb16850e10>],
 [Text(-0.8968681975074431, 0.6368888728025872, 'neutral'),
  Text(0.5116845310976186, -0.973744802623054, 'love'),
  Text(0.7364361572609892, -0.8171057375142263, 'happiness'),
  Text(0.8620834424273901, -0.6832365170953182, 'sadness'),
  Text(0.9422538840086998, -0.5675893040487284, 'relief'),
  Text(1.0033026167654389, -0.45098099648613016, 'hate'),
  Text(1.0444542825775267, -0.345

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(df.text.str.lower(), df.Emotion, shuffle = True, test_size = 0.2, random_state = 42)

In [8]:
counter = CountVectorizer(
    stop_words=nltk.corpus.stopwords.words("english"),
    ngram_range=(1,1)
)
counter.fit_transform(df.text.str.lower())

<839555x75132 sparse matrix of type '<class 'numpy.int64'>'
	with 7598087 stored elements in Compressed Sparse Row format>

In [9]:
X_train_bow = counter.transform(X_train)
X_test_bow = counter.transform(X_test)

In [10]:
model = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42)
model.fit(X_train_bow, Y_train)
pred = model.predict(X_test_bow)

In [11]:
print("Accuracy:", metrics.accuracy_score(pred, Y_test))

Accuracy: 0.9977785850837646


In [12]:
print(metrics.classification_report(pred, Y_test))

              precision    recall  f1-score   support

       anger       0.99      0.99      0.99      2477
     boredom       0.86      1.00      0.92        18
       empty       0.97      1.00      0.99      1065
  enthusiasm       0.99      1.00      0.99      1820
         fun       0.98      1.00      0.99      1939
   happiness       1.00      0.99      1.00      5377
        hate       0.99      1.00      0.99      3000
        love       0.99      1.00      1.00      7972
     neutral       1.00      1.00      1.00    135271
      relief       0.98      1.00      0.99      3337
     sadness       0.98      1.00      0.99      3375
    surprise       0.99      1.00      0.99      1360
       worry       0.99      1.00      0.99       900

    accuracy                           1.00    167911
   macro avg       0.98      1.00      0.99    167911
weighted avg       1.00      1.00      1.00    167911



In [13]:
disp = metrics.ConfusionMatrixDisplay.from_predictions(Y_test, pred)
disp.figure_.savefig("confusion_matrix.png")  


In [14]:
def predict_emotion(text):
    input_vector = counter.transform([text.lower()])
    prediction = model.predict(input_vector)[0]
    
    # Get probability distribution
    probabilities = model.predict_proba(input_vector)[0]
    
    # Get confidence of predicted class
    predicted_index = list(model.classes_).index(prediction)
    confidence = probabilities[predicted_index] * 100  # Convert to %
    
    print(f"Input: {text}")
    print(f"Predicted Emotion: {prediction} ({confidence:.2f}%)")


In [15]:
predict_emotion("I can't stop laughing, this is so funny!")
predict_emotion("I'm extremely anxious and nervous")


Input: I can't stop laughing, this is so funny!
Predicted Emotion: fun (99.79%)
Input: I'm extremely anxious and nervous
Predicted Emotion: neutral (99.79%)


In [16]:
import joblib

# Save Logistic Regression model
joblib.dump(model, 'emotion_model_2.pkl')

# Save CountVectorizer
joblib.dump(counter, 'count_vectorizer_2.pkl')


['count_vectorizer_2.pkl']

In [17]:
model = joblib.load('emotion_model_2.pkl')
counter = joblib.load('count_vectorizer_2.pkl')

predict_emotion("I'm happy to see you!")


Input: I'm happy to see you!
Predicted Emotion: happiness (99.18%)


In [19]:
import joblib
import h5py
import io

# Serialize model and vectorizer using joblib into memory
model_bytes = io.BytesIO()
vectorizer_bytes = io.BytesIO()

joblib.dump(model, model_bytes)
joblib.dump(counter, vectorizer_bytes)

# Go back to beginning of buffers
model_bytes.seek(0)
vectorizer_bytes.seek(0)

# Save to .h5 using h5py
with h5py.File("emotion_model.h5", "w") as f:
    f.create_dataset("model", data=np.void(model_bytes.read()))
    f.create_dataset("vectorizer", data=np.void(vectorizer_bytes.read()))


In [20]:
with h5py.File("emotion_model.h5", "r") as f:
    model_loaded = joblib.load(io.BytesIO(f["model"][()].tobytes()))
    vectorizer_loaded = joblib.load(io.BytesIO(f["vectorizer"][()].tobytes()))


In [21]:
predict_emotion("I'm happy to see you!")

Input: I'm happy to see you!
Predicted Emotion: happiness (99.18%)
