In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#loading the raw emotion dataset into a dataframe
#NOTE(review): absolute, machine-specific Windows path -- this cell only runs where
#that exact file exists; consider a path relative to the notebook instead
df = pd.read_csv("C:\\Users\\ajeeth\\Downloads\\emotion_dataset_raw.csv")

In [None]:
df.head()

EDA

In [None]:
df.isnull().sum()

In [None]:
df['Emotion'].value_counts(ascending=False)

In [None]:
#bar per emotion, ordered from the most to the least frequent label
emotion_order = df['Emotion'].value_counts().index
sns.countplot(data=df, x='Emotion', order=emotion_order)
plt.show()

Sentiment Analysis

In [None]:
from textblob import TextBlob

In [None]:
def sentiment_calculator(text):
    """Label ``text`` by the sign of its TextBlob polarity score.

    Returns 'positive' when the polarity is above zero, 'negative' when it
    is below zero, and 'neutral' when it is exactly zero.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return 'neutral'

In [None]:
df['sentiment'] = df['Text'].apply(sentiment_calculator)

In [None]:
df.head()

In [None]:
df.groupby(['Emotion','sentiment']).count()

In [None]:
# visualising the emotion and sentiment
plt.figure(figsize=(8, 5))
sns.countplot(x='Emotion', data=df, hue='sentiment')
# explicit show() renders the figure without echoing the Axes repr,
# matching the other plotting cells in this notebook
plt.show()

Text cleaning

In [None]:
import neattext.functions as nfx

In [None]:
#dir(nfx)

In [None]:
#removing punctuations
#NOTE(review): this runs before the remove_userhandles / remove_hashtags cells; if
#remove_punctuations strips '@' and '#', those later steps would find nothing left
#to match -- confirm against the neattext behaviour and reorder if so
df['Clean_Text'] = df['Text'].apply(nfx.remove_punctuations)

In [None]:
#removing usernames
df['Clean_Text'] = df['Clean_Text'].apply(nfx.remove_userhandles)

In [None]:
#removing stopwords
df['Clean_Text'] = df['Clean_Text'].apply(nfx.remove_stopwords)

In [None]:
#removing hashtags
df['Clean_Text'] = df['Clean_Text'].apply(nfx.remove_hashtags)

In [None]:
#removing emojis
df['Clean_Text'] = df['Clean_Text'].apply(nfx.remove_emojis)

In [None]:
df[['Text','Clean_Text']].head()

In [None]:
df['Emotion'].unique()

In [None]:
#collecting the Clean_Text values of every 'joy' row into a list
is_joy = df['Emotion'] == 'joy'
joy_list = df.loc[is_joy, 'Clean_Text'].tolist()

In [None]:
#joy_list

In [None]:
#converting the list of joy texts to a single doc
#a space separator is required: joining with '' glues the last word of one text
#to the first word of the next, which corrupts every whitespace-split word count
joy_docs = ' '.join(joy_list)

In [None]:
#joy_docs

In [None]:
#way of approach 1:
#hand-rolled frequency table over the whitespace-separated tokens of joy_docs
joy_emotion_keywords = {}
for word in joy_docs.split():
    joy_emotion_keywords[word] = joy_emotion_keywords.get(word, 0) + 1

In [None]:
#rank the (keyword, count) pairs by count, highest first, and keep the first 50
ranked_joy_keywords = sorted(joy_emotion_keywords.items(), key=lambda pair: pair[1], reverse=True)
common_joy_emotion_keywords = dict(ranked_joy_keywords[:50])

print(common_joy_emotion_keywords)

In [None]:
#way of approach 2:
#function for extracting keywords in an emotion
from collections import Counter
def extract_keywords(text, top_n=50):
    """Return the most frequent whitespace-separated tokens of ``text``.

    Parameters
    ----------
    text : str
        Document to tokenize with ``str.split()``.
    top_n : int, optional
        Maximum number of keywords to return (default 50, the value that
        was previously hard-coded).

    Returns
    -------
    dict
        Token -> count, ordered from the most to the least frequent token.
        Empty when ``text`` contains no tokens.
    """
    return dict(Counter(text.split()).most_common(top_n))

In [None]:
joy_keyword = extract_keywords(joy_docs)

In [None]:
joy_keyword

In [None]:
#function for visualising keywords in an emotion
def plotting_keywords(dic,title):
    """Show a bar chart of keyword counts for one emotion.

    dic   : mapping of keyword -> count; each item becomes one bar
    title : emotion name interpolated into the chart title
    """
    keyword_counts = pd.DataFrame(data=dic.items(), columns=['keyword', 'count'])
    plt.figure(figsize=(20, 10))
    sns.barplot(data=keyword_counts, x='keyword', y='count')
    # slanted tick labels so the keyword names on the x axis do not overlap
    plt.xticks(rotation=45)
    plt.title(f'50 common keywords of {title}')
    plt.show()

In [None]:
#visualising joy keywords
plotting_keywords(joy_keyword,'joy')

In [None]:
#pip install wordcloud

In [None]:
from wordcloud import WordCloud

In [None]:
#function for wordcloud
def wordcloud_keyword(text,title):
    """Render a word cloud generated from ``text`` on a new 10x5 figure.

    text  : document passed to WordCloud().generate
    title : emotion name interpolated into the figure title
    """
    cloud = WordCloud().generate(text)
    plt.figure(figsize=(10, 5))
    plt.title(f'wordcloud of {title}')
    plt.imshow(cloud, interpolation='bilinear')
    # the cloud is an image, so the axis ticks and frame are switched off
    plt.axis('off')

In [None]:
wordcloud_keyword(joy_docs,'joy')

In [None]:
df.head()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report

# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2, so a
# plain import fails on current releases. Fall back to the replacement API under the
# same name so the evaluation cells further down keep working unchanged.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Plot the confusion matrix of ``estimator`` evaluated on ``X`` / ``y_true``.

        Thin wrapper around ``ConfusionMatrixDisplay.from_estimator``; returns
        the display object it creates.
        """
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
#setting features and labels
X = df['Clean_Text']
y = df['Emotion']

In [None]:
#vectorize
#fit on the source column instead of on X itself: X is rebound to the
#document-term matrix here, so fitting on X would fail when the cell is re-run
vect = CountVectorizer()
X = vect.fit_transform(df['Clean_Text'])

In [None]:
X

In [None]:
#vect.get_feature_names()

In [None]:
#keep the document-term matrix sparse (CSR) instead of converting it to dense:
#train_test_split, MultinomialNB and LogisticRegression all accept sparse input,
#a dense documents x vocabulary array is mostly zeros and can exhaust memory,
#and unlike toarray() this cell can be re-run safely
X = X.tocsr()

In [None]:
#data splitting: 30% of the rows held out for testing, fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [None]:
#X_train

In [None]:
#Naive bayes model
nv_model = MultinomialNB()

In [None]:
nv_model.fit(X_train,y_train)

In [None]:
pred_nv_model = nv_model.predict(X_test)

In [None]:
nv_model.score(X_test,y_test)

In [None]:
#sample coding
#sample1 = ["I love coding"]

In [None]:
#vgf = vect.transform(sample1)

In [None]:
#vgf = vgf.toarray()

In [None]:
#nv_model.predict(vgf)

In [None]:
#nv_model.predict_proba(vgf)

In [None]:
#nv_model.classes_

In [None]:
#function for predicting user customised texts
def predict_emotion(samp_txt,model,vectorizer=None):
    """Predict the emotion of the first text in ``samp_txt`` and print it.

    Parameters
    ----------
    samp_txt : list of str or str
        Texts to classify; a single string is treated as a one-item list.
        Only the first text is reported.
    model : fitted classifier
        Must expose ``predict``, ``predict_proba`` and ``classes_``.
    vectorizer : object with ``transform``, optional
        Converts the texts to features. Defaults to the notebook-level
        ``vect`` that the models were trained with.

    Returns
    -------
    dict
        Class label -> predicted probability for the first text.
    """
    if vectorizer is None:
        vectorizer = vect
    if isinstance(samp_txt, str):
        samp_txt = [samp_txt]
    features = vectorizer.transform(samp_txt).toarray()
    prediction = model.predict(features)
    # computed once and reused for both the printed score and the returned dict
    prediction_probablity = model.predict_proba(features)
    first_probs = prediction_probablity[0]
    # the maximum is taken over the first row only, so the printed score always
    # belongs to the printed label even when several texts are passed
    print(f'{prediction[0]} : {np.max(first_probs)}')
    return dict(zip(model.classes_, first_probs))

In [None]:
samp_txt = ["Wow i didn't expect you"]
predict_emotion(samp_txt,nv_model)

In [None]:
#logistic regression model
lr_model = LogisticRegression()

In [None]:
lr_model.fit(X_train,y_train)

In [None]:
pred_lr_model = lr_model.predict(X_test)

In [None]:
#checking accuracy
lr_model.score(X_test,y_test)

In [None]:
#samp_txt

In [None]:
predict_emotion(samp_txt,lr_model)

Model Evaluation

In [None]:
#Classification report for naive bayes
print(classification_report(y_test,pred_nv_model))

In [None]:
#confusion matrix plot for naive bayes
plot_confusion_matrix(nv_model,X_test,y_test)
plt.xticks(rotation=45)
plt.show()

In [None]:
#classification report for logistic reg
print(classification_report(y_test,pred_lr_model))

In [None]:
#confusion matrix plot for logistic reg
plot_confusion_matrix(lr_model,X_test,y_test)
plt.xticks(rotation=45)
plt.show()

Serialize and Save Model

In [None]:
import joblib

In [None]:
#saving the naive bayes model with joblib
#the with-block closes the file even if dump() raises
#NOTE(review): only the classifier is written here; the fitted CountVectorizer (vect)
#is also needed to transform new text before predicting -- confirm it is persisted too
with open("Text_Classification_sentiment_analysis_nv_model.pkl","wb") as model_save:
    joblib.dump(nv_model,model_save)

In [None]:
#saving the logistic reg model with joblib
#the with-block closes the file even if dump() raises
#NOTE(review): only the classifier is written here; the fitted CountVectorizer (vect)
#is also needed to transform new text before predicting -- confirm it is persisted too
with open("Text_Classification_sentiment_analysis_lr_model.pkl","wb") as model_save:
    joblib.dump(lr_model,model_save)

Emotion Detection in Text: An End-to-End NLP Pipeline Using Naive Bayes and Logistic Regression

Objective:
The primary goal of this project is to develop a robust and efficient pipeline for detecting emotions in text using Natural Language Processing (NLP) techniques and machine learning. Using logistic regression and naive Bayes classifiers, the project aims to classify text into emotional categories such as joy, sadness, anger, fear, and others.

Dataset:
The project utilizes a dataset containing text samples labeled with corresponding emotions. The dataset is loaded and inspected to understand the distribution of emotions and prepare for subsequent processing.

Methodology:

Data Exploration and Visualization:

Load the dataset using pandas and explore the distribution of emotions using value counts.
Visualize the data distribution using seaborn to understand the frequency of each emotion category.

Data Cleaning:

Employ the neattext library to clean the text data by removing user handles, stopwords, and other irrelevant components.
Create a new column Clean_Text in the dataframe to store the cleaned text.

Feature Engineering:

Define features (cleaned text) and labels (emotions) for the model.
Split the data into training and test sets using train_test_split from sklearn.

Model Building:

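Build the modelling workflow step by step; the notebook applies the vectorizer and the classifiers as separate steps rather than through sklearn's Pipeline.
Use CountVectorizer for text vectorization, and MultinomialNB and LogisticRegression for classification.
Train both the naive Bayes and the logistic regression models on the training data.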

Model Evaluation:

Evaluate the model’s performance on the test set using accuracy score and classification report.
Assess model predictions and prediction probabilities for individual text samples.

Model Saving:

Save the trained naive Bayes and logistic regression models using joblib for future use and deployment.
