In [8]:
import warnings

# Filter out all warnings
warnings.filterwarnings('ignore')

import pandas as pd
df=pd.read_csv("E:\\ENTRI DSML\\DSML\\nlp_dataset.csv")
df.head()

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [9]:
print("Shape:", df.shape)
print("Size:",df.size)
print(df.columns)
print(df.describe(include='all'))
print(df.isnull().sum())
print(df.duplicated().sum())

Shape: (5937, 2)
Size: 11874
Index(['Comment', 'Emotion'], dtype='object')
                                                 Comment Emotion
count                                               5937    5937
unique                                              5934       3
top     i feel like a tortured artist when i talk to her   anger
freq                                                   2    2000
Comment    0
Emotion    0
dtype: int64
0


In [10]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Text cleaning function
def clean_text(text):
    text = text.lower() 
    text = re.sub(r'\b\w{1,2}\b', '', text)  
    text = re.sub(r'[^\w\s]', '', text)     
    return text
    
# Apply text cleaning
df['Cleaned_Comment'] = df['Comment'].apply(clean_text)

# Display the cleaned comments
df[['Comment', 'Cleaned_Comment']]

Unnamed: 0,Comment,Cleaned_Comment
0,i seriously hate one subject to death but now ...,seriously hate one subject death but now fe...
1,im so full of life i feel appalled,full life feel appalled
2,i sit here to write i start to dig out my feel...,sit here write start dig out feelings and...
3,ive been really angry with r and i feel like a...,ive been really angry with and feel like id...
4,i feel suspicious if there is no one outside l...,feel suspicious there one outside like the...
...,...,...
5932,i begun to feel distressed for you,begun feel distressed for you
5933,i left feeling annoyed and angry thinking that...,left feeling annoyed and angry thinking that ...
5934,i were to ever get married i d have everything...,were ever get married have everything read...
5935,i feel reluctant in applying there because i w...,feel reluctant applying there because want ...


In [11]:
# Tokenization and stopword removal
stop_words = set(stopwords.words('english'))
def tokenize_and_remove_stopwords(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)
df['Processed_Comment'] = df['Cleaned_Comment'].apply(tokenize_and_remove_stopwords)
df['Processed_Comment'] 

0       seriously hate one subject death feel reluctan...
1                                 full life feel appalled
2       sit write start dig feelings think afraid acce...
3       ive really angry feel like idiot trusting firs...
4       feel suspicious one outside like rapture happe...
                              ...                        
5932                                begun feel distressed
5933    left feeling annoyed angry thinking center stu...
5934    ever get married everything ready offer got to...
5935    feel reluctant applying want able find company...
5936           wanted apologize feel like heartless bitch
Name: Processed_Comment, Length: 5937, dtype: object

In [12]:
#Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Processed_Comment'])

# Vocabulary for reference

print("\nVocabulary for TF-IDF:")
print(tfidf_vectorizer.get_feature_names_out())

# Print TF-IDF matrices
print("\nTF-IDF matrix:")
print(tfidf_matrix.toarray())



Vocabulary for TF-IDF:
['aac' 'aaron' 'abandon' ... 'zone' 'zonisamide' 'zumba']

TF-IDF matrix:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Split the dataset into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred_nb = nb_classifier.predict(X_test)

# Calculate accuracy and classification report for Naive Bayes
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print("\nNaive Bayes Model Accuracy:", nb_accuracy)
print("Classification Report for Naive Bayes:\n", classification_report(y_test, y_pred_nb))


Naive Bayes Model Accuracy: 0.9048821548821548
Classification Report for Naive Bayes:
               precision    recall  f1-score   support

       anger       0.87      0.95      0.91       392
        fear       0.92      0.89      0.91       416
         joy       0.92      0.88      0.90       380

    accuracy                           0.90      1188
   macro avg       0.91      0.90      0.90      1188
weighted avg       0.91      0.90      0.90      1188



In [16]:
# Import SVM model
from sklearn.svm import SVC

# Train the SVM classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred_svm = svm_classifier.predict(X_test)

# Calculate accuracy and classification report for SVM
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print("\nSVM Model Accuracy:", svm_accuracy)
print("Classification Report for SVM:\n", classification_report(y_test, y_pred_svm))


SVM Model Accuracy: 0.9393939393939394
Classification Report for SVM:
               precision    recall  f1-score   support

       anger       0.92      0.94      0.93       392
        fear       0.97      0.92      0.94       416
         joy       0.93      0.96      0.94       380

    accuracy                           0.94      1188
   macro avg       0.94      0.94      0.94      1188
weighted avg       0.94      0.94      0.94      1188



In [18]:
# Display the model comparison
print("\nModel Comparison:")

# Print the accuracy of both models
print(f"Naive Bayes Accuracy: {nb_accuracy:.4f}")
print(f"SVM Accuracy: {svm_accuracy:.4f}")

# Compare F1-Scores from the classification reports
print("\nBased on the classification reports, compare precision, recall, and F1-scores to determine the better performing model for emotion classification.")


Model Comparison:
Naive Bayes Accuracy: 0.9049
SVM Accuracy: 0.9394

Based on the classification reports, compare precision, recall, and F1-scores to determine the better performing model for emotion classification.


In [19]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
# Evaluate Naive Bayes
nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_f1 = f1_score(y_test, y_pred_nb, average='weighted')
nb_precision = precision_score(y_test, y_pred_nb, average='weighted')
nb_recall = recall_score(y_test, y_pred_nb, average='weighted')
# Evaluate SVM
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_f1 = f1_score(y_test, y_pred_svm, average='weighted')
svm_precision = precision_score(y_test, y_pred_svm, average='weighted')
svm_recall = recall_score(y_test, y_pred_svm, average='weighted')
# Print results
print(f"Naive Bayes - Accuracy: {nb_accuracy}, F1-Score: {nb_f1}, Precision: {nb_precision}, Recall: {nb_recall}")
print(f"SVM - Accuracy: {svm_accuracy}, F1-Score: {svm_f1}, Precision: {svm_precision}, Recall: {svm_recall}")

Naive Bayes - Accuracy: 0.9048821548821548, F1-Score: 0.9047784694869365, Precision: 0.9062908560138424, Recall: 0.9048821548821548
SVM - Accuracy: 0.9393939393939394, F1-Score: 0.939438003326165, Precision: 0.940085210122097, Recall: 0.9393939393939394
