In [2]:
import pandas as pd

# Load the dataset from your local machine (replace with actual file path)
dataset = pd.read_csv(r"C:\Users\shamir\nlp_dataset.csv")

# Display the first few rows of the dataset
print(dataset.head())


                                             Comment Emotion
0  i seriously hate one subject to death but now ...    fear
1                 im so full of life i feel appalled   anger
2  i sit here to write i start to dig out my feel...    fear
3  ive been really angry with r and i feel like a...     joy
4  i feel suspicious if there is no one outside l...    fear


In [11]:
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split


In [14]:

# Check the first few rows of the dataset
print(dataset.head())

# Clean the text (removing special characters, lowering case)
def clean_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove non-alphanumeric characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.strip().lower()  # Strip and convert to lowercase
    return text

# Apply text cleaning function
dataset['cleaned_text'] = dataset['Comment'].apply(clean_text)

# Tokenization and stopword removal
stop_words = set(stopwords.words('english'))

def tokenize_and_remove_stopwords(text):
    tokens = text.split()  # Tokenize text
    return [word for word in tokens if word not in stop_words]

dataset['tokens'] = dataset['cleaned_text'].apply(tokenize_and_remove_stopwords)

# Check the processed data
print(dataset[['Comment', 'cleaned_text', 'tokens']].head())


                                             Comment Emotion  \
0  i seriously hate one subject to death but now ...    fear   
1                 im so full of life i feel appalled   anger   
2  i sit here to write i start to dig out my feel...    fear   
3  ive been really angry with r and i feel like a...     joy   
4  i feel suspicious if there is no one outside l...    fear   

                                        cleaned_text  \
0  i seriously hate one subject to death but now ...   
1                 im so full of life i feel appalled   
2  i sit here to write i start to dig out my feel...   
3  ive been really angry with r and i feel like a...   
4  i feel suspicious if there is no one outside l...   

                                              tokens  
0  [seriously, hate, one, subject, death, feel, r...  
1                   [im, full, life, feel, appalled]  
2  [sit, write, start, dig, feelings, think, afra...  
3  [ive, really, angry, r, feel, like, idiot, tru...  
4  

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Transform the cleaned text data into numerical features
X = tfidf_vectorizer.fit_transform(dataset['cleaned_text'])

# Check the shape of the resulting matrix (features)
print(X.shape)


(5937, 5000)


In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Define target variable (emotions)
y = dataset['Emotion']  # Ensure that 'label' is the column containing emotion labels

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Naive Bayes Model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# SVM Model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)


In [18]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Make predictions using Naive Bayes
nb_predictions = nb_model.predict(X_test)
print("Naive Bayes - Accuracy:", accuracy_score(y_test, nb_predictions))
print("Naive Bayes - F1 Score:", f1_score(y_test, nb_predictions, average='weighted'))

# Make predictions using SVM
svm_predictions = svm_model.predict(X_test)
print("SVM - Accuracy:", accuracy_score(y_test, svm_predictions))
print("SVM - F1 Score:", f1_score(y_test, svm_predictions, average='weighted'))

# Print Classification Report for both models
print("\nNaive Bayes Classification Report:\n", classification_report(y_test, nb_predictions))
print("\nSVM Classification Report:\n", classification_report(y_test, svm_predictions))


Naive Bayes - Accuracy: 0.9031986531986532
Naive Bayes - F1 Score: 0.9031322728364976
SVM - Accuracy: 0.9377104377104377
SVM - F1 Score: 0.9377871769009148

Naive Bayes Classification Report:
               precision    recall  f1-score   support

       anger       0.87      0.94      0.91       392
        fear       0.92      0.89      0.91       416
         joy       0.92      0.87      0.90       380

    accuracy                           0.90      1188
   macro avg       0.90      0.90      0.90      1188
weighted avg       0.90      0.90      0.90      1188


SVM Classification Report:
               precision    recall  f1-score   support

       anger       0.92      0.94      0.93       392
        fear       0.97      0.92      0.94       416
         joy       0.93      0.95      0.94       380

    accuracy                           0.94      1188
   macro avg       0.94      0.94      0.94      1188
weighted avg       0.94      0.94      0.94      1188

