In [20]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

# Load the dataset once, from the shared Google Drive link, so the notebook
# runs on any machine. `df` was previously read from a machine-specific
# absolute path (C:\Users\ASUS\Downloads\nlp_dataset.csv), which breaks
# Restart & Run All elsewhere and meant the frame displayed below was not the
# frame the later cells processed.
# NOTE(review): assumes the Drive file is the same nlp_dataset.csv - confirm.
url = 'https://drive.google.com/uc?id=1HWczIICsMpaL8EJyu48ZvRFcXx3_pcnb'
data = pd.read_csv(url)

# Later cells overwrite df['Comment'] in place; working on a copy keeps the
# raw `data` frame intact for inspection.
df = data.copy()

# Display the first few rows of the dataset
data.head()

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [22]:
# Initialize tools for preprocessing
# The stop-word list and the WordNet lemmatizer read corpora that are not
# bundled with the nltk package; without them a fresh environment raises
# LookupError. nltk.download() skips the fetch when the data is already
# present and up to date.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
# NOTE(review): some NLTK releases also need 'omw-1.4' for WordNet - drop this
# line if the installed version does not require it.
nltk.download('omw-1.4', quiet=True)

lemmatizer = WordNetLemmatizer()
# A set gives O(1) membership checks inside preprocess_text.
stop_words = set(stopwords.words('english'))

# Function to clean and preprocess the text
def preprocess_text(text):
    """Normalize one comment into a space-joined string of lemmatized tokens.

    Steps: lowercase, strip every character that is not a-z or whitespace,
    split on whitespace, drop tokens found in the module-level ``stop_words``
    set, and lemmatize the rest with the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw comment. Any non-string value (e.g. the float NaN pandas uses for
        a missing cell, or None) is treated as an empty comment.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` when nothing
        survives cleaning or the input is not a string.
    """
    # Missing cells reach this function as NaN/None when it is applied to a
    # DataFrame column; calling .lower() on them would raise AttributeError.
    if not isinstance(text, str):
        return ""

    # Lowercase the text
    text = text.lower()
    # Remove special characters and digits
    # NOTE(review): apostrophes are stripped here, before the stop-word check,
    # so a contraction such as "don't" becomes "dont" and may no longer match
    # the stop-word list - confirm this is acceptable for the dataset.
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenization (split the text into words)
    words = text.split()
    # Remove stopwords and lemmatize each word
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

# Clean every entry of the 'Comment' column with preprocess_text
df['Comment'] = df['Comment'].map(preprocess_text)

# Features are the cleaned comments, labels are the emotions; hold out 20% of
# the rows for testing, with a fixed seed so the split is reproducible
X, y = df['Comment'], df['Emotion']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
Explanation of Techniques:
Text Cleaning: Lowercasing the text and removing unwanted characters (punctuation, digits, symbols) keeps the data consistent and uniform for analysis.
Tokenization: Breaks the text into individual words that can be processed by machine learning models.
Stopwords Removal: Removing very common words (such as "the", "is", "and") helps the model focus on the meaningful parts of the text.
Lemmatization: Reduces words to their base (dictionary) form, which lowers dimensionality and ensures that different inflected forms of the same word (e.g. "feelings" and "feeling") are treated as a single feature.

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with default settings
vectorizer = TfidfVectorizer()

# Fit on the training comments only, then encode them; the test comments are
# encoded with that same fitted vectorizer and are never used for fitting
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
Feature Extraction Explanation:
CountVectorizer takes the text and converts it into a matrix, where each word is represented by how many times it appears in each document. However, it only counts the frequency of words without considering their importance.
TfidfVectorizer takes this a step further by assigning a weight to each word. This weight reflects not only how often a word appears in a particular document but also how rare or common it is across all documents. By doing this, it highlights words that are more unique and meaningful for classification, helping the model focus on the most important terms.

In [26]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Fit both classifiers on the TF-IDF training matrix (fit() returns the
# estimator itself, so each model can be created and trained in one line)
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
svm_model = SVC(kernel='linear').fit(X_train_tfidf, y_train)  # Linear kernel works well for text data

# Score the held-out test matrix with each model
nb_predictions = nb_model.predict(X_test_tfidf)
svm_predictions = svm_model.predict(X_test_tfidf)

# Print one per-class precision/recall/F1 report per model
for report_title, predicted_labels in (
    ("Naive Bayes Classification Report:\n", nb_predictions),
    ("SVM Classification Report:\n", svm_predictions),
):
    print(report_title, classification_report(y_test, predicted_labels))

Naive Bayes Classification Report:
               precision    recall  f1-score   support

       anger       0.88      0.92      0.90       392
        fear       0.91      0.91      0.91       416
         joy       0.92      0.88      0.90       380

    accuracy                           0.90      1188
   macro avg       0.91      0.90      0.90      1188
weighted avg       0.91      0.90      0.90      1188

SVM Classification Report:
               precision    recall  f1-score   support

       anger       0.92      0.96      0.94       392
        fear       0.97      0.92      0.94       416
         joy       0.94      0.96      0.95       380

    accuracy                           0.94      1188
   macro avg       0.94      0.95      0.94      1188
weighted avg       0.95      0.94      0.94      1188



In [None]:
Model Development Explanation
Naive Bayes: This model is straightforward and quick, making it especially useful when dealing with text data that is sparse (each document contains only a small fraction of the vocabulary, so most entries of the feature matrix are zero) and when the words in the text can be treated as independent of each other given the class.
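SVM: Support Vector Machines work well with high-dimensional data, which is common in text. With a non-linear kernel, an SVM is particularly good when the data can’t be separated by a simple, straight line but can be split in a higher-dimensional space, which allows it to handle more intricate patterns. In this notebook a linear kernel is used, because TF-IDF features are already high-dimensional and a linear decision boundary is usually sufficient for text classification.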