In [21]:
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib


# Download NLTK resources (only needed once)
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Darshan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Darshan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Darshan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Darshan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# Load the dataset
df_test = pd.read_csv('emotions\\test.csv')
df_train = pd.read_csv('emotions\\training.csv') 
df_validation = pd.read_csv('emotions\\validation.csv') 

In [9]:
# checking the shape (# of rows and columns) of the datasets
print('DF_test Shape: ', df_test.shape)
print('DF_train Shape: ', df_train.shape)
print('DF_validation Shape: ', df_validation.shape)

DF_test Shape:  (2000, 2)
DF_train Shape:  (16000, 2)
DF_validation Shape:  (2000, 2)


In [10]:
# checking the info (columns, datatypes, nulls) of the datasets
print(' <<< DATASET 1 df_test -----------------------------------------------------------')
print(df_test.info())
print(' <<< DATASET 2 df_train-----------------------------------------------------------')
print(df_train.info())
print(' <<< DATASET 3 df_validation-----------------------------------------------------------')
print(df_validation.info())

 <<< DATASET 1 df_test -----------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2000 non-null   object
 1   label   2000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 31.4+ KB
None
 <<< DATASET 2 df_train-----------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16000 non-null  object
 1   label   16000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 250.1+ KB
None
 <<< DATASET 3 df_validation-----------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Cou

In [11]:
df_train['text']

0                                  i didnt feel humiliated
1        i can go from feeling so hopeless to so damned...
2         im grabbing a minute to post i feel greedy wrong
3        i am ever feeling nostalgic about the fireplac...
4                                     i am feeling grouchy
                               ...                        
15995    i just had a very brief time in the beanbag an...
15996    i am now turning and i feel pathetic that i am...
15997                       i feel strong and good overall
15998    i feel like this was such a rude comment and i...
15999    i know a lot but i feel so stupid because i ca...
Name: text, Length: 16000, dtype: object

In [12]:
# Preprocess comments
def preprocess_comment(comment):
    if isinstance(comment, str):
        comment = comment.lower()
        comment = re.sub(r'http\S+', '', comment)
        comment = re.sub(r'[^a-zA-Z\s]', '', comment)
        tokens = word_tokenize(comment)
        stop_words = set(stopwords.words('english'))
        filtered_tokens = [word for word in tokens if word not in stop_words]
        lemmatizer = WordNetLemmatizer()
        lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
        cleaned_comment = ' '.join(lemmatized_tokens)
    else:
        cleaned_comment = ''
    return cleaned_comment


In [14]:
df_train['cleaned_comments'] = df_train['text'].apply(preprocess_comment)
df_test['cleaned_comments'] = df_test['text'].apply(preprocess_comment)
df_validation['cleaned_comments'] = df_validation['text'].apply(preprocess_comment)

In [15]:
print('DF_test : ', df_test.head())
print('DF_train : ', df_train.head())
print('DF_validation : ', df_validation.head())

DF_test :                                                  text  label  \
0  im feeling rather rotten so im not very ambiti...      0   
1          im updating my blog because i feel shitty      0   
2  i never make her separate from me because i do...      0   
3  i left with my bouquet of red and yellow tulip...      1   
4    i was feeling a little vain when i did this one      0   

                                    cleaned_comments  
0        im feeling rather rotten im ambitious right  
1                       im updating blog feel shitty  
2    never make separate ever want feel like ashamed  
3  left bouquet red yellow tulip arm feeling slig...  
4                            feeling little vain one  
DF_train :                                                  text  label  \
0                            i didnt feel humiliated      0   
1  i can go from feeling so hopeless to so damned...      0   
2   im grabbing a minute to post i feel greedy wrong      3   
3  i am ever fee

In [16]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust the max_features as needed
X_train = tfidf_vectorizer.fit_transform(df_train['text'])
X_val = tfidf_vectorizer.transform(df_validation['text'])
X_test = tfidf_vectorizer.transform(df_test['text'])
y_train = df_train['label']
y_val = df_validation['label']
y_test = df_test['label']

In [19]:
# Train the sentiment analysis model (Logistic Regression)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [22]:
# Evaluate the model on the validation set
y_val_pred = model.predict(X_val)
validation_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {validation_accuracy:.2f}')


Validation Accuracy: 0.86


In [23]:
# Save the trained model and vectorizer for later use
joblib.dump(model, 'sentiment_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']