<a href="https://colab.research.google.com/github/Ankur7470/nlp-assignment2/blob/main/NLP_assgn2_task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [207]:
!pip install gensim



## Importing the necessary libraries

In [208]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
import torch
from torch.utils.data import Dataset, DataLoader

## EDA

### Loading the Sentiment Analysis Dataset

In [209]:
# Load the raw dataset; the relative path means the CSV must sit in the
# notebook's current working directory.
sentiment_data = pd.read_csv("sentimentdataset.csv")

### Information about the Dataset

In [210]:
# Preview the first rows of the raw data.
sentiment_data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,2,2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19


In [211]:
# Column names, non-null counts and dtypes of the raw data.
sentiment_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0.1  732 non-null    int64  
 1   Unnamed: 0    732 non-null    int64  
 2   Text          732 non-null    object 
 3   Sentiment     732 non-null    object 
 4   Timestamp     732 non-null    object 
 5   User          732 non-null    object 
 6   Platform      732 non-null    object 
 7   Hashtags      732 non-null    object 
 8   Retweets      732 non-null    float64
 9   Likes         732 non-null    float64
 10  Country       732 non-null    object 
 11  Year          732 non-null    int64  
 12  Month         732 non-null    int64  
 13  Day           732 non-null    int64  
 14  Hour          732 non-null    int64  
dtypes: float64(2), int64(6), object(7)
memory usage: 85.9+ KB


In [212]:
# Keep only the two columns used downstream: the raw text and its label.
# Selecting them by name (instead of dropping the other thirteen) keeps this
# cell re-runnable: a second run no longer raises KeyError for columns that
# were already removed. .copy() yields an independent frame so that later
# column assignments do not trigger SettingWithCopyWarning.
sentiment_data = sentiment_data[['Text', 'Sentiment']].copy()

In [213]:
# Count distinct raw label strings; labels that differ only in surrounding
# whitespace are still counted separately at this point.
print("\nNo. of Unique Sentiments:", sentiment_data['Sentiment'].nunique())


No. of Unique Sentiments: 279


In [214]:
# List the distinct raw label strings to inspect their formatting.
sentiment_data['Sentiment'].unique()

array([' Positive  ', ' Negative  ', ' Neutral   ', ' Anger        ',
       ' Fear         ', ' Sadness      ', ' Disgust      ',
       ' Happiness    ', ' Joy          ', ' Love         ',
       ' Amusement    ', ' Enjoyment    ', ' Admiration   ',
       ' Affection    ', ' Awe          ', ' Disappointed ',
       ' Surprise     ', ' Acceptance   ', ' Adoration    ',
       ' Anticipation ', ' Bitter       ', ' Calmness     ',
       ' Confusion    ', ' Excitement   ', ' Kind         ',
       ' Pride        ', ' Shame        ', ' Confusion ', ' Excitement ',
       ' Shame ', ' Elation       ', ' Euphoria      ', ' Contentment   ',
       ' Serenity      ', ' Gratitude     ', ' Hope          ',
       ' Empowerment   ', ' Compassion    ', ' Tenderness    ',
       ' Arousal       ', ' Enthusiasm    ', ' Fulfillment  ',
       ' Reverence     ', ' Compassion', ' Fulfillment   ', ' Reverence ',
       ' Elation   ', ' Despair         ', ' Grief           ',
       ' Loneliness     

In [215]:
# Strip leading and trailing whitespace from each sentiment label so that
# differently padded spellings of the same label collapse into one class.
# The vectorised .str accessor replaces the Python-level loop and passes
# missing values through as NaN instead of raising AttributeError.
sentiment_data['Sentiment'] = sentiment_data['Sentiment'].str.strip()

In [234]:
# Re-count the distinct labels now that surrounding whitespace has been removed.
print("\nNo. of Unique Sentiments after stripping the unwanted spaces from sentiment labels :", sentiment_data['Sentiment'].nunique())


No. of Unique Sentiments after stripping the unwanted spaces from sentiment labels : 191


In [235]:
# Number of rows per sentiment label, most frequent first.
sentiment_counts = sentiment_data['Sentiment'].value_counts()
print("Sentiment Counts:\n")
# Display only the ten most frequent labels.
sentiment_counts.head(10)

Sentiment Counts:



Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
Positive,45
Joy,44
Excitement,37
Contentment,19
Neutral,18
Gratitude,18
Curiosity,16
Serenity,15
Happy,14
Nostalgia,11


## Data Preprocessing

### Encoding the sentiment labels

In [236]:
# Fit the encoder on the cleaned labels, then map every label string to its
# integer class id. The fitted encoder is kept so ids can be decoded later.
label_encoder = LabelEncoder().fit(sentiment_data['Sentiment'])
sentiment_data['Encoded Sentiment'] = label_encoder.transform(sentiment_data['Sentiment'])

### Applying Preprocessing to all the texts

In [219]:
# Fetch every NLTK resource the preprocessing step relies on, so the notebook
# works on a fresh kernel.
nltk.download('stopwords')   # stop-word lists used for filtering
nltk.download('punkt_tab')   # tokenizer data used by word_tokenize
nltk.download('wordnet')     # lexical database required by WordNetLemmatizer

def preprocess_text(text):
    """Clean one raw text and return its lemmatized tokens.

    Steps, in order: delete every character that is not an ASCII letter or
    whitespace, lower-case the result, tokenize it with NLTK, drop English
    stop words, and lemmatize each remaining token with WordNet.

    Note that removed characters are deleted rather than replaced by a space,
    so hyphenated words are fused (e.g. "must-watch" becomes "mustwatch").

    Args:
        text (str): Raw input text.

    Returns:
        list[str]: The processed tokens; empty if nothing survives filtering.
    """
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    tokens = word_tokenize(text)
    # Read the stop-word list once per call and keep it in a set; the previous
    # version re-read the whole list for every single token.
    english_stopwords = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in english_stopwords]
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]

# Run the preprocessing pipeline on every text; each cell of the new column
# holds that text's list of tokens.
sentiment_data['Tokenized Text'] = sentiment_data['Text'].map(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


### Dataset after Preprocessing/Tokenization of texts and Label Encoding of Sentiments

In [237]:
# Inspect the first ten rows with the encoded label and token list added.
sentiment_data.head(10)

Unnamed: 0,Text,Sentiment,Encoded Sentiment,Tokenized Text
0,Enjoying a beautiful day at the park! ...,Positive,146,"[enjoying, beautiful, day, park]"
1,Traffic was terrible this morning. ...,Negative,134,"[traffic, terrible, morning]"
2,Just finished an amazing workout! 💪 ...,Positive,146,"[finished, amazing, workout]"
3,Excited about the upcoming weekend getaway! ...,Positive,146,"[excited, upcoming, weekend, getaway]"
4,Trying out a new recipe for dinner tonight. ...,Neutral,135,"[trying, new, recipe, dinner, tonight]"
5,Feeling grateful for the little things in lif...,Positive,146,"[feeling, grateful, little, thing, life]"
6,Rainy days call for cozy blankets and hot coc...,Positive,146,"[rainy, day, call, cozy, blanket, hot, cocoa]"
7,The new movie release is a must-watch! ...,Positive,146,"[new, movie, release, mustwatch]"
8,Political discussions heating up on the timel...,Negative,134,"[political, discussion, heating, timeline]"
9,Missing summer vibes and beach days. ...,Neutral,135,"[missing, summer, vibe, beach, day]"


### Extracting the Relevant Columns

In [238]:
# Pull the model inputs out of the DataFrame as NumPy arrays:
# an object array of token lists and an integer array of class ids.
tokenized_texts = sentiment_data['Tokenized Text'].to_numpy()
labels = sentiment_data['Encoded Sentiment'].to_numpy()

### Word2Vec Training & Text Vectorization

#### Train custom Word2Vec model

In [239]:
# Create the model WITHOUT a corpus so the constructor does not train it.
# Passing `sentences=` to the constructor already runs a full training pass,
# so the explicit train() call below used to train the model a second time.
# min_count=2 drops words that occur only once in the corpus.
# NOTE(review): results vary between runs with workers > 1 and no fixed seed.
word2vec_model = Word2Vec(vector_size=100, window=5, min_count=2, workers=4)
word2vec_model.build_vocab(corpus_iterable=tokenized_texts)
# Single, explicit training run of 10 epochs over the tokenized corpus.
word2vec_model.train(tokenized_texts, total_examples=word2vec_model.corpus_count, epochs=10)



(43772, 59130)

#### Word2Vec vocabulary details

In [240]:
# Report how many words the model kept and show the first ten of them.
keyed_vectors = word2vec_model.wv
print("Word2Vec vocabulary size:", len(keyed_vectors))
print("Sample words in vocabulary:", list(keyed_vectors.index_to_key[:10]))

Word2Vec vocabulary size: 1027
Sample words in vocabulary: ['new', 'life', 'day', 'dream', 'joy', 'like', 'feeling', 'moment', 'friend', 'heart']


In [241]:
# Look up the learned embedding of a single vocabulary word as a sanity check.
word2vec_model.wv['dream']

array([-0.03776449,  0.06663589,  0.03107122,  0.00385397,  0.01218007,
       -0.08882301,  0.01114955,  0.1223098 , -0.01453634, -0.01174959,
       -0.02813914, -0.05946071, -0.00828109,  0.0052748 ,  0.01706089,
       -0.02933051,  0.00331704, -0.04709942, -0.00523312, -0.07258009,
        0.03416926,  0.02545074,  0.04295221, -0.00579173, -0.04702342,
        0.0152667 , -0.03629227, -0.06684706, -0.02573487,  0.01998695,
        0.0670318 ,  0.0137478 ,  0.01034959, -0.01098817, -0.01879746,
        0.05905052, -0.01082304, -0.06472714, -0.05223083, -0.09132703,
        0.01372566, -0.02833515,  0.00105779, -0.0117861 ,  0.04080939,
       -0.0026526 , -0.0309412 , -0.01402195,  0.01981399,  0.03195868,
        0.03573231, -0.04067149, -0.02924465,  0.00075637, -0.05676382,
        0.03763752,  0.03466801,  0.0071496 , -0.0693005 ,  0.0028235 ,
        0.02253941,  0.04051591, -0.02100565, -0.01081016, -0.05417923,
        0.01658891,  0.02915175,  0.0393888 , -0.06346095,  0.04

#### Converting the tokenized texts to Word vectors

In [242]:
# Token count of the longest text; used as the padded sequence length.
max_seq_length = max(map(len, tokenized_texts))

In [243]:
# Display the longest token count found above.
max_seq_length

16

In [244]:
def text_to_vectors(text, model, vector_size=100, max_length=max_seq_length):
    """Convert a token list into a fixed-size matrix of word vectors.

    Tokens that are not in the model's vocabulary are skipped. The remaining
    vectors are truncated or zero-padded along the first axis, so the result
    always has exactly ``max_length`` rows. A text with no known tokens
    yields an all-zero matrix.

    Args:
        text: Iterable of token strings.
        model: Trained model exposing its word vectors through ``model.wv``.
        vector_size: Dimensionality of each word vector; must match the model.
        max_length: Number of rows in the returned matrix. The default is
            bound to ``max_seq_length`` when this function is defined, so
            re-run this cell after changing that variable.

    Returns:
        np.ndarray: float32 array of shape ``(max_length, vector_size)``.
    """
    vectors = [model.wv[word] for word in text if word in model.wv]

    # Start from an all-zero matrix so that padding needs no extra step.
    padded = np.zeros((max_length, vector_size), dtype=np.float32)

    # Copy at most max_length vectors; longer texts are truncated so that
    # every result has the same shape and can be stacked into one array.
    n_rows = min(len(vectors), max_length)
    if n_rows > 0:
        padded[:n_rows] = np.asarray(vectors[:n_rows], dtype=np.float32)

    return padded

In [245]:
# Vectorize every tokenized text and stack the per-text matrices into a
# single float32 array.
vectorized_texts = np.array(
    [text_to_vectors(tokens, word2vec_model) for tokens in tokenized_texts],
    dtype=np.float32,
)

In [246]:
# Check the dimensions of the stacked array.
vectorized_texts.shape

(732, 16, 100)

In [247]:
# Show the first text before and after vectorization.
print("First Sentence in Tokenized form:\n", tokenized_texts[0])
print("\nFirst Sentence in Vectorized form:\n", vectorized_texts[0])  # Prints the whole padded matrix of the first sentence (NumPy abbreviates long arrays with "...")

First Sentence in Tokenized form:
 ['enjoying', 'beautiful', 'day', 'park']

First Sentence in Vectorized form:
 [[-0.00158684  0.01089114  0.00852786 ... -0.01536944  0.00256202
  -0.00788842]
 [-0.00456326  0.01580343 -0.00268339 ... -0.01523939  0.00028572
  -0.00591786]
 [-0.02704431  0.05234573  0.02247882 ... -0.05673891  0.02941835
  -0.03300359]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


### Preparing Data for Training

In [248]:
# Conventional feature / target names for the training code below.
X = vectorized_texts  # word-vector matrices, one per text
y = labels            # encoded sentiment ids, one per text

In [249]:
class SentimentDataset(Dataset):
    """Map-style dataset pairing feature arrays with integer class labels.

    Args:
        X: Array-like of features whose first axis indexes the samples;
            copied into a float32 tensor.
        y: Array-like of class ids, one per sample; copied into an int64
            tensor.

    Raises:
        ValueError: If ``X`` and ``y`` do not hold the same number of samples.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
        # Fail early on misaligned inputs; otherwise __len__ would follow X
        # only and the mismatch would surface later as an IndexError or as
        # silently unused labels.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

In [250]:
# Wrap the arrays in a Dataset, then serve them in shuffled mini-batches
# of 64 samples for training.
dataset = SentimentDataset(X, y)
batch_size = 64
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)