In [26]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset
import pandas as pd
import numpy as np
import torch.nn.functional as F
import torch.optim as optim


In [27]:
import kagglehub


# Download latest version
path = kagglehub.dataset_download("kritanjalijain/amazon-reviews")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/amazon-reviews


In [28]:
import os
print(os.listdir(path))

['amazon_review_polarity_csv.tgz', 'train.csv', 'test.csv']


In [30]:
print("Dataset Loading")
try:
  train = pd.read_csv(path+'/train.csv')[:80000]
  test = pd.read_csv(path+'/test.csv')[:20000]
  print("Dataset Loaded")
except FileNotFoundError:
  print("Dataset not found")

Dataset Loading
Dataset Loaded


In [31]:
column_names=['polarity','title','text']
train.columns=column_names

In [32]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [33]:
train.shape

(80000, 3)

# Removing Punctions from data

In [34]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [35]:
train[train['title'].isnull()]

Unnamed: 0,polarity,title,text
13264,1,,Couldn't get the device to work with my networ...
26553,1,,What separates this band from Evanescence (bes...
26826,2,,Falkenbach returns with more of the Viking/Fol...
36597,2,,I returned this because I received the same on...
37346,2,,This book is a great fantasy. I love this amaz...


In [36]:
train.drop(train[train['title'].isnull()].index, inplace = True)

In [37]:
import re
def remove_punctuations(text):
  if not isinstance(text, str):
    text = str(text).lower()
  return re.sub(r'[^\w\s]', '', text.lower())
vectorized_func = np.vectorize(remove_punctuations)


# Analysing The Trend of most used words

In [38]:

train['title'] = train['title'].apply(vectorized_func)

In [39]:
train['title']

Unnamed: 0,title
0,the best soundtrack ever to anything
1,amazing
2,excellent soundtrack
3,remember pull your jaw off the floor after hea...
4,an absolute masterpiece
...,...
79995,not nearly as good the second time
79996,someone has to die dramamystery
79997,plenty of graphic sex and violence
79998,pointless


In [40]:
train['text'] = train['text'].apply(vectorized_func)

In [41]:
x_train= train.iloc[:,2].to_numpy()
y_train = train.iloc[:,0].to_numpy()

x_test = test.iloc[:, 2].to_numpy()
y_test = test.iloc[:, 0].to_numpy()

In [42]:
x_train.shape

(79995,)

In [43]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [44]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [45]:
train['title'].iloc[2]

'excellent soundtrack'

In [46]:
train

Unnamed: 0,polarity,title,text
0,2,the best soundtrack ever to anything,im reading a lot of reviews saying that this i...
1,2,amazing,this soundtrack is my favorite music of all ti...
2,2,excellent soundtrack,i truly like this soundtrack and i enjoy video...
3,2,remember pull your jaw off the floor after hea...,if youve played the game you know how divine t...
4,2,an absolute masterpiece,i am quite sure any of you actually taking the...
...,...,...,...
79995,1,not nearly as good the second time,sharon stone displays the extent of her er tal...
79996,2,someone has to die dramamystery,basic instinct is a story of a killing that le...
79997,2,plenty of graphic sex and violence,basic instinct verhoeven has crafted a sleazy ...
79998,1,pointless,why release this on dvd without the extra foot...


# **AS WE KNOW THAT WORD2VEC IS UNSUPERVISED. WE NEED TO STOP THE TRAINING WHEN DELTA LOSS DIMINISHED AFTER CERTAIN ITERATION**

In [None]:
from nltk.stem import WordNetLemmatizer
import nltk
lemma = WordNetLemmatizer()
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [47]:
lemmatized_data = []
for i in train['title']:
  ss = i.split()
  a = []
  for j in ss:
    a.append(lemma.lemmatize(j))
  lemmatized_data.append(' '.join(a))

In [48]:
lemmatized_data

['the best soundtrack ever to anything',
 'amazing',
 'excellent soundtrack',
 'remember pull your jaw off the floor after hearing it',
 'an absolute masterpiece',
 'buyer beware',
 'glorious story',
 'a five star book',
 'whisper of the wicked saint',
 'the worst',
 'great book',
 'great read',
 'oh please',
 'awful beyond belief',
 'dont try to fool u with fake review',
 'a romantic zen baseball comedy',
 'fashionable compression stocking',
 'jobst ultrasheer thigh high',
 'size recomended in the size chart are not real',
 'men ultrasheer',
 'delicious cookie mix',
 'another abysmal digital copy',
 'a fascinating insight into the life of modern japanese teen',
 'i liked this album more then i thought i would',
 'problem with charging smaller aaa',
 'work but not a advertised',
 'disappointed',
 'oh dear',
 'based on the review here i bought one and im glad i did',
 'incorrect disc',
 'happy with itbut',
 'should be titled hollywood debacle',
 'is this great tv you bet it is',
 'nothi

In [56]:
!pip uninstall gensim numpy -y
!pip install gensim

Found existing installation: gensim 4.3.3
Uninstalling gensim-4.3.3:
  Successfully uninstalled gensim-4.3.3
Found existing installation: numpy 2.3.0
Uninstalling numpy-2.3.0:
  Successfully uninstalled numpy-2.3.0
Collecting gensim
  Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
Installing collected packages: numpy, gensim
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= "3.10", but you have scipy 1.13.1 which is

In [63]:
pip install numpy==1.26.3


Collecting numpy==1.26.3
  Downloading numpy-1.26.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m99.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, bu

Gensim is giving error in import in google colab but on local runtime it is running fine

In [64]:
import gensim.downloader as api

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [65]:
from gensim.models import Word2Vec


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [59]:
def get_google_word2vec_embeddings(data):
    # Load the Google News Word2Vec model
    model = api.load("word2vec-google-news-300")

    tokenized_sentences = [sentence.split() for sentence in data]
    embeddings = []

    for sentence in tokenized_sentences:
        sentence_embeddings = []
        for word in sentence:
            if word in model:
                sentence_embeddings.append(model[word])
        if sentence_embeddings:
            embeddings.append(np.mean(sentence_embeddings, axis=0))
        else:
            embeddings.append(np.zeros(300))

    return np.array(embeddings)

train_x = get_google_word2vec_embeddings(lemmatized_data)

NameError: name 'api' is not defined

In [None]:
# @title
# def word2vec_trained_from_scratch(data):
#     tokenized_sentences = [sentence.split() for sentence in data]
#     model = Word2Vec(tokenized_sentences, vector_size=300, window=5, min_count=1, workers=4)

#     embeddings = []
#     for sentence in tokenized_sentences:
#         word_vectors = [model.wv[word] for word in sentence if word in model.wv]
#         if word_vectors:
#             sentence_embedding = np.mean(word_vectors, axis=0)
#         else:
#             sentence_embedding = np.zeros(model.vector_size)
#         embeddings.append(sentence_embedding)

#     return np.array(embeddings)

# train_x = torch.tensor(word2vec_trained_from_scratch(lemmatized_data),dtype=torch.float32)

In [None]:
train_x.shape

torch.Size([10000, 300])

In [None]:
# Dataset and DataLoader
class ReviewDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = ReviewDataset(train_x,y_train)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# LSTM Model
class RNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNClassifier, self).__init__()
        self.RNN = nn.RNN(input_size, hidden_size)
        self.rnn2 = nn.RNN(hidden_size, 100)

        self.h2o = nn.Linear(100, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
    def forward(self, text):
        output, hidden = self.RNN(text)
        out, hid = self.rnn2(output)

        d= self.h2o(out)

        d = self.softmax(d)
        return d


vocab_size = 00
embed_dim = 300
hidden_dim = 32
num_classes = 16

model2 = RNNClassifier(embed_dim, hidden_dim, num_classes).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model2.parameters(), lr=0.1)

# Training Loop
for epoch in range(10):
    for batch_X, batch_y in dataloader:
        optimizer.zero_grad()
        outputs = model2(batch_X.to(device))

        loss = criterion(outputs, torch.tensor(batch_y, dtype=torch.long).to(device))
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")



  loss = criterion(outputs, torch.tensor(batch_y, dtype=torch.long))


Epoch 1, Loss: 1.0299
Epoch 2, Loss: 1.1594
Epoch 3, Loss: 0.6785
Epoch 4, Loss: 1.5967
Epoch 5, Loss: 0.7624
Epoch 6, Loss: 0.9336
Epoch 7, Loss: 1.3046
Epoch 8, Loss: 1.3732
Epoch 9, Loss: 0.7739
Epoch 10, Loss: 1.0269


In [None]:

# Dataset and DataLoader
class ReviewDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = ReviewDataset(train_x,y_train)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# LSTM Model
class LSTMClassifier(nn.Module):
    def __init__(self,  embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding( embed_dim, 16)
        self.lstm = nn.LSTM(16, hidden_dim, batch_first=True, dropout=0.3)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, dropout=0.3)
        self.ru1=nn.ReLU()
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.ru = nn.ReLU()
    def forward(self, x):
        # print(x)
        # x = self.embedding(x)
        _, (hidden, _) = self.lstm(self.embedding(x))
        _, (hidden2, _) = self.lstm2(hidden)
        out = self.fc(self.ru1(hidden2[-1]))
        return self.ru(out)

vocab_size = 00
embed_dim = 300
hidden_dim = 32
num_classes = 16

model = LSTMClassifier( embed_dim, hidden_dim, num_classes).to(device)
criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training Loop
for epoch in range(2):
    for batch_X, batch_y in dataloader:
        optimizer.zero_grad()
        outputs = model(torch.tensor(batch_X ,dtype=torch.long).to(device))

        loss = criterion(outputs, torch.tensor(batch_y, dtype=torch.float32).to(device))
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")



  outputs = model(torch.tensor(batch_X ,dtype=torch.long))
  loss = criterion(outputs, torch.tensor(batch_y, dtype=torch.float32))


Epoch 1, Loss: 0.3317
Epoch 2, Loss: 0.3330


In [None]:
lemmatized_test = []
for i in train['title']:
  ss = i.lower()
  ss = re.sub(r'[^\w\s]', '', ss)
  ss = ss.split()
  a = []
  for j in ss:
    a.append(lemma.lemmatize(j))
  lemmatized_test.append(' '.join(a))

In [None]:
test_x = torch.tensor(get_google_word2vec_embeddings(lemmatized_test), dtype=torch.float32)

In [None]:
y_test =torch.tensor(np.where(y_test == 1, 0,1))

In [None]:
dataset = ReviewDataset(test_x, y_test)
data = DataLoader(dataset, batch_size=16)

In [None]:
lstm_p = []

with torch.no_grad():
    for batch_X, batch_y in data:
        output = model(torch.tensor(batch_X,dtype=torch.long).to(device))

        lstm_p.append(output.squeeze(0))

  output = model(torch.tensor(batch_X,dtype=torch.long))


In [None]:
ss = 0
for i in range(625):
    for j in range(16):
        s = 1 if lstm_p[i][j] >0.4 else 0
        if (s == y_test[i*16+j]):
            ss += 1

In [None]:
print(f"Accuracy of LSTM is {ss/y_test.shape[0]}")

Accuracy of LSTM is 0.4903


In [None]:
rnn_p = []

with torch.no_grad():
    for batch_X, batch_y in data:
        output = model(torch.tensor(batch_X,dtype=torch.long).to(device))

        rnn_p.append(output.squeeze(0))

ss = 0
for i in range(625):
    for j in range(16):
        s = 1 if rnn_p[i][j] >0.4 else 0
        if (s == y_test[i*16+j]):
            ss += 1

print(f"Accuracy of RNN is {ss/y_test.shape[0]}")

  output = model(torch.tensor(batch_X,dtype=torch.long))


Accuracy of RNN is 0.4903
