<a href="https://colab.research.google.com/github/AkashKoley012/Deep-Learning-Projects/blob/main/Amazon%20Reviews%20for%20Sentiment%20Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Download the bittlingmayer/amazonreviews dataset archive with the Kaggle CLI
# (the recorded output shows it being saved as amazonreviews.zip under /content).
# NOTE(review): presumably needs Kaggle API credentials configured in the runtime - confirm.
!kaggle datasets download -d bittlingmayer/amazonreviews

Dataset URL: https://www.kaggle.com/datasets/bittlingmayer/amazonreviews
License(s): unknown
Downloading amazonreviews.zip to /content
100% 493M/493M [00:21<00:00, 23.3MB/s]
100% 493M/493M [00:21<00:00, 23.6MB/s]


In [2]:
import zipfile

# Unpack the downloaded archive into the current working directory.
# The context manager guarantees the archive handle is closed even when
# extraction raises (e.g. corrupt download, disk full).
with zipfile.ZipFile("amazonreviews.zip", 'r') as zip_ref:
    zip_ref.extractall()

In [3]:
import bz2
import gc
import re
from collections import Counter

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam

# Read Train & Test Files

In [4]:
# Open the bz2-compressed train and test files for (binary) reading.
train_file = bz2.BZ2File('/content/train.ft.txt.bz2', mode='r')
test_file = bz2.BZ2File('/content/test.ft.txt.bz2', mode='r')

# Create Lists containing Train & Test sentences

In [5]:
# Load every line of both files into memory; the elements are still raw bytes here.
train_file_lines, test_file_lines = train_file.readlines(), test_file.readlines()

# The file objects are not needed once their lines are in memory.
del train_file
del test_file

# Convert from raw binary strings to strings that can be parsed

In [6]:
# Replace each list of bytes lines with the corresponding UTF-8 decoded strings.
train_file_lines = [raw_line.decode('utf-8') for raw_line in train_file_lines]
test_file_lines = [raw_line.decode('utf-8') for raw_line in test_file_lines]

In [7]:
# Each line looks like "<label> <review text>\n".
# The first space-separated field is the label: '__label__1' -> 0, anything else -> 1.
train_labels = [0 if x.split(' ', 1)[0] == '__label__1' else 1 for x in train_file_lines]
# Everything after the first space is the review. Strip only a trailing newline
# (rather than blindly dropping the last character) so a final line without a
# newline keeps its last character, then lower-case the text.
train_sentences = [x.split(' ', 1)[1].rstrip('\n').lower() for x in train_file_lines]

for i, sentence in enumerate(train_sentences):
    # Collapse every digit to '0'. The pattern is a raw string: '\d' in a plain
    # string literal is an invalid escape sequence and triggers a warning.
    sentence = re.sub(r'\d', '0', sentence)
    # Only run the (more expensive) URL substitution on reviews that look like they contain a link.
    if 'www.' in sentence or 'http:' in sentence or 'https:' in sentence or '.com' in sentence:
        # Replace any space-free run that ends in ".xyz" (dot + three lowercase letters) with "<url>".
        sentence = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", sentence)
    train_sentences[i] = sentence

In [8]:
# Each line looks like "<label> <review text>\n".
# The first space-separated field is the label: '__label__1' -> 0, anything else -> 1.
test_labels = [0 if x.split(' ', 1)[0] == '__label__1' else 1 for x in test_file_lines]
# Everything after the first space is the review. Strip only a trailing newline
# (rather than blindly dropping the last character) so a final line without a
# newline keeps its last character, then lower-case the text.
test_sentences = [x.split(' ', 1)[1].rstrip('\n').lower() for x in test_file_lines]

for i, sentence in enumerate(test_sentences):
    # Collapse every digit to '0'. The pattern is a raw string: '\d' in a plain
    # string literal is an invalid escape sequence and triggers a warning.
    sentence = re.sub(r'\d', '0', sentence)
    # Only run the (more expensive) URL substitution on reviews that look like they contain a link.
    if 'www.' in sentence or 'http:' in sentence or 'https:' in sentence or '.com' in sentence:
        # Replace any space-free run that ends in ".xyz" (dot + three lowercase letters) with "<url>".
        sentence = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", sentence)
    test_sentences[i] = sentence

In [9]:
# Labels and cleaned sentences have been extracted; drop the raw decoded lines.
del train_file_lines, test_file_lines

In [10]:
# Force a garbage-collection pass after deleting the large line lists;
# the displayed value is the return value of gc.collect().
gc.collect()

30

In [11]:
# Text-preprocessing settings.
# NOTE(review): pad_sequence carries its own default of max_len=100, independent
# of `maxlen` - keep the two in sync if either changes.
max_features = 20_000
maxlen = 100

# Tokenize

In [12]:
def tokenize(text):
  """Split ``text`` into lower-case whitespace-separated tokens.

  Every '.' character is removed before splitting, so "u.s.a." becomes the
  single token "usa". No other punctuation is touched.

  Args:
    text: The string to tokenize.

  Returns:
    A list of token strings (empty for empty or whitespace-only input).
  """
  return text.lower().replace('.', '').split()

In [13]:
# Reserved ids. 0 is the value pad_sequence fills with, so it gets its own
# '<PAD>' entry instead of sharing an id with out-of-vocabulary words.
# tokenize() lower-cases its input, so no real token can collide with these
# upper-case names.
vocab = {'<PAD>': 0, '<UNK>': 1}

# Count token frequencies over the training reviews only.
token_counts = Counter()
for sentence in train_sentences:
  token_counts.update(tokenize(sentence))

# Keep only the most frequent tokens so the whole vocabulary (and the embedding
# table sized from len(vocab)) is capped at max_features entries. Everything
# else is mapped to '<UNK>' at vectorization time.
for word, _count in token_counts.most_common(max_features - len(vocab)):
  vocab[word] = len(vocab)

# The full frequency table can be large; it is not needed after selection.
del token_counts

In [14]:
def pad_sequence(seq, max_len=100):
  """Force a list of token ids to exactly ``max_len`` entries.

  Longer sequences are cut off after ``max_len`` items; shorter ones are
  right-padded with 0.

  Args:
    seq: List of token ids.
    max_len: Target length of the returned list.

  Returns:
    A new list of length ``max_len``.
  """
  clipped = seq[:max_len]
  shortfall = max(max_len - len(seq), 0)
  return clipped + [0] * shortfall

In [15]:
def vectorize(sentence):
  """Turn a raw sentence into a fixed-length list of vocabulary ids.

  The sentence is tokenized with ``tokenize``; each token is looked up in the
  module-level ``vocab`` and tokens that are missing fall back to the id stored
  under '<UNK>'. The id list is then passed through ``pad_sequence``.

  Args:
    sentence: The text to encode.

  Returns:
    The padded/truncated list of integer ids returned by ``pad_sequence``.
  """
  ids = [
    vocab[token] if token in vocab else vocab['<UNK>']
    for token in tokenize(sentence)
  ]
  return pad_sequence(ids)

In [16]:
# Pair each cleaned review with its label in a two-column frame per split.
train_df = pd.DataFrame({
  'sentence': train_sentences,
  'label': train_labels,
})
test_df = pd.DataFrame({
  'sentence': test_sentences,
  'label': test_labels,
})

In [17]:
# Eager vectorization of the whole 'sentence' column is left disabled:
# ReviewsDataset.__getitem__ calls vectorize() per item instead.
# train_df['sentence'] = train_df['sentence'].apply(vectorize)
# test_df['sentence'] = test_df['sentence'].apply(vectorize)

# Dataset & DataLoader

In [18]:
class ReviewsDataset(Dataset):
  """Map-style dataset that vectorizes review text lazily, one item at a time.

  Args:
    X: Indexable collection of review strings (list, array or pandas Series).
    y: Indexable collection of integer class labels, aligned with ``X``.
  """

  def __init__(self, X, y):
    self.X = X
    self.y = y

  def __len__(self):
    return len(self.y)

  @staticmethod
  def _at(values, idx):
    """Return the element at *position* ``idx``.

    pandas objects are read through ``.iloc`` because plain ``series[idx]`` is
    label-based: it raises or returns the wrong row as soon as the index is
    not 0..n-1 (for example after a train/test split or a filter).
    """
    if hasattr(values, 'iloc'):
      return values.iloc[idx]
    return values[idx]

  def __getitem__(self, idx):
    """Return ``(feature, label)`` tensors for the sample at position ``idx``.

    ``feature`` is the ``torch.long`` tensor built from ``vectorize(text)``;
    ``label`` is a scalar ``torch.long`` tensor.
    """
    feature = torch.tensor(vectorize(self._at(self.X, idx)), dtype=torch.long)
    label = torch.tensor(self._at(self.y, idx), dtype=torch.long)
    return feature, label

In [19]:
# Wrap the sentence/label columns of each frame in a ReviewsDataset.
train_dataset = ReviewsDataset(X=train_df['sentence'], y=train_df['label'])
test_dataset = ReviewsDataset(X=test_df['sentence'], y=test_df['label'])

In [20]:
# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# Evaluation does not benefit from shuffling: the accuracy is order-independent,
# so keep a fixed order for reproducible, cheaper iteration.
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Model

In [21]:
class RNN(nn.Module):
  """Embedding -> single-layer RNN -> linear classifier.

  Args:
    vocab_size: Number of rows in the embedding table.
    embedding_dim: Size of each token embedding (default 50).
    hidden_dim: Size of the RNN hidden state (default 64).
    num_classes: Number of output logits (default 2).
  """

  def __init__(self, vocab_size, embedding_dim=50, hidden_dim=64, num_classes=2):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
    self.fc = nn.Linear(hidden_dim, num_classes)

  def forward(self, x):
    """Return class logits for a batch of token-id sequences.

    Args:
      x: Integer tensor of token ids, batch dimension first.

    Returns:
      Tensor of logits with ``num_classes`` entries per sequence.
    """
    x = self.embedding(x)
    _, hidden = self.rnn(x)
    # `hidden` holds the final hidden state with the layer dimension first;
    # indexing the last layer is equivalent to squeeze(0) for this
    # single-layer RNN and does not rely on that dimension being 1.
    return self.fc(hidden[-1])

# Train & Evaluation

In [22]:
# Optimisation settings: `lr` is passed to Adam, `epochs` bounds the training loop.
lr = 1e-3
epochs = 10

In [23]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# The embedding table gets one row per vocabulary entry.
model = RNN(len(vocab))
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=lr)
print(device)

cuda


In [24]:
for epoch in range(epochs):
  # Explicitly (re-)enter training mode: an evaluation cell that called
  # model.eval() may have run before this cell is executed again.
  model.train()
  total_loss = 0
  for feature, label in train_dataloader:
    feature, label = feature.to(device), label.to(device)
    optimizer.zero_grad()
    output = model(feature)
    loss = criterion(output, label)
    loss.backward()
    optimizer.step()
    # .item() detaches the scalar so the autograd graph is not kept alive.
    total_loss += loss.item()
  # Report the mean per-batch loss for this epoch.
  print(f'Epoch: {epoch+1}, Loss: {total_loss/len(train_dataloader)}')

KeyboardInterrupt: 

In [None]:
# Accuracy on the test split.
model.eval()

total, correct = 0, 0

# Gradients are not needed for evaluation.
with torch.no_grad():
  for feature, label in test_dataloader:
    feature = feature.to(device)
    label = label.to(device)
    output = model(feature)
    # Predicted class = index of the largest logit in each row.
    pred = torch.max(output, 1).indices
    total += label.shape[0]
    correct += pred.eq(label).sum().item()

print(f'Accuracy: {correct/total}')

In [None]:
# Accuracy on the training split (same procedure as for the test split).
model.eval()

total, correct = 0, 0

# Gradients are not needed for evaluation.
with torch.no_grad():
  for feature, label in train_dataloader:
    feature = feature.to(device)
    label = label.to(device)
    output = model(feature)
    # Predicted class = index of the largest logit in each row.
    pred = torch.max(output, 1).indices
    total += label.shape[0]
    correct += pred.eq(label).sum().item()

print(f'Accuracy: {correct/total}')