<a href="https://colab.research.google.com/github/Arun-nexus/deep_learning/blob/main/pytorch_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; a later cell copies kaggle.json
# from /content/drive/MyDrive.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# Open Colab's file-upload prompt.
# NOTE(review): no visible cell reads the uploaded files (the Kaggle token is
# copied from Drive instead) — confirm whether this cell is still needed.
from google.colab import files
files.upload()

In [None]:
# Install the Kaggle API token: create ~/.kaggle, copy kaggle.json from the
# mounted Drive, and restrict the file to owner read/write (mode 600).
!mkdir -p ~/.kaggle
!cp "/content/drive/MyDrive/kaggle.json" ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the pawankumargunjan/imdb-review dataset with the Kaggle CLI.
# The following cell opens it as imdb-review.zip from the working directory.
!kaggle datasets download -d pawankumargunjan/imdb-review

In [None]:
import zipfile

# Unpack the downloaded archive into ./imdb.
# The handle is named `archive` (not `zip`) so the builtin zip() is not
# shadowed for the rest of the kernel session.
with zipfile.ZipFile("imdb-review.zip","r") as archive:
  archive.extractall("imdb")

In [None]:
import os

In [None]:
import os
import pandas as pd

test_dir="/content/imdb/aclImdb/test"
train_dir="/content/imdb/aclImdb/train"


def _read_split(split_dir):
    """Read every review file under ``split_dir/pos`` and ``split_dir/neg``.

    Args:
        split_dir: directory containing the ``pos`` and ``neg`` sub-directories.

    Returns:
        A list of ``{"reviews": <file text>, "label": "pos" | "neg"}`` dicts,
        all "pos" entries first, then all "neg" entries.

    Raises:
        FileNotFoundError: if a sentiment sub-directory does not exist.
    """
    records = []
    for label in ("pos", "neg"):
        label_dir = os.path.join(split_dir, label)
        # os.listdir() order is arbitrary; sorting keeps the row order (and
        # therefore any later subsetting) reproducible across runs.
        for filename in sorted(os.listdir(label_dir)):
            filepath = os.path.join(label_dir, filename)
            if os.path.isfile(filepath):
                with open(filepath, "r", encoding="utf-8") as f:
                    records.append({"reviews": f.read(), "label": label})
    return records


# Same concatenation order as before: test split first, then train split.
data = _read_split(test_dir) + _read_split(train_dir)

df = pd.DataFrame(data)

In [None]:
# Keep 10,000 reviews, drawn at random instead of taken from the top.
# Rows were appended directory by directory (test/pos, test/neg, train/pos,
# train/neg), so a head slice is dominated by the first directory read; with
# the standard aclImdb layout (12,500 files per class directory — TODO confirm
# for this copy) it would contain only "pos" labels.
# The fixed seed keeps the subset reproducible; reset_index gives a clean
# 0..n-1 index for the cells that follow.
df = df.sample(n=min(10000, len(df)), random_state=42).reset_index(drop=True)
df

In [None]:
# Fetch the "wordnet" resource through the NLTK downloader and import the
# lemmatizer class that is instantiated in a later cell.
import nltk
nltk.download("wordnet")
from nltk import WordNetLemmatizer

In [None]:
lematizer=WordNetLemmatizer()
def lemmatizer(review):
  """Lemmatize one review and return its tokens as a list.

  Args:
    review: raw review text (split on whitespace), or an already tokenized
      list of words. Accepting a list lets this cell be re-run after the
      column has been converted without failing on ``.split()``.

  Returns:
    list of lemmatized tokens, in their original order.
  """
  tokens = review.split() if isinstance(review, str) else review
  return [lematizer.lemmatize(word) for word in tokens]

# assign() returns a new frame rather than writing a column into `df` in place,
# so this is safe even when `df` is a slice of a larger frame
# (no SettingWithCopyWarning).
df = df.assign(reviews=df["reviews"].apply(lemmatizer))

In [None]:
# Glue each review's token list back into one space-separated string,
# the input format the vectorizer is fitted on.
review_list = [" ".join(tokens) for tokens in df["reviews"]]

In [None]:
# Preview the first ten joined review strings.
review_list[:10]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fit a TF-IDF vectorizer (default settings) on the joined review strings and
# keep the resulting feature matrix in `review_vectors`.
vectorize = TfidfVectorizer()
review_vectors = vectorize.fit_transform(review_list)

In [None]:
# Second name bound to the same object as `review_vectors` (no copy is made).
# NOTE(review): `review_vector` is not referenced by any other visible cell.
review_vector=review_vectors

In [None]:
import torch

# Cast to float32 while the matrix is still sparse, then densify with
# toarray(): this gives a plain ndarray (todense() returns np.matrix) and
# halves the size of the dense float64 matrix.
review_dense = review_vectors.astype("float32").toarray()
# from_numpy wraps the existing array instead of making another full copy.
tensor = torch.from_numpy(review_dense)


In [None]:
# Keep only the first 1000 rows and the first 50 columns of the feature tensor.
# NOTE(review): the labels are derived from the full `df` in a later cell and
# are not truncated to match — confirm the rows stay aligned.
tensor=tensor[:1000,:50]
tensor.shape

In [None]:
# Encode the string sentiment labels as integers: "pos" -> 1, "neg" -> 0.
label_ids = {"pos": 1, "neg": 0}
y = df["label"]
Y = y.map(label_ids)

In [None]:
import torch
import torch.nn as nn

In [None]:
class sentiment(nn.Module):
  """Two-layer LSTM followed by a linear head that returns raw scores.

  Args:
    input_size: number of features in each input vector.
    num_classes: width of the output layer (default 1).
  """

  def __init__(self,input_size, num_classes=1):
    super().__init__()
    # 128 hidden units, 2 stacked layers, inputs laid out batch-first.
    self.lstm = nn.LSTM(
        input_size=input_size,
        hidden_size=128,
        num_layers=2,
        batch_first=True,
    )
    self.linear = nn.Linear(in_features=128, out_features=num_classes)

  def forward(self,x):
    """Score a batch of feature vectors.

    A length-1 axis is inserted at position 1, so each input row is fed to
    the LSTM as a one-step sequence. The final hidden state of the last LSTM
    layer is passed through the linear head; no activation is applied.
    """
    as_sequence = x.unsqueeze(dim=1)
    _, (h_n, _c_n) = self.lstm(as_sequence)
    top_layer_state = h_n[-1]
    return self.linear(top_layer_state)

In [None]:
class earlystopping():
  """Track a loss value and flag when it has stopped improving.

  Args:
    min_delta: amount by which a new loss must undercut the best loss seen
      so far to count as an improvement.
    patience: number of consecutive non-improving calls after which
      ``early_stop`` is set to True (default 2).

  Attributes set by ``__init__``: ``counter`` (consecutive non-improving
  calls), ``best_loss`` (starts at +inf) and ``early_stop`` (starts False).
  """

  def __init__(self,min_delta,patience=2):
    self.min_delta = min_delta
    self.patience = patience
    self.best_loss = float("inf")
    self.counter = 0
    self.early_stop = False

  def __call__(self,val_loss):
    """Record one loss observation and update ``early_stop``."""
    improved = val_loss < self.best_loss - self.min_delta
    if improved:
      # New best: remember it and restart the patience countdown.
      self.best_loss = val_loss
      self.counter = 0
      return

    self.counter += 1
    if self.counter >= self.patience:
      self.early_stop = True

In [None]:
# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
device=("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Show which device was selected.
device

In [None]:
# Build the classifier with one input feature per column of `tensor` and a
# single output unit, then move it to `device`.
model=sentiment(input_size=tensor.shape[1],num_classes=1).to(device)

In [None]:
# Cast the feature tensor to float32.
tensor=tensor.to(torch.float32)

In [None]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.amp import GradScaler,autocast

In [None]:
# Binary cross-entropy computed on raw model outputs (the model's forward
# applies no sigmoid).
model_loss=nn.BCEWithLogitsLoss()
# Adam over all model parameters with a learning rate of 1e-3.
optimizer=torch.optim.Adam(model.parameters(),lr=0.001)
# Plateau scheduler configured with mode="min", patience=2 and factor=0.5
# (the learning rate is multiplied by 0.5 when it triggers).
scheduler=ReduceLROnPlateau(optimizer,mode="min",patience=2,factor=0.5)

In [None]:
from torch.utils.data import DataLoader,dataset

In [None]:
from torch.utils.data import Dataset, DataLoader

class ReviewDataset(Dataset):
    """Pair a collection of feature rows with a collection of labels.

    Args:
        features: indexable collection of inputs; its length defines the
            dataset length.
        labels: indexable collection of targets, looked up with the same index.
    """

    def __init__(self, features, labels):
        self.features, self.labels = features, labels

    def __len__(self):
        """Number of samples, taken from ``features`` only."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(feature, label)`` pair stored at ``idx``."""
        sample = self.features[idx]
        target = self.labels[idx]
        return sample, target

# Convert the label Series to a tensor once; on a re-run `Y` is already a
# tensor and is left untouched.
Y = Y if isinstance(Y, torch.Tensor) else torch.tensor(Y.values)

# Wrap features and labels, then serve them in shuffled batches of 64 using
# two worker processes.
dataset = ReviewDataset(features=tensor, labels=Y)
data_loader = DataLoader(
    dataset=dataset,
    batch_size=64,
    shuffle=True,
    num_workers=2,
)

In [None]:
epochs=50
# NOTE: val_loss / val_acc are kept for later cells but stay empty — this loop
# evaluates no validation split, so every metric below is a training metric.
val_loss=[]
val_acc=[]
training_loss=[]
train_acc=[]

# Mixed precision is only enabled on CUDA. Deriving the device type from
# `device` (instead of hard-coding "cuda") lets the same cell run cleanly on a
# CPU-only runtime: autocast and the scaler then become pass-throughs.
amp_device_type = torch.device(device).type
use_amp = amp_device_type == "cuda"
scaler=torch.amp.GradScaler(amp_device_type, enabled=use_amp)
stopper=earlystopping(min_delta=0.0004,patience=4)

for epoch in range(epochs):
  model.train()
  running_loss=0   # sum of per-batch mean losses
  seen=0           # number of samples processed this epoch
  correct=0        # number of correct predictions this epoch

  # Batch variables are named batch_x / batch_y so the notebook-level label
  # Series `y` is not overwritten by the last mini-batch.
  for batch_x,batch_y in data_loader:
    # Labels become float column vectors to match the (batch, 1) model output.
    batch_y=batch_y.float().unsqueeze(1)
    batch_x,batch_y=batch_x.to(device),batch_y.to(device)
    optimizer.zero_grad()
    with autocast(device_type=amp_device_type, enabled=use_amp):
      output=model(batch_x)
      loss=model_loss(output,batch_y)
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

    running_loss+=loss.item()
    # Threshold the sigmoid at 0.5; detach so metrics do not extend the graph.
    preds = torch.sigmoid(output.detach()) > 0.5
    correct+=(preds==batch_y).sum().item()
    seen+=batch_y.size(0)

  train_loss=running_loss/len(data_loader)
  acc=100*correct/seen
  training_loss.append(train_loss)
  train_acc.append(acc)

  print(f"epoch no.{epoch+1} training_acc : {acc:.2f} training loss : {train_loss:.4f}")

  # The plateau scheduler only acts when it is stepped with the monitored
  # value; it was previously created but never called.
  scheduler.step(train_loss)

  stopper(train_loss)

  if stopper.early_stop:
    print("early stopping")
    break