<a href="https://colab.research.google.com/github/CT608/Deep_Learning/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import torch
import numpy as np
import pandas as pd

In [25]:
# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
# The bare name on the last line makes the notebook display the choice.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

# Import Data

In [None]:
# Download the gzipped movie-review CSV (chapter 8 data of the
# rasbt/python-machine-learning-book-3rd-edition repository), then
# decompress it next to the notebook; `-f` overwrites an existing
# movie_data.csv left over from a previous run.
!wget https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz
!gunzip -f movie_data.csv.gz 

--2022-06-02 16:32:57--  https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/rasbt/python-machine-learning-book-3rd-edition/master/ch08/movie_data.csv.gz [following]
--2022-06-02 16:32:57--  https://raw.githubusercontent.com/rasbt/python-machine-learning-book-3rd-edition/master/ch08/movie_data.csv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26521894 (25M) [application/octet-stream]
Saving to: ‘movie_data.csv.gz’


2022-06-02 16:32:57 (196 MB/s) - ‘movie_data.csv.gz’ saved [26521894/26521894]



In [None]:
# Load the decompressed reviews; header=0 takes the column names from the
# first row of the file.
data_o = pd.read_csv('movie_data.csv', header=0)

In [None]:
# Preview the first rows to confirm the `review` text and `sentiment` label
# columns that MovieDataset reads are present.
data_o.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


# Tokenize Review

In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Only the tokenizer of this Hugging Face checkpoint is loaded here; in the
# cells of this notebook it supplies the token ids and the vocabulary size
# for a model that is trained from scratch.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Dataset

In [None]:
class MovieDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one movie review per item.

    Args:
        df: DataFrame with a ``review`` column (text) and a ``sentiment``
            column (integer class label).
        tokenizer: Callable with the Hugging Face tokenizer call signature;
            it must return a mapping containing ``input_ids`` and
            ``attention_mask``.

    Each item is a dict of ``torch.long`` tensors:
        ``ids``       - token ids of the review,
        ``mask``      - attention mask returned by the tokenizer,
        ``sentiment`` - scalar class label.
    """

    def __init__(self, df, tokenizer):
        self.review = df["review"].values
        self.sentiment = df["sentiment"].values
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.sentiment)

    def __getitem__(self, index):
        # Use the tokenizer handed to the constructor. The previous code
        # called the notebook-global `tokenizer`, silently ignoring the
        # argument (and failing outright when no such global exists).
        # truncation + padding='max_length' ask the tokenizer for
        # fixed-length sequences so the default DataLoader collation can
        # stack items into a batch.
        inputs = self.tokenizer(
            self.review[index],
            truncation=True,
            padding='max_length',
        )

        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "sentiment": torch.tensor(self.sentiment[index], dtype=torch.long),
        }

In [None]:
# Wrap the full review table; tokenization happens lazily, one review per
# __getitem__ call.
movie_data = MovieDataset(data_o, tokenizer)

# Split Data & Dataloader

In [None]:
# Split sizes: 20% test, 16% validation (20% of the remaining 80%), and
# whatever is left for training so the three lengths always sum to `length`.
length = len(movie_data)
test_len = int(length*0.2)
valid_len = int(length*0.8*0.2)
train_len = length - valid_len - test_len

# Seed the split with a dedicated generator so the same reviews land in the
# same subset after every kernel restart. Without this, a checkpoint reloaded
# in a later session could be evaluated on reviews it was trained on.
SPLIT_SEED = 42
train_data, valid_data, test_data = torch.utils.data.random_split(
    movie_data,
    [train_len, valid_len, test_len],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [None]:
from torch.utils.data import DataLoader

In [None]:
# One batch size for all three loaders.
BATCH_SIZE = 64

# Only the training loader is shuffled. Accuracy does not depend on batch
# order, so the evaluation loaders keep a fixed order and skip the per-pass
# permutation.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# Build Model

In [None]:
class RNN(torch.nn.Module):
    """Embedding -> single-layer LSTM -> linear classifier.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output classes.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        # Default nn.LSTM: one layer, unidirectional, batch_first=False.
        self.rnn = torch.nn.LSTM(embedding_dim, hidden_dim)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Classify a batch of token-id sequences.

        Args:
            text: Long tensor of token ids laid out ``(seq_len, batch)``,
                the layout nn.LSTM expects when ``batch_first`` is False.

        Returns:
            Logits of shape ``(1, batch, output_dim)``. The leading
            layer/direction axis of the LSTM hidden state is kept on
            purpose: the callers in this notebook remove it themselves with
            ``.squeeze(0)``.
        """
        embedded = self.embedding(text)

        # NOTE(review): the sequence is not packed, so the final hidden state
        # is read after every time step, including any padding positions at
        # the end of the sequence. Packing needs per-sample lengths, which
        # this signature does not receive.
        # The earlier `hidden.squeeze(0)` statement was removed: it discarded
        # its result and therefore did nothing.
        _, (hidden, _) = self.rnn(embedded)

        return self.fc(hidden)


In [None]:
def compute_accuracy(model, dataloader):
    """Return the fraction of samples in `dataloader` that `model` classifies correctly.

    Args:
        model: torch.nn.Module that maps ``(seq_len, batch)`` token ids to
            logits of shape ``(1, batch, num_classes)``.
        dataloader: Iterable of batches, each a dict with ``"ids"``
            (``(batch, seq_len)`` token ids) and ``"sentiment"`` (labels).

    Returns:
        0-dim float tensor holding correct / total.

    Raises:
        ValueError: If the dataloader yields no samples.

    Relies on the notebook-level `device`.
    """
    # Evaluate in eval mode, then restore whatever mode the caller had, so a
    # training loop that calls this between epochs is not left in eval mode.
    was_training = model.training
    model.eval()

    correct_pred, num_sample = 0, 0
    try:
        with torch.no_grad():
            for batch_data in dataloader:
                # The dataset yields (batch, seq_len); the LSTM wants (seq_len, batch).
                review = torch.transpose(batch_data["ids"], 0, 1).to(device)
                sentiment = batch_data["sentiment"].to(device)

                logits = model(review).squeeze(0)
                pred = logits.argmax(dim=1)

                num_sample += sentiment.size(0)
                # Tensor reduction instead of Python's builtin sum(), which
                # iterates the batch element by element.
                correct_pred += (pred == sentiment).sum()
    finally:
        model.train(was_training)

    if num_sample == 0:
        raise ValueError("compute_accuracy received an empty dataloader")

    return correct_pred / num_sample

In [None]:
# Size the embedding table from the tokenizer's vocabulary; 128-dimensional
# embeddings, 256 LSTM hidden units, 2 output classes.
model = RNN(tokenizer.vocab_size, 128, 256, 2)
model.to(device)
# The optimizer is created after the model has been moved to `device`.
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

In [None]:
import torch.nn.functional as F

In [None]:
def train_nn(model, num_epochs, train_loader, valid_loader, optimizer,
             save_path="/content/drive/MyDrive/Project/Sentiment Analysis/model.pth"):
    """Train `model` and checkpoint it whenever validation accuracy improves.

    Args:
        model: Network that maps ``(seq_len, batch)`` token ids to logits of
            shape ``(1, batch, num_classes)``.
        num_epochs: Number of passes over `train_loader`.
        train_loader: Iterable of training batches (dicts with ``"ids"`` and
            ``"sentiment"``); must support ``len()``.
        valid_loader: Iterable of validation batches in the same format.
        optimizer: Optimizer already bound to ``model.parameters()``.
        save_path: Where the best model is written. Defaults to the Google
            Drive location used by this notebook; the directory must already
            exist (i.e. Drive must be mounted).

    Relies on the notebook-level `device`, `F` (torch.nn.functional) and
    `compute_accuracy`.
    """
    # Start below any reachable accuracy so the first epoch is always saved.
    # The previous `best_acc = 1` with `valid_acc < best_acc` saved the model
    # each time validation accuracy got *worse*.
    best_acc = float("-inf")

    for epoch in range(num_epochs):
        model.train()
        for idx, batch_data in enumerate(train_loader):
            # The dataset yields (batch, seq_len); the LSTM wants (seq_len, batch).
            review = torch.transpose(batch_data["ids"], 0, 1).to(device)
            sentiment = batch_data["sentiment"].to(device)

            logits = model(review).squeeze(0)
            loss = F.cross_entropy(logits, sentiment)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if idx % 50 == 0:
                # .item() prints a plain number rather than a tensor repr
                # carrying its grad_fn.
                print("Epoch:", epoch+1, "Batch:", idx, "/", len(train_loader), "Loss:", loss.item())

        with torch.no_grad():
            train_acc = compute_accuracy(model, train_loader)
            valid_acc = compute_accuracy(model, valid_loader)

        print("------------------------------------------------------------")
        print("Train Accuracy:", train_acc)
        print("Validation Accuracy:", valid_acc)
        print("------------------------------------------------------------")

        if valid_acc > best_acc:
            best_acc = valid_acc
            # Saves the whole pickled module, as before; loading it later
            # requires the model's class definition to be available.
            torch.save(model, save_path)


In [None]:
train_nn(model, 1, train_loader, valid_loader, optimizer)

Y
Epoch: 1 Batch: 0 / 500 Loss: tensor(0.6989, grad_fn=<NllLossBackward0>)
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Epoch: 1 Batch: 50 / 500 Loss: tensor(0.6875, grad_fn=<NllLossBackward0>)
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Epoch: 1 Batch: 100 / 500 Loss: tensor(0.6985, grad_fn=<NllLossBackward0>)


KeyboardInterrupt: ignored

In [None]:
"""

#build iterator
review_iter = iter(data_o.review) 
tokenizer = get_tokenizer("basic_english")
def yield_tokens(data_iter):
    for text in review_iter:
        yield tokenizer(text)


vocab = build_vocab_from_iterator(yield_tokens(review_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])
review_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x)
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_text, _label) in batch:
        label_list.append(review_pipeline(_label))
        processed_text = torch.tensor(review_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)

    return label_list.to(device), text_list.to(device), offsets.to(device)

dataloader = DataLoader(data_o, batch_size=8, shuffle=False, collate_fn=collate_batch)