# Import data

In [1]:
from sklearn.datasets import make_moons
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
import pandas as pd
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

In [2]:
df = pd.read_csv("./IMDB-Dataset.csv", names=["text", "label"])

In [3]:
df.head

<bound method NDFrame.head of                                                     text      label
0                                                 review  sentiment
1      One of the other reviewers has mentioned that ...   positive
2      A wonderful little production. <br /><br />The...   positive
3      I thought this was a wonderful way to spend ti...   positive
4      Basically there's a family where a little boy ...   negative
...                                                  ...        ...
49996  I thought this movie did a down right good job...   positive
49997  Bad plot, bad dialogue, bad acting, idiotic di...   negative
49998  I am a Catholic taught in parochial elementary...   negative
49999  I'm going to have to disagree with the previou...   negative
50000  No one expects the Star Trek movies to be high...   negative

[50001 rows x 2 columns]>

In [4]:
df.shape

(50001, 2)

In [5]:
df.columns

Index(['text', 'label'], dtype='object')

# Add words in to dictionary

In [6]:
class Lang:
    def __init__(self):
        self.word2index = {"SOS": 0, "EOS": 1, "UNK": 2, "PAD": 3}
        self.word2count = {"SOS":0, "EOS": 0}
        self.index2word = {0: "SOS", 1: "EOS", 2: "UNK", 3: "PAD"}
        self.n_words = 4 

    def addSentence(self, sentence):
        sentence = str(sentence)

        for word in sentence.split():
            self.addWord(word)
    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [7]:
lang = Lang()
df['text'].apply(lambda row: (lang.addSentence(row)))
lang.n_words

438732

In [8]:
df["label"] = df["label"].astype(str).str.strip().map({"positive": 1, "negative": 0})

In [9]:
df['label']

0        NaN
1        1.0
2        1.0
3        1.0
4        0.0
        ... 
49996    1.0
49997    0.0
49998    0.0
49999    0.0
50000    0.0
Name: label, Length: 50001, dtype: float64

In [10]:
df = df[df["text"].str.len() < 2000].reset_index(drop=True)

In [11]:
df

Unnamed: 0,text,label
0,review,
1,One of the other reviewers has mentioned that ...,1.0
2,A wonderful little production. <br /><br />The...,1.0
3,I thought this was a wonderful way to spend ti...,1.0
4,Basically there's a family where a little boy ...,0.0
...,...,...
41576,I thought this movie did a down right good job...,1.0
41577,"Bad plot, bad dialogue, bad acting, idiotic di...",0.0
41578,I am a Catholic taught in parochial elementary...,0.0
41579,I'm going to have to disagree with the previou...,0.0


# Turn sentences to tensors

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:

class TranslationDataset(torch.utils.data.Dataset):
    def __init__(self, df, lang, print):
        self.df = df
        self.lang = lang
        self.print = print
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        
        #build pairs
        row = self.df.iloc[idx]
        input_tensor = self.tensorFromSentence(row['text'])
        return input_tensor, row['label']

    def indexesFromSentence(self, sentence):
        
        #turn sentence into word indexes
        sentence = str(sentence)
        words = sentence.split()
        indexes = [self.lang.word2index.get(w, self.lang.word2index['UNK']) for w in words]
        return indexes

    def tensorFromSentence(self, sentence):

        if self.print:
            print(sentence)

        #add a eos to the end of sentence
        indexes = []
        indexes.append(self.lang.word2index['SOS'])
        indexes+=(self.indexesFromSentence(sentence))
        indexes.append(self.lang.word2index['EOS'])
        return torch.tensor(indexes, dtype=torch.long, device=device)

In [14]:
translation = TranslationDataset(df, lang, print=True)
translation.tensorFromSentence("Hello, I am barton")


Hello, I am barton


tensor([     0, 146490,    156,   1898,      2,      1], device='cuda:0')

In [15]:
def collate_fn(batch):
        inputs, targets = zip(*batch)
        lengths = torch.tensor([len(x) for x in inputs], dtype=torch.long)

        inputs_padded = pad_sequence(inputs, padding_value=lang.word2index['PAD'],batch_first=True)
        targets = torch.tensor(targets, dtype=torch.float)
        return inputs_padded, targets, lengths

# Load and split Data

In [16]:
#dataset = TensorDataset(df['text'],df['label'])
train_text = df.sample(frac = 0.8)
test_test = df.drop(train_text.index)
train, test = TranslationDataset(train_text, lang, print=False), TranslationDataset(test_test, lang, print=False)

In [17]:
batch_size = 256
train_dataloader = DataLoader(train, 
                            batch_size=batch_size, 
                            shuffle=True, 
                            num_workers=0,
                            collate_fn=collate_fn,drop_last=True)
test_dataloader = DataLoader(test, 
                            batch_size=batch_size, 
                            shuffle=True, 
                            num_workers=0  ,
                            collate_fn=collate_fn,drop_last=True)

In [18]:
one_x, one_y, length= next(iter(train_dataloader))
print(one_x)

tensor([[     0,     64,    398,  ...,      3,      3,      3],
        [     0,    155,    137,  ...,      3,      3,      3],
        [     0,   4666,    327,  ...,      3,      3,      3],
        ...,
        [     0,    156,    412,  ...,      3,      3,      3],
        [     0,   1177,  36666,  ...,      3,      3,      3],
        [     0,     94, 379472,  ...,      3,      3,      3]],
       device='cuda:0')


In [19]:
embedding_dim = 128
hidden_size = 128
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(lang.n_words, embedding_dim, padding_idx=lang.word2index['PAD'])
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, 2)
        self.relu = nn.ReLU()

        
        #x(L, N)
    def forward(self, x, lengths):

        #out(N, L, Embedding_dim)
        out = self.embedding(x)

        #hx(num_layer, N, hidden_size)
        #hx = torch.zeros(1, x.size(0), hidden_size,device=device) 
        
        #out(L, N, hidden_size)
        #hx(num_layer, N, hidden_size)

        out = pack_padded_sequence(out, lengths.cpu(), batch_first=True, enforce_sorted=False)

        out, hx = self.rnn(out)

        #out(L, N)
        out = self.linear(hx[-1])
        return out

In [20]:
model = MLP().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

# Train

In [21]:
def train(n_epoch):
    losses = []
    n = len(train_dataloader)
    for epoch in range(n_epoch):
        model.train()
        epoch_loss = 0
        for input, output, lengths in train_dataloader:
            output = output.type(dtype=torch.long)
            input, output, lengths = input.to(device), output.to(device), lengths.to(device)
            pred_y = model(input, lengths)
            loss = criterion(pred_y, output)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        losses.append(epoch_loss / n)
        print(f"epoch {epoch}, loss {epoch_loss / n:.4f}")

    plt.figure(figsize=(10, 6))
    plt.plot(range(1, n_epoch + 1), losses, marker='o')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss')
    plt.show()

In [None]:
train(20)

In [23]:
#torch.save(model.state_dict(), "RNN.pth")

if you have trained model, you can remark train and load it here

In [None]:
#model = MLP().to(device)
#model.load_state_dict(torch.load("RNN.pth", weights_only=True))

<All keys matched successfully>

# Evaluation

In [None]:
epoch_loss = 0
n = len(test_dataloader)
losses = []
model.eval()
with torch.no_grad():
    for input, output, lengths in test_dataloader:
        output = output.type(dtype=torch.long)
        input, output, lengths = input.to(device), output.to(device), lengths.to(device)
        pred_y = model(input, lengths)
        loss = criterion(pred_y, output)
        epoch_loss += loss.item()
    losses.append(epoch_loss / n)
    print(f"loss {epoch_loss / n:.4f}")

loss 0.2770
