In [None]:
import pandas as pd

In [None]:
# Load the preprocessed IMDB DataFrame from a pickle file (path is relative to this notebook).
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
imdb_df = pd.read_pickle("../../../data/imdb_preprocessing.pkl")

In [None]:
import torch
import torch.nn as nn

In [None]:
# Peek at the last column of the first row (presumably the encoded token ids — TODO confirm).
imdb_df.iloc[0,-1]

In [None]:
# Embedding lookup demo on a single row: a 146-entry table of 10-dimensional
# vectors, with index 0 reserved for padding.
# NOTE(review): any id >= 146 in this row would raise an index error — confirm 146 fits the sample.
embedding = nn.Embedding(146, 10, padding_idx=0)
text_encoded = torch.LongTensor(imdb_df.iloc[0, -1])
result = embedding(text_encoded)


In [None]:
# Display the embedding lookup output for the sample row.
result

In [None]:
# Binary target column: 1 when the sentiment is exactly 'positive', 0 for anything else.
imdb_df['label'] = imdb_df['sentiment'].apply(
    lambda review_sentiment: int(review_sentiment == 'positive')
)

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:

class IMDBDataset(Dataset):
    """Map-style dataset over the ``token`` and ``label`` columns of a DataFrame.

    Indexing returns a ``(tokens, label)`` pair, both converted to
    ``torch.long`` tensors at access time.
    """

    def __init__(self, df):
        # Keep the raw column arrays; tensors are only built per item.
        tokens_column = df.token
        labels_column = df.label
        self.X = tokens_column.values
        self.y = labels_column.values

    def __len__(self):
        # One sample per label entry.
        return len(self.y)

    def __getitem__(self, idx):
        tokens = torch.tensor(self.X[idx], dtype=torch.long)
        label = torch.tensor(self.y[idx], dtype=torch.long)
        return tokens, label

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(imdb_df, random_state=42, test_size=0.2)

# Wrap each split so it can be fed to a DataLoader.
train_dataset, test_dataset = IMDBDataset(train_df), IMDBDataset(test_df)

In [None]:
# Mini-batch size shared by both loaders.
BATCH_SIZE = 32

# Training batches are reshuffled every epoch; the held-out loader keeps a fixed order.
# NOTE(review): the default collate stacks items, so this assumes every token
# sequence has the same length — confirm against the preprocessing step.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=BATCH_SIZE)

In [None]:
class RNN(nn.Module):
    """LSTM text classifier: embedding -> dropout -> (bi)LSTM -> linear head.

    Args:
        vocab_size: Number of rows in the embedding table; index 0 is the
            padding index.
        embedding_dim: Size of each token embedding vector.
        hidden_dim: LSTM hidden state size per direction.
        output_dim: Number of output units of the final linear layer.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM reads the sequence in both directions.
        dropout: Dropout probability applied to the embeddings and between
            stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers, bidirectional, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # nn.LSTM only applies dropout *between* layers, so with a single layer
        # it is a no-op that merely triggers a warning; pass 0 in that case.
        lstm_dropout = dropout if n_layers > 1 else 0.0
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=bidirectional, dropout=lstm_dropout,
                           batch_first=True)
        # A bidirectional LSTM yields one final state per direction.
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.bidirectional = bidirectional

    def forward(self, text):
        """Return the linear head's output for a batch of token-id sequences.

        Args:
            text: Integer tensor of token ids, batch dimension first
                (the LSTM is built with ``batch_first=True``).

        Returns:
            Tensor of shape ``(batch, output_dim)``.
        """
        embedded = self.dropout(self.embedding(text))
        _, (hidden, _) = self.rnn(embedded)
        if self.bidirectional:
            # hidden[-2] / hidden[-1] are the last layer's forward / backward
            # final states. The original code computed this concatenation but
            # discarded the result, so `fc` received the raw 3-D hidden state.
            features = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            features = hidden[-1, :, :]
        return self.fc(features)

In [None]:
# Model hyperparameters.
VOCAB_SIZE = 328232  # embedding table size (presumably the tokenizer vocabulary size — TODO confirm)
EMBEDDING_DIM = 20
HIDDEN_DIM = 128
OUTPUT_DIM = 1       # one output unit per review
N_LAYERS = 2
BIDIRECTIONAL = False
DROPOUT = 0.5

model = RNN(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
)


In [None]:
import torch
# Prefer a CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using PyTorch version:', torch.__version__, ' Device:', DEVICE)

# Move the model's parameters to the selected device.
model = model.to(DEVICE)

In [None]:
import torch
# Device selection (this repeats the preceding cell's logic): CUDA when available, else CPU.
_cuda_ok = torch.cuda.is_available()
DEVICE = torch.device('cuda') if _cuda_ok else torch.device('cpu')
print('Using PyTorch version:', torch.__version__, ' Device:', DEVICE)


# Ensure the model lives on the selected device before the optimizer is built.
model = model.to(DEVICE)


import torch.optim as optim
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Binary cross-entropy computed directly on raw logits.
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(DEVICE)

def binary_accuracy(preds, y):
    """Fraction of samples whose thresholded prediction equals the target.

    Args:
        preds: Raw logits; they are passed through a sigmoid and rounded
            to 0/1 before comparison.
        y: Target values compared element-wise with the rounded predictions.

    Returns:
        Scalar tensor: number of matches divided by the number of entries
        along the first dimension.
    """
    predicted_classes = torch.sigmoid(preds).round()
    hits = predicted_classes.eq(y).float()
    n_samples = len(hits)
    return hits.sum() / n_samples
def train(model, loader, optimizer, criterion):
    """Run one training epoch and return the mean batch loss and accuracy.

    Relies on the module-level ``DEVICE`` and ``binary_accuracy``.

    Args:
        model: Module mapping a batch of inputs to logits of shape (batch, 1).
        loader: Iterable of ``(text, labels)`` batches that supports ``len()``.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss called as ``criterion(logits, labels)``.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)``, each averaged over batches
        (unweighted by batch size).
    """
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    for text, labels in loader:
        text = text.to(DEVICE)
        # BCEWithLogitsLoss needs floating-point targets; the dataset yields
        # integer (long) labels, which made the loss call raise a RuntimeError.
        labels = labels.to(DEVICE).float()
        optimizer.zero_grad()
        predictions = model(text).squeeze(1)  # (batch, 1) -> (batch,)
        loss = criterion(predictions, labels)
        acc = binary_accuracy(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(loader), epoch_acc / len(loader)


def evaluate(model, loader, criterion):
    """Evaluate the model without gradient tracking; return mean batch loss and accuracy.

    Relies on the module-level ``DEVICE`` and ``binary_accuracy``.

    Args:
        model: Module mapping a batch of inputs to logits of shape (batch, 1).
        loader: Iterable of ``(text, labels)`` batches that supports ``len()``.
        criterion: Loss called as ``criterion(logits, labels)``.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)``, each averaged over batches
        (unweighted by batch size).
    """
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    with torch.no_grad():
        for text, labels in loader:
            text = text.to(DEVICE)
            # BCEWithLogitsLoss needs floating-point targets; the dataset yields
            # integer (long) labels, which made the loss call raise a RuntimeError.
            labels = labels.to(DEVICE).float()
            predictions = model(text).squeeze(1)  # (batch, 1) -> (batch,)
            loss = criterion(predictions, labels)
            acc = binary_accuracy(predictions, labels)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(loader), epoch_acc / len(loader)


# Number of full passes over the training data.
N_EPOCHS = 5

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    # The notebook only builds `train_loader` and `test_loader`; the original
    # referenced an undefined `valid_loader` (NameError), so the held-out
    # test split is evaluated here.
    valid_loss, valid_acc = evaluate(model, test_loader, criterion)
    
    print(f'Epoch {epoch+1}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
