<div>
    <img src="images/emlyon.png" style="height:60px; float:left; padding-right:10px; margin-top:5px" />
    <span>
        <h1 style="padding-bottom:5px;"> Introduction to Deep Learning </h1>
        <a href="https://masters.em-lyon.com/fr/msc-in-data-science-artificial-intelligence-strategy">[DSAIS]</a> MSc in Data Science & Artificial Intelligence Strategy <br/>
         Paris | © Saeed VARASTEH
    </span>
</div>

## Lecture 08 : PyTorch Text Classification

In this notebook, we'll be working with recurrent neural network architectures in a simple spam detector model.

Our goal in this implementation is to create an RNN/LSTM model that can accurately distinguish spam emails from legitimate ones.

<img style="width:20%" src="./images/spam.png" />

---

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split

# Seed NumPy's and PyTorch's global random number generators with a fixed value
# so that runs of the notebook are repeatable.
np.random.seed(72)
torch.manual_seed(72)

---

In [None]:
# Pick the compute device: the GPU when PyTorch can see one, the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

---

### Data and Preprocessing

#### Load data

In [None]:
# Read the dataset from the CSV file, then show its (rows, columns) shape
# and its first few rows.
df = pd.read_csv('./data/spams.csv')
print(df.shape)
df.head()

In [None]:
# Display the `data` field of the third row as an example of the raw text.
df.iloc[2]['data']

#### Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. An explicit `random_state` makes the
# split identical every time this cell runs, instead of depending on how much of
# NumPy's global random stream was consumed before the cell was executed.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=72)
df_train.shape, df_test.shape

#### Tokenization and the Vocabulary

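We create a dictionary that maps each word seen in the training set to a unique integer index. Index 0 is reserved for the empty token, which is later used for padding.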

In [None]:
# Index 0 belongs to the empty string; real tokens are numbered from 1 upwards.
word2idx = {'': 0}

for text in df_train['data']:  # one message per training row
    for token in text.lower().split():  # lowercase + whitespace tokenization
        # A token seen for the first time claims the next free index;
        # tokens already in the dictionary keep the index they have.
        word2idx.setdefault(token, len(word2idx))

idx = len(word2idx)  # next unused index

print("Done")
print(len(word2idx))

#### Convert sentences

Convert the words in the sentences to their corresponding indexes:

In [None]:
# Training messages: every token is guaranteed to be in the vocabulary,
# because the vocabulary was built from these same rows.
X_train = [
    [word2idx[token] for token in text.lower().split()]
    for text in df_train['data']
]

# Test messages: tokens that never appeared in training have no index,
# so they are dropped from the encoded sequence.
X_test = [
    [word2idx[token] for token in text.lower().split() if token in word2idx]
    for text in df_test['data']
]

len(X_train), len(X_test)

#### Paddings

Pad the sentences with 0s to a common fixed length so that the data can be processed in batches, which speeds up training.

In [None]:
# Number of token indices in every encoded message, per split.
train_lengths = [len(seq) for seq in X_train]
test_lengths = [len(seq) for seq in X_test]

# Longest encoded message in each split.
max_len_train = np.max(train_lengths)
max_len_test = np.max(test_lengths)
max_len_train, max_len_test

In [None]:
# Common target length: the longer of the two per-split maxima.
max_len = np.maximum(max_len_train, max_len_test)

In [None]:
# Left-pad every training sequence with zeros up to `max_len`
# (zeros go in front, the original token indices stay at the end).
for pos, seq in enumerate(X_train):
    X_train[pos] = [0] * (max_len - len(seq)) + seq

In [None]:
# Left-pad every test sequence with zeros up to `max_len`
# (zeros go in front, the original token indices stay at the end).
for pos, seq in enumerate(X_test):
    X_test[pos] = [0] * (max_len - len(seq)) + seq

#### To Numpys

Converting the data into numpy arrays.

In [None]:
# Turn the list of padded training sequences into a NumPy array and show its shape.
X_train = np.asarray(X_train)
X_train.shape

In [None]:
# Turn the list of padded test sequences into a NumPy array and show its shape.
X_test = np.asarray(X_test)
X_test.shape

#### Labels

Converting the labels into numpy arrays.

In [None]:
# Pull the `label` column of each split out of its DataFrame as an array.
y_train = df_train['label'].values
y_test = df_test['label'].values

y_train.shape, y_test.shape

---

### Dataset

In [None]:
class MyDataset(Dataset):
    """Map-style dataset that pairs each input row with its label.

    Both arguments are NumPy arrays; they are converted to tensors once,
    when the dataset is created.
    """

    def __init__(self, data, labels):
        # Inputs are stored as int64, labels as float32.
        self.data = torch.from_numpy(data).to(torch.int64)
        self.labels = torch.from_numpy(labels).to(torch.float32)

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``(input, label)`` pair stored at ``index``."""
        sample, target = self.data[index], self.labels[index]
        return sample, target
    
# Wrap the padded training sequences and their labels in a Dataset.
train_dataset = MyDataset(X_train, y_train)

### Train/Validation Split

In [None]:
# Rebuild the full training dataset in this cell so the cell can be re-run
# safely: splitting `train_dataset` into itself would leave a shorter dataset
# behind, and a second execution would then fail on the length check.
full_train_dataset = MyDataset(X_train, y_train)

# Derive the split sizes from the data instead of hard-coding them, so the cell
# keeps working when the dataset size changes. The fraction is chosen so that a
# 4457-sample training set still splits into 4000 / 457.
VAL_FRACTION = 457 / 4457  # roughly 10% of the training samples
n_val = round(len(full_train_dataset) * VAL_FRACTION)
n_train = len(full_train_dataset) - n_val

train_dataset, val_dataset = random_split(full_train_dataset, [n_train, n_val])
len(train_dataset), len(val_dataset)

### DataLoaders

In [None]:
BATCH_SIZE = 32

# Reshuffle the training samples at the start of every epoch so the batches
# differ from one epoch to the next. The validation set is only evaluated,
# so its order is left fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE)

### Building a Model (Single-layer RNN)


<img style="width:70%; margin-top:20px;" src="./images/rnn_unrolled.png" />

In [None]:
class MyRNN(nn.Module):
    """Embedding -> LSTM -> max over time steps -> single linear output.

    Args:
        vocab_size: Number of rows in the embedding table. When omitted it is
            read from the notebook's vocabulary as ``len(word2idx) + 1`` at
            construction time.
        embedding_dim: Size of each token embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        n_layers: Number of stacked LSTM layers.

    Calling ``MyRNN()`` with no arguments builds the same model as before.
    """

    def __init__(self, vocab_size=None, embedding_dim=20, hidden_dim=15, n_layers=1):
        super().__init__()

        self.vocab_size = len(word2idx) + 1 if vocab_size is None else vocab_size
        self.embedding_dim = embedding_dim
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.emb = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.rnn = nn.LSTM(input_size=self.embedding_dim, hidden_size=self.hidden_dim,
                           num_layers=self.n_layers, batch_first=True)
        self.fc = nn.Linear(self.hidden_dim, 1)

    def forward(self, x):
        """Compute one raw logit per sequence.

        Args:
            x: Integer tensor of token indices, shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, 1)`` holding unnormalized logits
            (no sigmoid is applied here).
        """
        batch_size = x.size(0)

        # Create the initial states on the same device as the input, so the
        # module works wherever it has been moved and does not depend on a
        # notebook-level `device` variable.
        h0 = torch.zeros(self.n_layers, batch_size, self.hidden_dim, device=x.device)
        c0 = torch.zeros(self.n_layers, batch_size, self.hidden_dim, device=x.device)

        out = self.emb(x)                 # (batch, seq_len, embedding_dim)
        out, _ = self.rnn(out, (h0, c0))  # (batch, seq_len, hidden_dim)

        # Pool over the time dimension with an element-wise maximum.
        # (Alternative: keep only the last time step with `out[:, -1, :]`.)
        out, _ = torch.max(out, dim=1)    # (batch, hidden_dim)

        return self.fc(out)               # (batch, 1)
     

Accuracy function:

In [None]:
def accuracy_fn(y_true, y_pred):
    """Return the percentage of predictions that equal their labels.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predicted labels, comparable element-wise to ``y_true``.

    Returns:
        Accuracy as a float in the range 0-100.

    Raises:
        ZeroDivisionError: If ``y_pred`` contains no elements.
    """
    correct = torch.eq(y_true, y_pred).sum().item()
    # numel() also works for 0-dim tensors (e.g. a squeezed single-sample
    # batch), where len() raises a TypeError.
    total = y_pred.numel()
    return (correct / total) * 100

### Model Training

In [None]:
from timeit import default_timer as timer 

In [None]:
def _run_epoch(model, loader, loss_fn, optimizer=None):
    """Run one full pass over ``loader`` and return ``(mean loss, accuracy %)``.

    When ``optimizer`` is given, the model is put in training mode and its
    weights are updated after every batch. Without an optimizer the model is
    put in evaluation mode and gradients are disabled.

    Both metrics are weighted by batch size, so a smaller final batch does not
    count as much as a full one.
    """
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum, acc_sum, n_samples = 0.0, 0.0, 0

    with torch.set_grad_enabled(is_training):
        for x_batch, y_batch in loader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)

            # Drop only the trailing size-1 dimension: (batch, 1) -> (batch,).
            # A bare squeeze() would also remove the batch dimension of a
            # single-sample batch and break the loss/accuracy computation.
            y_logits = model(x_batch).squeeze(-1)
            loss = loss_fn(y_logits, y_batch)

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # logits -> probabilities -> hard 0/1 labels
            y_pred = torch.round(torch.sigmoid(y_logits))

            batch_size = y_batch.size(0)
            loss_sum += loss.item() * batch_size
            acc_sum += accuracy_fn(y_true=y_batch, y_pred=y_pred) * batch_size
            n_samples += batch_size

    return loss_sum / n_samples, acc_sum / n_samples


start_time = timer() # timer start

model = MyRNN().to(device)

train_losses, train_accs = [], []
validation_losses, validation_accs = [], []

lr = 0.001
n_epochs = 15

loss_fn = nn.BCEWithLogitsLoss()  # expects raw logits; applies the sigmoid itself
optimizer = optim.Adam(model.parameters(), lr=lr)

for epoch in range(n_epochs):
    # Training pass (updates the weights)
    train_loss, train_acc = _run_epoch(model, train_loader, loss_fn, optimizer)
    train_losses.append(train_loss)
    train_accs.append(train_acc)

    # Validation pass (no weight updates)
    validation_loss, validation_acc = _run_epoch(model, val_loader, loss_fn)
    validation_losses.append(validation_loss)
    validation_accs.append(validation_acc)

    # Report progress after every epoch
    print(f"Epoch: {epoch+1} | Loss: {train_loss:.5f}, Accuracy: {train_acc:.2f}% | val loss: {validation_loss:.5f}, val acc: {validation_acc:.2f}%")

end_time = timer() # timer end
total_time = end_time - start_time
print(f"Train time on {device}: {total_time:.3f} seconds")

Learning curves:

In [None]:
# Two panels side by side: loss on the left, accuracy on the right.
# Each list holds one value per epoch, so the x-axis is the epoch index.
fig, ax = plt.subplots(1, 2, figsize=(10, 4))

ax[0].plot(train_losses, label="train loss")
ax[0].plot(validation_losses, label="val loss")
ax[0].set(title="Loss per epoch", xlabel="epoch", ylabel="loss")
ax[0].legend()

ax[1].plot(train_accs, label="train acc")
ax[1].plot(validation_accs, label="val acc")
ax[1].set(title="Accuracy per epoch", xlabel="epoch", ylabel="accuracy (%)")
ax[1].legend()

fig.tight_layout();

### Making Predictions

In [None]:
# Inference only: switch to evaluation mode and skip building the autograd graph.
model.eval()
with torch.no_grad():
    # The model returns shape (N, 1); drop the trailing dimension so the
    # predictions are 1-D like `y_test`.
    test_logits = model(torch.from_numpy(X_test).long().to(device)).squeeze(-1)

# logits -> probabilities -> hard 0/1 labels. The tensor is moved to the CPU
# first because NumPy conversion fails for tensors that live on a GPU.
y_pred = torch.round(torch.sigmoid(test_logits)).cpu().numpy()

In [None]:
# Show the shapes of the true labels and of the predictions side by side.
y_test.shape, y_pred.shape

In [None]:
from sklearn.metrics import classification_report

# Print scikit-learn's text report comparing the test labels with the predictions.
print( classification_report(y_test, y_pred) )

---