In [32]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn  # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim  # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F  # All functions that don't have any parameters
from torch.utils.data import (DataLoader,)  # Gives easier dataset managment and creates mini batches
#import torchvision  # torch package for vision related things
#import torchvision.datasets as datasets  # Has standard datasets we can import in a nice way
#import torchvision.transforms as transforms
import pandas as pd
import random
from tqdm.auto import tqdm
from transformers import DistilBertTokenizer

In [33]:
# Run when executing on Google Colab
#from google.colab import drive
#drive.mount('/content/drive')

#!cp "/content/drive/MyDrive/Colab Notebooks/Deep Learning/DeepLProject/SpamDataset.zip" "./SpamDataset.zip"
#!unzip SpamDataset.zip

In [34]:
# Split the downloaded Youtube01-Psy.csv file into train, validation and test sets
data = pd.read_csv('SpamDataset/Youtube01-Psy.csv')
data = data.sample(frac=1).reset_index(drop=True)  # Shuffle the data
train_data = data[:int(0.7*len(data))]
val_data = data[int(0.7*len(data)):int(0.85*len(data))]
test_data = data[int(0.85*len(data)):]

# Tokenize the data using distillbert
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Turn pd dataframe into a tokenized PyTorch dataset that DataLoader can use via the SpamDataset class
class SpamDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        comment_tokens = row['CONTENT']
        label = row['CLASS']
        return comment_tokens, label
    
    def collate_fn(self, batch):
        comments = [row[0] for row in batch]
        labels = [row[1] for row in batch]
        tokens = tokenizer.batch_encode_plus(comments, truncation=True, padding='max_length', max_length=30, return_tensors='pt')
        return tokens, torch.tensor(labels)
    
train_dataset = SpamDataset(train_data)
val_dataset = SpamDataset(val_data)
test_dataset = SpamDataset(test_data)


# Load Data and collate it
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=train_dataset.collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=val_dataset.collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=test_dataset.collate_fn)

In [35]:
# Print a random sample from the train_loader to confirm correct loading and tokenization
sample = next(iter(train_loader))
print(sample)
print([samples.size() for samples in sample[0]['input_ids']])


({'input_ids': tensor([[  101,  8299,  1024,  ...,  9413, 21397,   102],
        [  101,  2065,  1045,  ...,     0,     0,     0],
        [  101,  2074,  9361,  ...,     0,     0,     0],
        ...,
        [  101,  3531,  4942,  ...,  4502, 28646,   102],
        [  101,  2027,  2056,  ...,     0,     0,     0],
        [  101,  4942,  2026,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}, tensor([1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0,
        1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1]))
[torch.Size([30]), torch.Size([30]), torch.Size([30]), torch.Size([30]), torch.Size([30]), torch.Size([30]), torch.Size([30]), torch.Size([30]), torch.Size([30]), tor

In [36]:
# Recurrent neural network with LSTM (many-to-one) for sequence classification to produce a binary output
## The network includes an Embedding layer, an LSTM layer and a Linear layer
class RNN_LSTM(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, num_classes, dropout):
        super(RNN_LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x: (batch_size, max_seq_length)
        embedding = self.dropout(self.embedding(x))
        # embedding shape: (batch_size, max_seq_length, embedding_size)
        output, (h_n, c_n) = self.lstm(embedding)
        # output shape: (batch_size, max_seq_length, hidden_size)
        # h_n shape: (num_layers, batch_size, hidden_size)
        out = self.fc(h_n[-1, :, :])
        # out shape: (batch_size, num_classes)
        return out

In [37]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set the hyperparameters for the LSTM model
input_size = tokenizer.vocab_size
embedding_size = 128
hidden_size = 256
num_layers = 2
num_classes = 2
dropout = 0.5
learning_rate = 0.001
num_epochs = 5


# init model
model_LSTM = RNN_LSTM(input_size, embedding_size, hidden_size, num_layers, num_classes, dropout).to(device)
criterion = nn.CrossEntropyLoss()
optimizer_LSTM = optim.Adam(model_LSTM.parameters(), lr=learning_rate)

# RNN_LSTM
print('RNN_LSTM Training')
for epoch in range(num_epochs):
    model_LSTM.train()
    loop = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)
    for batch_idx, (data, targets) in loop:
        data = {key: value.to(device) for key, value in data.items()}
        targets = targets.to(device)

        # forward
        scores = model_LSTM(data['input_ids'])
        loss = criterion(scores, targets)

        # backward
        optimizer_LSTM.zero_grad()
        loss.backward()

        # gradient descent or adam step
        optimizer_LSTM.step()

        # update progress bar
        loop.set_description(f'Epoch [{epoch}/{num_epochs}]')
        loop.set_postfix(loss=loss.item())

RNN_LSTM Training


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

In [38]:
# Store all the predictions in model_predictions and the actual labels in actual_labels
original_comments = []
tokens = []
model_predictions = []
actual_labels = []

# Validation
def check_accuracy(loader, model):
    num_correct = 0
    num_samples = 0
    model.eval()

    with torch.no_grad():
        for data, targets in loader:
            data = {key: value.to(device) for key, value in data.items()}
            targets = targets.to(device)

            scores = model(data['input_ids'])
            _, predictions = scores.max(1)
            # Store decoded original comments, model predictions and actual labels 
            original_comments.extend([tokenizer.decode(data['input_ids'][i]) for i in range(len(data['input_ids']))])
            tokens.extend(data['input_ids'])
            model_predictions.extend(predictions)
            actual_labels.extend(targets)
            num_correct += (predictions == targets).sum()
            num_samples += predictions.size(0)

    model.train()
    return num_correct/num_samples


In [39]:
# Check accuracy on training & validation to see how good our model
train_acc = check_accuracy(train_loader, model_LSTM)
val_acc = check_accuracy(val_loader, model_LSTM)
print(f'Train accuracy: {train_acc}')
print(f'Validation accuracy: {val_acc}')
print(f'Test accuracy: {check_accuracy(test_loader, model_LSTM)}')

# Print the original comments, model predictions and actual labels
for i in range(5):
    print(f'Comment: {original_comments[i]}')
    print(f'Tokens: {tokens[i]}')
    print(f'Prediction: {model_predictions[i]}')
    print(f'Actual: {actual_labels[i]}')
    print('')


Train accuracy: 0.6516393423080444
Validation accuracy: 0.5849056839942932
Test accuracy: 0.6792452931404114
Comment: [CLS] can we reach 3 billion views by december 2014? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Tokens: tensor([ 101, 2064, 2057, 3362, 1017, 4551, 5328, 2011, 2285, 2297, 1029,  102,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0])
Prediction: 0
Actual: 0

Comment: [CLS] follow 4 follow @ vaahidmustafic like 4 like [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Tokens: tensor([  101,  3582,  1018,  3582,  1030, 12436,  4430,  3593,  7606,  2696,
         8873,  2278,  2066,  1018,  2066,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])
Prediction: 0
Actual: 1

Comment: [CLS] lol [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD