In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn  # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim  # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F  # All functions that don't have any parameters
from torch.utils.data import (DataLoader,)  # Gives easier dataset managment and creates mini batches
#import torchvision  # torch package for vision related things
#import torchvision.datasets as datasets  # Has standard datasets we can import in a nice way
#import torchvision.transforms as transforms
import pandas as pd
import random
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from tqdm.auto import tqdm
from transformers import DistilBertTokenizer

# Data Loading
Grabbing the dataset with the like count included

In [2]:
# Split the downloaded Youtube01-Psy.csv file into train, validation and test sets
data = pd.read_csv('SpamDataset/TSwift_ShakeItOff_Spam.csv')
# data = pd.read_csv('TSwift_ShakeItOff_Spam.csv')
data = data.sample(frac=1).reset_index(drop=True)  # Shuffle the data
train_data = data[:int(0.7*len(data))]
val_data = data[int(0.7*len(data)):int(0.85*len(data))]
test_data = data[int(0.85*len(data)):]

# Tokenize the data using distillbert
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Turn pd dataframe into a tokenized PyTorch dataset that DataLoader can use via the SpamDataset class
class SpamDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        comment_tokens = row['CONTENT']
        like_count = row['LIKE_COUNT']
        label = row['CLASS']
        return comment_tokens, label, like_count
    
    def collate_fn(self, batch):
        comments = [row[0] for row in batch]
        labels = [row[1] for row in batch]
        like_counts = [row[2] for row in batch]
        tokens = tokenizer.batch_encode_plus(comments, truncation=True, padding='max_length', max_length=30, return_tensors='pt')
        return tokens, torch.tensor(labels), torch.tensor(like_counts)
    
train_dataset = SpamDataset(train_data)
val_dataset = SpamDataset(val_data)
test_dataset = SpamDataset(test_data)


# Load Data and collate it
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=train_dataset.collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=val_dataset.collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=test_dataset.collate_fn)

In [3]:
# Print a random sample from the train_loader to confirm correct loading and tokenization
sample = next(iter(train_loader))
print(sample)

({'input_ids': tensor([[  101,  2293,  1015,  ...,  1003,  1026,   102],
        [  101,  2559,  2012,  ...,  1025,  1055,   102],
        [  101, 14017,  5428,  ..., 24471,  8641,   102],
        ...,
        [  101,  1996,  2154,  ...,   999,  2023,   102],
        [  101,  2023,  2299,  ...,  4604,  2293,   102],
        [  101,  2023,  2299,  ...,  4604,  2293,   102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}, tensor([ 1, 13,  0,  2,  6,  0,  8,  0,  0,  0,  5,  6,  0,  0,  1,  0,  1,  0,
         4,  0,  2,  1,  3,  0,  1,  0,  0,  0,  0,  0,  2,  1,  1,  2,  1,  0,
        45, 16,  0,  0,  3,  1,  3,  4,  0,  0,  3,  1,  1,  3,  0, 11,  0,  0,
         0, 14,  1,  0,  3,  0,  0,  2,  1,  3]), tensor([0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
        0, 0, 

# Model
- Model 1 - include Like count
- Model 2 - include LLM evaluation of the video 

In [4]:
## The network includes an Embedding layer, an Attention layer, an LSTM layer and a Linear layer
class CombinedLSTM(nn.Module):
    def __init__(self, embedding_dict, embedding_size, hidden_size, num_layers, num_classes, dropout):
        super(CombinedLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(embedding_dict, embedding_size)
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.attention = nn.Linear(hidden_size, 1)
        self.fc = nn.Linear(hidden_size + 1, num_classes) # +1 for like count
        
    def forward(self, comment_sequence, like_count):
        # comment_sequence: (batch_size, max_seq_length)
        embedding = self.dropout(self.embedding(comment_sequence))
        # embedding shape: (batch_size, max_seq_length, embedding_size)
        output, (h_n, c_n) = self.lstm(embedding)
        # output shape: (batch_size, max_seq_length, hidden_size)
        # h_n shape: (num_layers, batch_size, hidden_size)
        attention_weights = F.softmax(self.attention(output), dim=1)
        # attention_weights shape: (batch_size, max_seq_length, 1)
        out = torch.sum(attention_weights * output, dim=1)
        # out shape: (batch_size, hidden_size)
        out = torch.cat((out, like_count.unsqueeze(1)), 1)
        # out shape: (batch_size, hidden_size + 1)
        out = self.fc(out)
        # out shape: (batch_size, num_classes)
        return out

# Training

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()
# Set the hyperparameters for all the models     # TODO: analyze these by model
embedding_dict = tokenizer.vocab_size
embedding_size = 128
hidden_size = 256
num_layers = 2
num_classes = 2
dropout = 0.5
learning_rate = 0.001
num_epochs = 10

# Initialize the model, loss function and optimizer
model = CombinedLSTM(embedding_dict, embedding_size, hidden_size, num_layers, num_classes, dropout).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader):
        tokens, like_counts, labels = batch
        comment_sequence = tokens['input_ids'].to(device)
        like_counts = like_counts.to(device)
        labels = labels.to(device)
        outputs = model(comment_sequence, like_counts)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

  0%|          | 0/6 [00:00<?, ?it/s]

Epoch 1/10, Loss: 1.016178011894226


  0%|          | 0/6 [00:00<?, ?it/s]

Epoch 2/10, Loss: 0.7030140161514282


  0%|          | 0/6 [00:00<?, ?it/s]

Epoch 3/10, Loss: 0.6746907234191895


  0%|          | 0/6 [00:00<?, ?it/s]

Epoch 4/10, Loss: 0.3200976550579071


  0%|          | 0/6 [00:00<?, ?it/s]

Epoch 5/10, Loss: 0.3093917667865753


  0%|          | 0/6 [00:00<?, ?it/s]

Epoch 6/10, Loss: 0.37294405698776245


  0%|          | 0/6 [00:00<?, ?it/s]

Epoch 7/10, Loss: 0.2391223907470703


  0%|          | 0/6 [00:00<?, ?it/s]

Epoch 8/10, Loss: 0.3430922031402588


  0%|          | 0/6 [00:00<?, ?it/s]

Epoch 9/10, Loss: 0.10597313195466995


  0%|          | 0/6 [00:00<?, ?it/s]

Epoch 10/10, Loss: 0.25392237305641174


# Validation

In [6]:
# Validation
def check_accuracy(loader, model):
    num_correct = 0
    num_samples = 0
    predictions = []
    targets = []
    model.eval()

    with torch.no_grad():
        for comments, like_counts, target in loader:
            comments = {key: value.to(device) for key, value in comments.items()}
            like_counts = like_counts.to(device)
            target = target.to(device)

            scores = model(comments['input_ids'], like_counts)
            _, predicted = scores.max(1)
            # Store decoded original comments, model predictions and actual labels 
            num_correct += (predicted == target).sum()
            num_samples += predicted.size(0)

            predictions.extend(predicted.tolist())
            targets.extend(target.tolist())

    # model.train()
    accuracy = num_correct/num_samples
    return accuracy, predictions, targets

def results(model):
    train_acc, train_preds, train_targets = check_accuracy(train_loader, model)
    val_acc, val_preds, val_targets = check_accuracy(val_loader, model)
    test_acc, test_preds, test_targets = check_accuracy(test_loader, model)

    print(f'Train accuracy: {train_acc}')
    print(f'Validation accuracy: {val_acc}')
    print(f'Test accuracy: {test_acc}')

    train_cm = confusion_matrix(train_targets, train_preds)
    val_cm = confusion_matrix(val_targets, val_preds)
    test_cm = confusion_matrix(test_targets, test_preds)

    print(f'Train Confusion Matrix:\n{train_cm}')
    print(f'Validation Confusion Matrix:\n{val_cm}')
    print(f'Test Confusion Matrix:\n{test_cm}')

    # Classification report
    train_report = classification_report(train_targets, train_preds)
    val_report = classification_report(val_targets, val_preds)
    test_report = classification_report(test_targets, test_preds)

    print(f'Train Classification Report:\n{train_report}')
    print(f'Validation Classification Report:\n{val_report}')
    print(f'Test Classification Report:\n{test_report}')

In [7]:
results(model)

Train accuracy: 0.9321534037590027
Validation accuracy: 0.8219178318977356
Test accuracy: 0.8904109597206116
Train Confusion Matrix:
[[223   0]
 [ 23  93]]
Validation Confusion Matrix:
[[49  3]
 [10 11]]
Test Confusion Matrix:
[[46  1]
 [ 7 19]]
Train Classification Report:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95       223
           1       1.00      0.80      0.89       116

    accuracy                           0.93       339
   macro avg       0.95      0.90      0.92       339
weighted avg       0.94      0.93      0.93       339

Validation Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.94      0.88        52
           1       0.79      0.52      0.63        21

    accuracy                           0.82        73
   macro avg       0.81      0.73      0.76        73
weighted avg       0.82      0.82      0.81        73

Test Classification Report:
         

In [11]:
from torchviz import make_dot
embedding_dict = tokenizer.vocab_size
embedding_size = 128
hidden_size = 256
num_layers = 1
num_classes = 2
dropout = 0.5
learning_rate = 0.001
num_epochs = 10

# Initialize the model, loss function and optimizer
model = CombinedLSTM(embedding_dict, embedding_size, hidden_size, num_layers, num_classes, dropout).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

batch = next(iter(train_loader))
scores = model(batch[0]['input_ids'], batch[1])

print(model)
make_dot(scores, params=dict(list(model.named_parameters()))).render("model_images/like_lstm", format="png")

CombinedLSTM(
  (embedding): Embedding(30522, 128)
  (dropout): Dropout(p=0.5, inplace=False)
  (lstm): LSTM(128, 256, batch_first=True, dropout=0.5)
  (attention): Linear(in_features=256, out_features=1, bias=True)
  (fc): Linear(in_features=257, out_features=2, bias=True)
)




'model_images\\like_lstm.png'