In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn  # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim  # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F  # All functions that don't have any parameters
from torch.utils.data import (DataLoader,)  # Gives easier dataset managment and creates mini batches
#import torchvision  # torch package for vision related things
#import torchvision.datasets as datasets  # Has standard datasets we can import in a nice way
#import torchvision.transforms as transforms
import pandas as pd
import random
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from tqdm.auto import tqdm
from transformers import DistilBertTokenizer

# Data Loading
Load the combined dataset, which includes a summary alongside each comment.

In [None]:
# Read the training CSV and hold out its trailing 15% of rows as the validation set.
# Rows are split in file order; no shuffling is applied before the split.
data = pd.read_csv('SpamDataset/actual_train.csv')
split_idx = int(0.85 * len(data))
train_data = data.iloc[:split_idx]
val_data = data.iloc[split_idx:]

# The test set lives in its own CSV file.
test_data = pd.read_csv('SpamDataset/actual_test.csv')

# Pretrained DistilBERT tokenizer ('distilbert-base-uncased'); used only to map text to token ids.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Wrap a pandas DataFrame in PyTorch Dataset classes whose collate_fn tokenizes each batch for the DataLoader
class _TextPairDataset(torch.utils.data.Dataset):
    """Dataset yielding ``(text, label)`` pairs built from two text columns of a DataFrame.

    Subclasses set ``text_columns`` to choose which columns are concatenated
    (space-separated, in that order) to form the text. The label is read from
    the ``'CLASS'`` column.

    Args:
        data: pandas DataFrame containing the columns named in ``text_columns``
            plus ``'CLASS'``.
        tokenizer: optional tokenizer exposing ``batch_encode_plus``. When left
            as ``None``, ``collate_fn`` falls back to the notebook-level
            ``tokenizer`` variable (the original behaviour).
        max_length: length every sequence is padded/truncated to in
            ``collate_fn``. Defaults to 100.
    """

    # Column names joined, in order, to build the text of one example.
    text_columns = ()

    def __init__(self, data, tokenizer=None, max_length=100):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(text, label)`` for the row at positional index ``idx``.

        The text is returned raw (untokenized); tokenization happens per batch
        in ``collate_fn``.
        """
        row = self.data.iloc[idx]
        text = ' '.join(row[column] for column in self.text_columns)
        return text, row['CLASS']

    def collate_fn(self, batch):
        """Tokenize a list of ``(text, label)`` pairs into one batch.

        Returns:
            ``(tokens, labels)`` where ``tokens`` is whatever the tokenizer's
            ``batch_encode_plus`` returns (requested with ``return_tensors='pt'``)
            and ``labels`` is a tensor built from the batch labels.
        """
        texts = [text for text, _ in batch]
        labels = [label for _, label in batch]
        # Inside this method `tokenizer` is the notebook-level global; it is only
        # consulted when no tokenizer was injected through the constructor.
        encoder = self.tokenizer if self.tokenizer is not None else tokenizer
        tokens = encoder.batch_encode_plus(
            texts,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        return tokens, torch.tensor(labels)


class CommentSummaryDataset(_TextPairDataset):
    """"Forward" ordering: the comment (``CONTENT``) followed by the ``SUMMARY``."""

    text_columns = ('CONTENT', 'SUMMARY')


class SummaryCommentDataset(_TextPairDataset):
    """"Backward" ordering: the ``SUMMARY`` followed by the comment (``CONTENT``)."""

    text_columns = ('SUMMARY', 'CONTENT')
    
# Wrap every split in both text orderings:
#   fw ("forward")  = comment followed by summary
#   bw ("backward") = summary followed by comment
fw_train_dataset = CommentSummaryDataset(train_data)
fw_val_dataset = CommentSummaryDataset(val_data)
fw_test_dataset = CommentSummaryDataset(test_data)

bw_train_dataset = SummaryCommentDataset(train_data)
bw_val_dataset = SummaryCommentDataset(val_data)
bw_test_dataset = SummaryCommentDataset(test_data)


# Build the DataLoaders. Each loader uses the collate_fn of its OWN dataset
# (the test loaders previously borrowed fw_val_dataset.collate_fn).
# Only the training loaders shuffle: evaluation metrics do not depend on batch
# order, and a fixed order keeps the returned predictions/targets reproducible.
batch_size = 64
fw_train_loader = DataLoader(fw_train_dataset, batch_size=batch_size, shuffle=True, collate_fn=fw_train_dataset.collate_fn)
fw_val_loader = DataLoader(fw_val_dataset, batch_size=batch_size, shuffle=False, collate_fn=fw_val_dataset.collate_fn)
fw_test_loader = DataLoader(fw_test_dataset, batch_size=batch_size, shuffle=False, collate_fn=fw_test_dataset.collate_fn)

bw_train_loader = DataLoader(bw_train_dataset, batch_size=batch_size, shuffle=True, collate_fn=bw_train_dataset.collate_fn)
bw_val_loader = DataLoader(bw_val_dataset, batch_size=batch_size, shuffle=False, collate_fn=bw_val_dataset.collate_fn)
bw_test_loader = DataLoader(bw_test_dataset, batch_size=batch_size, shuffle=False, collate_fn=bw_test_dataset.collate_fn)

In [None]:
# Sanity check: pull one batch from each training loader (forward ordering first,
# then backward), print the raw batch, and print the first example decoded back
# to text to confirm loading and tokenization look right.
for loader in (fw_train_loader, bw_train_loader):
    sample = next(iter(loader))
    print(sample)
    first_input_ids = sample[0]['input_ids'][0]
    decoded = tokenizer.decode(first_input_ids)
    print(decoded)

({'input_ids': tensor([[  101,  1996,  2087,  ..., 24768,  2007,   102],
        [  101, 10166,  6796,  ...,  2299,  1005,   102],
        [  101,  2040,  2842,  ...,     0,     0,     0],
        ...,
        [  101,  4638,  2041,  ...,     0,     0,     0],
        [  101,  4658,  1048,  ...,     0,     0,     0],
        [  101,  2190,  1012,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}, tensor([0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
        0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1,
        1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0]))
[CLS] the most watched video on youtube is psy ’ s “ gangnam style ”, with 2. 1 billion views. psy - gangnam style ( 강남스타일 ) m / v psy's " gangnam style " bec

# Model
- Model 1 - include Like count
- Model 2 - include LLM evaluation of the video 

In [None]:
## The network consists of an Embedding layer, an LSTM layer, an Attention layer and a Linear layer (applied in that order)
class CombinedLSTM(nn.Module):
    """LSTM classifier with attention pooling over the sequence.

    Pipeline: token ids -> embedding (+ dropout) -> LSTM -> per-step attention
    scores -> softmax over the sequence axis -> weighted sum of the LSTM
    outputs -> linear layer producing class logits.

    Args:
        embedding_dict: number of entries in the embedding table (vocabulary size).
        embedding_size: dimension of each token embedding.
        hidden_size: LSTM hidden state size.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output logits.
        dropout: dropout probability applied to the embeddings and, when
            ``num_layers > 1``, between LSTM layers.
    """

    def __init__(self, embedding_dict, embedding_size, hidden_size, num_layers, num_classes, dropout):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(embedding_dict, embedding_size)
        self.dropout = nn.Dropout(dropout)
        # nn.LSTM only applies dropout between stacked layers. With one layer it
        # has no effect and PyTorch emits a warning, so pass 0 in that case.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True, dropout=lstm_dropout)
        self.attention = nn.Linear(hidden_size, 1)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, comment_sequence, attention_mask=None):
        """Compute class logits for a batch of token-id sequences.

        Args:
            comment_sequence: integer tensor of shape (batch_size, max_seq_length).
            attention_mask: optional tensor of shape (batch_size, max_seq_length)
                with non-zero entries for real tokens and 0 for padding. When
                given, padded positions receive zero attention weight. When
                omitted, every position takes part in the softmax (the original
                behaviour). Each row must keep at least one non-zero entry,
                otherwise the softmax over that row is undefined (NaN).

        Returns:
            Tensor of shape (batch_size, num_classes) holding unnormalized logits.
        """
        # embedded: (batch_size, max_seq_length, embedding_size)
        embedded = self.dropout(self.embedding(comment_sequence))
        # output: (batch_size, max_seq_length, hidden_size); final states are unused
        output, _ = self.lstm(embedded)
        # attention_scores: (batch_size, max_seq_length, 1)
        attention_scores = self.attention(output)
        if attention_mask is not None:
            # -inf scores become exactly 0 after the softmax.
            padding_positions = attention_mask.unsqueeze(-1) == 0
            attention_scores = attention_scores.masked_fill(padding_positions, float('-inf'))
        # Normalize across the sequence axis so the weights of each example sum to 1.
        attention_weights = F.softmax(attention_scores, dim=1)
        # pooled: (batch_size, hidden_size)
        pooled = torch.sum(attention_weights * output, dim=1)
        # logits: (batch_size, num_classes)
        return self.fc(pooled)

# Utility Functions to run validation

In [None]:
# Validation
def check_accuracy(loader, model, device=None):
    """Evaluate ``model`` on every batch of ``loader``.

    Args:
        loader: iterable yielding ``(tokens, target)`` pairs, where ``tokens``
            is a mapping with an ``'input_ids'`` tensor and ``target`` is a
            tensor of class indices.
        model: module called as ``model(input_ids)`` that returns per-class
            scores of shape (batch, num_classes).
        device: device on which to run evaluation. Defaults to the device of
            the model's first parameter (CPU if the model has no parameters).

    Returns:
        ``(accuracy, predictions, targets)``: accuracy as a Python float
        (0.0 when the loader yields no samples), plus flat Python lists of the
        predicted and true class indices in iteration order.

    Note:
        The model is switched to eval mode and is left in eval mode on return.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    num_correct = 0
    num_samples = 0
    predictions = []
    targets = []
    model.eval()

    with torch.no_grad():
        for comments, target in loader:
            input_ids = comments['input_ids'].to(device)
            target = target.to(device)

            scores = model(input_ids)
            predicted = scores.argmax(dim=1)

            # .item() keeps the running count a plain int instead of a 0-dim tensor.
            num_correct += (predicted == target).sum().item()
            num_samples += predicted.size(0)

            predictions.extend(predicted.tolist())
            targets.extend(target.tolist())

    # Guard against an empty loader instead of dividing by zero.
    accuracy = num_correct / num_samples if num_samples else 0.0
    return accuracy, predictions, targets

def results(model, train_loader, val_loader, test_loader):
    """Print accuracy, confusion matrix and classification report for each split.

    Runs ``check_accuracy`` once per loader and prints, in this order: the
    three accuracies, the three confusion matrices, then the three
    classification reports (train, validation, test each time).

    Args:
        model: the model to evaluate.
        train_loader: loader over the training split.
        val_loader: loader over the validation split.
        test_loader: loader over the test split.
    """
    loaders = {'Train': train_loader, 'Validation': val_loader, 'Test': test_loader}

    # check_accuracy returns exactly (accuracy, predictions, targets). The old
    # test-split call unpacked a fourth value and therefore always raised ValueError.
    evaluated = {name: check_accuracy(loader, model) for name, loader in loaders.items()}

    for name, (accuracy, _, _) in evaluated.items():
        print(f'{name} accuracy: {accuracy}')

    for name, (_, preds, targets) in evaluated.items():
        print(f'{name} Confusion Matrix:\n{confusion_matrix(targets, preds)}')

    for name, (_, preds, targets) in evaluated.items():
        print(f'{name} Classification Report:\n{classification_report(targets, preds)}')

# Training

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()
# Set the hyperparameters for all the models     # TODO: analyze these by model
embedding_dict = tokenizer.vocab_size
embedding_size = 128
hidden_size = 256
num_layers = 4
num_classes = 2
dropout = 0.4
learning_rate = 0.0007
num_epochs = 40


def train_model(model, train_loader, optimizer, criterion, num_epochs, device):
    """Train ``model`` in place for ``num_epochs`` passes over ``train_loader``.

    Args:
        model: module called as ``model(input_ids)`` that returns class logits.
        train_loader: iterable of ``(tokens, labels)`` batches where ``tokens``
            holds an ``'input_ids'`` tensor.
        optimizer: optimizer already bound to ``model``'s parameters.
        criterion: loss taking ``(logits, labels)``.
        num_epochs: number of passes over ``train_loader``.
        device: device the batches are moved to (the model must already be there).

    Returns:
        The same ``model`` instance, for convenience.

    After each epoch the mean loss over that epoch's batches is printed, which
    is less noisy than reporting only the final mini-batch.
    """
    for epoch in range(num_epochs):
        # Re-enable train mode every epoch in case an evaluation switched it off.
        model.train()
        running_loss = 0.0
        num_batches = 0
        for tokens, labels in tqdm(train_loader):
            comment_sequence = tokens['input_ids'].to(device)
            labels = labels.to(device)
            outputs = model(comment_sequence)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1
        mean_loss = running_loss / max(num_batches, 1)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {mean_loss}')
    return model


# Train the forwards model (comment followed by summary)
modelFW = CombinedLSTM(embedding_dict, embedding_size, hidden_size, num_layers, num_classes, dropout).to(device)
optimizer = optim.Adam(modelFW.parameters(), lr=learning_rate)
train_model(modelFW, fw_train_loader, optimizer, criterion, num_epochs, device)

# Print the results of the forwards model
results(modelFW, fw_train_loader, fw_val_loader, fw_test_loader)

# Train the backwards model (summary followed by comment) with a fresh optimizer
modelBW = CombinedLSTM(embedding_dict, embedding_size, hidden_size, num_layers, num_classes, dropout).to(device)
optimizer = optim.Adam(modelBW.parameters(), lr=learning_rate)
train_model(modelBW, bw_train_loader, optimizer, criterion, num_epochs, device)

# Print the results of the backwards model
results(modelBW, bw_train_loader, bw_val_loader, bw_test_loader)




  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 1/40, Loss: 0.6995852589607239


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 2/40, Loss: 0.47050419449806213


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 3/40, Loss: 0.3354197144508362


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 4/40, Loss: 0.47269701957702637


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 5/40, Loss: 0.344296932220459


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 6/40, Loss: 0.48157593607902527


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 7/40, Loss: 0.3177638053894043


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 8/40, Loss: 0.2772403657436371


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 9/40, Loss: 0.41918259859085083


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 10/40, Loss: 0.19771935045719147


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 11/40, Loss: 0.41022467613220215


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 12/40, Loss: 0.23900748789310455


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 13/40, Loss: 0.08818535506725311


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 14/40, Loss: 0.3422156274318695


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 15/40, Loss: 0.2120303362607956


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 16/40, Loss: 0.16286133229732513


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 17/40, Loss: 0.1432897001504898


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 18/40, Loss: 0.10983417183160782


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 19/40, Loss: 0.18305456638336182


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 20/40, Loss: 0.08749538660049438


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 21/40, Loss: 0.17420658469200134


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 22/40, Loss: 0.06523998826742172


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 23/40, Loss: 0.08697051554918289


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 24/40, Loss: 0.3115880489349365


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 25/40, Loss: 0.04987706243991852


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 26/40, Loss: 0.11403178423643112


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 27/40, Loss: 0.037325404584407806


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 28/40, Loss: 0.014215231873095036


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 29/40, Loss: 0.09841102361679077


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 30/40, Loss: 0.1035190001130104


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 31/40, Loss: 0.012498479336500168


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 32/40, Loss: 0.06005482003092766


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 33/40, Loss: 0.009120361879467964


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 34/40, Loss: 0.01316457986831665


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 35/40, Loss: 0.14449191093444824


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 36/40, Loss: 0.0061163343489170074


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 37/40, Loss: 0.014194836840033531


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 38/40, Loss: 0.05373930558562279


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 39/40, Loss: 0.012771537527441978


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 40/40, Loss: 0.004924430046230555
Train accuracy: 0.9983721971511841
Validation accuracy: 0.9447852969169617
Train Confusion Matrix:
[[995   0]
 [  3 845]]
Validation Confusion Matrix:
[[181   4]
 [ 14 127]]
Train Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       995
           1       1.00      1.00      1.00       848

    accuracy                           1.00      1843
   macro avg       1.00      1.00      1.00      1843
weighted avg       1.00      1.00      1.00      1843

Validation Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.95       185
           1       0.97      0.90      0.93       141

    accuracy                           0.94       326
   macro avg       0.95      0.94      0.94       326
weighted avg       0.95      0.94      0.94       326



  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 1/40, Loss: 0.6199892163276672


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 2/40, Loss: 0.5178014039993286


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 3/40, Loss: 0.36409199237823486


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 4/40, Loss: 0.2492581307888031


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 5/40, Loss: 0.21088546514511108


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 6/40, Loss: 0.10304641723632812


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 7/40, Loss: 0.2955499589443207


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 8/40, Loss: 0.06632604449987411


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 9/40, Loss: 0.41847988963127136


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 10/40, Loss: 0.19332176446914673


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 11/40, Loss: 0.07992631942033768


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 12/40, Loss: 0.17640449106693268


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 13/40, Loss: 0.043019913136959076


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 14/40, Loss: 0.0367523729801178


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 15/40, Loss: 0.12540782988071442


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 16/40, Loss: 0.06820732355117798


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 17/40, Loss: 0.03638877719640732


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 18/40, Loss: 0.0353793166577816


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 19/40, Loss: 0.08461526781320572


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 20/40, Loss: 0.18869952857494354


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 21/40, Loss: 0.13073986768722534


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 22/40, Loss: 0.039547406136989594


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 23/40, Loss: 0.007553593255579472


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 24/40, Loss: 0.06143418326973915


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 25/40, Loss: 0.0208310317248106


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 26/40, Loss: 0.18860091269016266


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 27/40, Loss: 0.01918468251824379


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 28/40, Loss: 0.027041120454669


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 29/40, Loss: 0.060656748712062836


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 30/40, Loss: 0.008557384833693504


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 31/40, Loss: 0.014815925620496273


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 32/40, Loss: 0.0022921611089259386


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 33/40, Loss: 0.00228399527259171


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 34/40, Loss: 0.006203039083629847


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 35/40, Loss: 0.0030810581520199776


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 36/40, Loss: 0.0017482737312093377


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 37/40, Loss: 0.06201678514480591


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 38/40, Loss: 0.002373198978602886


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 39/40, Loss: 0.004469446837902069


  0%|          | 0/29 [00:00<?, ?it/s]

Epoch 40/40, Loss: 0.01517748087644577
Train accuracy: 0.9989148378372192
Validation accuracy: 0.953987717628479
Train Confusion Matrix:
[[995   0]
 [  2 846]]
Validation Confusion Matrix:
[[177   8]
 [  7 134]]
Train Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       995
           1       1.00      1.00      1.00       848

    accuracy                           1.00      1843
   macro avg       1.00      1.00      1.00      1843
weighted avg       1.00      1.00      1.00      1843

Validation Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       185
           1       0.94      0.95      0.95       141

    accuracy                           0.95       326
   macro avg       0.95      0.95      0.95       326
weighted avg       0.95      0.95      0.95       326



In [None]:
from torchviz import make_dot
# NOTE: these assignments overwrite the notebook-level hyperparameters set in the
# training cell (e.g. num_layers and num_epochs); re-run that cell before retraining.
embedding_dict = tokenizer.vocab_size
embedding_size = 128
hidden_size = 256
num_layers = 1
num_classes = 2
dropout = 0.4
learning_rate = 0.0007
num_epochs = 15

# Initialize the model, loss function and optimizer
model = CombinedLSTM(embedding_dict, embedding_size, hidden_size, num_layers, num_classes, dropout).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# One forward pass on a single batch gives the autograd graph to draw.
batch = next(iter(fw_train_loader))
# The model lives on `device`, so the input ids must be moved there as well;
# otherwise the forward pass fails whenever `device` is not the CPU.
scores = model(batch[0]['input_ids'].to(device))

print(model)
make_dot(scores, params=dict(model.named_parameters())).render("model_images/sequence_combined_lstm", format="png")



CombinedLSTM(
  (embedding): Embedding(30522, 128)
  (dropout): Dropout(p=0.4, inplace=False)
  (lstm): LSTM(128, 256, batch_first=True, dropout=0.4)
  (attention): Linear(in_features=256, out_features=1, bias=True)
  (fc): Linear(in_features=256, out_features=2, bias=True)
)


'model_images\\sequence_combined_lstm.png'