In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn  # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim  # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F  # All functions that don't have any parameters
from torch.utils.data import (DataLoader)  # Gives easier dataset managment and creates mini batches
#import torchvision  # torch package for vision related things
#import torchvision.datasets as datasets  # Has standard datasets we can import in a nice way
#import torchvision.transforms as transforms
import pandas as pd
import random
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from tqdm.auto import tqdm
from transformers import DistilBertTokenizer

In [None]:
# Load the training CSV, shuffle it reproducibly and split it 85% / 15% into
# train and validation frames. The test set is read from its own CSV.
SEED = 42  # single seed so the split, weight init and batch order repeat across runs
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

data = pd.read_csv('SpamDataset/actual_train.csv')
# data = pd.read_csv('TSwift_ShakeItOff_Spam.csv')
data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)  # Shuffle the data
split_at = int(0.85 * len(data))
train_data = data[:split_at]
val_data = data[split_at:]

test_data = pd.read_csv('SpamDataset/actual_test.csv')
# Shuffle the test rows themselves; train_data / val_data above are already
# fixed slices, so shuffling `data` again at this point would change nothing.
test_data = test_data.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Tokenize the data using the DistilBERT (uncased) vocabulary
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Common dataset for models from LSTM_RNN, LikeCount, and FeatureCombination
class SpamDataset(torch.utils.data.Dataset):
    """Row-wise view over a comments DataFrame.

    Every item is the tuple ``(CONTENT, CLASS, LIKE_COUNT, SUMMARY)`` read
    from the identically named columns. Text is left untokenized here and is
    encoded per batch by :meth:`collate_fn`.
    """

    def __init__(self, data):
        # Frame that __getitem__ indexes positionally through .iloc.
        self.data = data

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(comment, label, like_count, summary)`` for row ``idx``."""
        record = self.data.iloc[idx]
        text, likes, summary_text, target = (
            record[column] for column in ('CONTENT', 'LIKE_COUNT', 'SUMMARY', 'CLASS')
        )
        return text, target, likes, summary_text

    def collate_fn(self, batch):
        """Turn a list of items into model-ready tensors.

        Comments are tokenized to a fixed length of 30 and summaries to a fixed
        length of 70 with the module-level ``tokenizer``.

        Returns:
            ``(comment_tokens, labels, like_counts, summary_tokens)`` where the
            token entries are the tokenizer outputs and the other two are
            tensors built from the per-item values.
        """
        texts, targets, likes, summaries = (
            [sample[position] for sample in batch] for position in range(4)
        )
        encode = tokenizer.batch_encode_plus
        encoded_comments = encode(texts, truncation=True, padding='max_length', max_length=30, return_tensors='pt')
        encoded_summaries = encode(summaries, truncation=True, padding='max_length', max_length=70, return_tensors='pt')
        return encoded_comments, torch.tensor(targets), torch.tensor(likes), encoded_summaries

# Wrap each split so it yields (comment, label, like_count, summary) items.
train_dataset = SpamDataset(train_data)
val_dataset = SpamDataset(val_data)
test_dataset = SpamDataset(test_data)

# Load Data and collate it
batch_size = 64
# Only the training loader is shuffled: evaluation metrics do not depend on
# batch order, and leaving val/test unshuffled keeps evaluation from
# consuming random numbers that would otherwise perturb the training order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=train_dataset.collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=val_dataset.collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=test_dataset.collate_fn)

# Dataset for KaggleWithSummary models
class CommentSummaryDataset(torch.utils.data.Dataset):
    """Dataset whose text is the comment followed by its summary.

    Items are ``(text, label)`` with ``text = CONTENT + ' ' + SUMMARY`` and
    ``label`` taken from the ``CLASS`` column.
    """

    def __init__(self, data):
        # Frame that __getitem__ indexes positionally through .iloc.
        self.data = data

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(comment + ' ' + summary, label)`` for row ``idx``."""
        record = self.data.iloc[idx]
        joined_text = ' '.join((record['CONTENT'], record['SUMMARY'])) if False else record['CONTENT'] + ' ' + record['SUMMARY']
        return joined_text, record['CLASS']

    def collate_fn(self, batch):
        """Tokenize the batch texts to a fixed length of 100 with the
        module-level ``tokenizer`` and return ``(tokens, label_tensor)``."""
        texts, targets = ([sample[position] for sample in batch] for position in range(2))
        encoded = tokenizer.batch_encode_plus(
            texts,
            truncation=True,
            padding='max_length',
            max_length=100,
            return_tensors='pt',
        )
        return encoded, torch.tensor(targets)
    
class SummaryCommentDataset(torch.utils.data.Dataset):
    """Dataset whose text is the summary followed by the comment.

    Items are ``(text, label)`` with ``text = SUMMARY + ' ' + CONTENT`` and
    ``label`` taken from the ``CLASS`` column.
    """

    def __init__(self, data):
        # Frame that __getitem__ indexes positionally through .iloc.
        self.data = data

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(summary + ' ' + comment, label)`` for row ``idx``."""
        record = self.data.iloc[idx]
        joined_text = record['SUMMARY'] + ' ' + record['CONTENT']
        return joined_text, record['CLASS']

    def collate_fn(self, batch):
        """Tokenize the batch texts to a fixed length of 100 with the
        module-level ``tokenizer`` and return ``(tokens, label_tensor)``."""
        texts, targets = ([sample[position] for sample in batch] for position in range(2))
        encoded = tokenizer.batch_encode_plus(
            texts,
            truncation=True,
            padding='max_length',
            max_length=100,
            return_tensors='pt',
        )
        return encoded, torch.tensor(targets)
    
# "fw" = comment followed by summary, "bw" = summary followed by comment.
fw_train_dataset = CommentSummaryDataset(train_data)
fw_val_dataset = CommentSummaryDataset(val_data)
fw_test_dataset = CommentSummaryDataset(test_data)

bw_train_dataset = SummaryCommentDataset(train_data)
bw_val_dataset = SummaryCommentDataset(val_data)
bw_test_dataset = SummaryCommentDataset(test_data)

# Load Data and collate it
batch_size = 64
# Only the training loaders are shuffled: evaluation metrics do not depend on
# batch order, and leaving val/test unshuffled keeps evaluation from
# consuming random numbers that would otherwise perturb the training order.
fw_train_loader = DataLoader(fw_train_dataset, batch_size=batch_size, shuffle=True, collate_fn=fw_train_dataset.collate_fn)
fw_val_loader = DataLoader(fw_val_dataset, batch_size=batch_size, shuffle=False, collate_fn=fw_val_dataset.collate_fn)
fw_test_loader = DataLoader(fw_test_dataset, batch_size=batch_size, shuffle=False, collate_fn=fw_test_dataset.collate_fn)

bw_train_loader = DataLoader(bw_train_dataset, batch_size=batch_size, shuffle=True, collate_fn=bw_train_dataset.collate_fn)
bw_val_loader = DataLoader(bw_val_dataset, batch_size=batch_size, shuffle=False, collate_fn=bw_val_dataset.collate_fn)
bw_test_loader = DataLoader(bw_test_dataset, batch_size=batch_size, shuffle=False, collate_fn=bw_test_dataset.collate_fn)

# All the Models

In [None]:
# Recurrent neural network with LSTM (many-to-one) for sequence classification to produce a binary output
## The network includes an Embedding layer, an LSTM layer and a Linear layer
class RNN_LSTM(nn.Module):
    """Many-to-one LSTM classifier over token ids.

    Pipeline: embedding lookup -> dropout -> stacked LSTM -> linear layer
    applied to the final hidden state of the last LSTM layer.

    Args:
        embedding_dict: number of rows in the embedding table (vocabulary size).
        embedding_size: width of each token embedding.
        hidden_size: width of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output logits.
        dropout: probability used on the embeddings and passed to ``nn.LSTM``.
    """

    def __init__(self, embedding_dict, embedding_size, hidden_size, num_layers, num_classes, dropout):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(num_embeddings=embedding_dict, embedding_dim=embedding_size)
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Map token ids ``x`` of shape (batch, seq_len) to logits of shape
        (batch, num_classes)."""
        # (batch, seq_len) -> (batch, seq_len, embedding_size)
        embedded = self.dropout(self.embedding(x))
        # final_hidden: (num_layers, batch, hidden_size); only the top layer is used
        _, (final_hidden, _) = self.lstm(embedded)
        top_layer_state = final_hidden[-1]
        return self.fc(top_layer_state)
    
# Bidirectional LSTM network for sequence classification to produce a binary output
## The network includes an Embedding layer, a Bidirectional LSTM layer and a Linear layer
class BiLSTM(nn.Module):
    """Bidirectional LSTM classifier over token ids.

    Pipeline: embedding lookup -> dropout -> stacked bidirectional LSTM ->
    linear layer applied to the concatenation of the last two entries of the
    final hidden state (``h_n[-2]`` and ``h_n[-1]``).

    Args:
        embedding_dict: number of rows in the embedding table (vocabulary size).
        embedding_size: width of each token embedding.
        hidden_size: width of the LSTM hidden state per direction.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output logits.
        dropout: probability used on the embeddings and passed to ``nn.LSTM``.
    """

    def __init__(self, embedding_dict, embedding_size, hidden_size, num_layers, num_classes, dropout):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(num_embeddings=embedding_dict, embedding_dim=embedding_size)
        self.dropout = nn.Dropout(p=dropout)
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True,
        )
        # Two directions are concatenated, hence twice the hidden width.
        self.fc = nn.Linear(in_features=2 * hidden_size, out_features=num_classes)

    def forward(self, x):
        """Map token ids ``x`` of shape (batch, seq_len) to logits of shape
        (batch, num_classes)."""
        # (batch, seq_len) -> (batch, seq_len, embedding_size)
        embedded = self.dropout(self.embedding(x))
        # final_hidden: (num_layers * 2, batch, hidden_size)
        _, (final_hidden, _) = self.lstm(embedded)
        both_directions = torch.cat((final_hidden[-2], final_hidden[-1]), dim=1)
        # both_directions: (batch, hidden_size * 2)
        return self.fc(both_directions)

# An Attention-LSTM network for sequence classification to produce a binary output
## The network includes an Embedding layer, an Attention layer, an LSTM layer and a Linear layer
class AttentionLSTM(nn.Module):
    """LSTM classifier that pools the sequence with learned attention.

    Pipeline: embedding lookup -> dropout -> stacked LSTM -> one scalar score
    per time step -> softmax over the sequence axis -> weighted sum of the
    LSTM outputs -> linear layer. Every position contributes to the softmax;
    no padding mask is applied.

    Args:
        embedding_dict: number of rows in the embedding table (vocabulary size).
        embedding_size: width of each token embedding.
        hidden_size: width of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output logits.
        dropout: probability used on the embeddings and passed to ``nn.LSTM``.
    """

    def __init__(self, embedding_dict, embedding_size, hidden_size, num_layers, num_classes, dropout):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(num_embeddings=embedding_dict, embedding_dim=embedding_size)
        self.dropout = nn.Dropout(p=dropout)
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.attention = nn.Linear(in_features=hidden_size, out_features=1)
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Map token ids ``x`` of shape (batch, seq_len) to logits of shape
        (batch, num_classes)."""
        # (batch, seq_len) -> (batch, seq_len, embedding_size)
        embedded = self.dropout(self.embedding(x))
        # per_step: (batch, seq_len, hidden_size)
        per_step, _ = self.lstm(embedded)
        # step_weights: (batch, seq_len, 1), normalised along the sequence axis
        step_weights = F.softmax(self.attention(per_step), dim=1)
        # pooled: (batch, hidden_size)
        pooled = (step_weights * per_step).sum(dim=1)
        return self.fc(pooled)
    
## Extends the AttentionLSTM network with the comment's like count as an additional input feature
class LikeCombinedLSTM(nn.Module):
    """Attention-pooled LSTM classifier with the like count as an extra feature.

    The comment is encoded as embedding -> dropout -> stacked LSTM ->
    softmax attention pooling over the sequence axis. The like count is
    appended as one additional column before the final linear layer.

    Args:
        embedding_dict: number of rows in the embedding table (vocabulary size).
        embedding_size: width of each token embedding.
        hidden_size: width of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output logits.
        dropout: probability used on the embeddings and passed to ``nn.LSTM``.
    """

    def __init__(self, embedding_dict, embedding_size, hidden_size, num_layers, num_classes, dropout):
        super(LikeCombinedLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(embedding_dict, embedding_size)
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.attention = nn.Linear(hidden_size, 1)
        self.fc = nn.Linear(hidden_size + 1, num_classes)  # +1 for like count

    def forward(self, comment_sequence, like_count):
        """Compute class logits.

        Args:
            comment_sequence: token ids, shape (batch_size, max_seq_length).
            like_count: one value per sample, shape (batch_size,) or
                (batch_size, 1); any numeric dtype is accepted.

        Returns:
            Logits of shape (batch_size, num_classes).
        """
        embedding = self.dropout(self.embedding(comment_sequence))
        # embedding shape: (batch_size, max_seq_length, embedding_size)
        output, _ = self.lstm(embedding)
        # output shape: (batch_size, max_seq_length, hidden_size)
        attention_weights = F.softmax(self.attention(output), dim=1)
        # attention_weights shape: (batch_size, max_seq_length, 1)
        out = torch.sum(attention_weights * output, dim=1)
        # out shape: (batch_size, hidden_size)

        # The collate step builds like counts with torch.tensor(list_of_ints),
        # i.e. an integer tensor. Normalise it to a float column that matches
        # the pooled features so the concatenation and the linear layer do not
        # depend on implicit type promotion or on the caller's tensor shape.
        like_feature = like_count.reshape(-1, 1).to(dtype=out.dtype)
        out = torch.cat((out, like_feature), 1)
        # out shape: (batch_size, hidden_size + 1)
        out = self.fc(out)
        # out shape: (batch_size, num_classes)
        return out
    
# Same architecture as the AttentionLSTM network; it is fed text that combines the comment with its summary
class SummaryCombinedLSTM(nn.Module):
    """Attention-pooled LSTM classifier over a single token sequence.

    Pipeline: embedding lookup -> dropout -> stacked LSTM -> one scalar score
    per time step -> softmax over the sequence axis -> weighted sum of the
    LSTM outputs -> linear layer. The model itself takes one id sequence; any
    combination of comment and summary text happens before tokenization.

    Args:
        embedding_dict: number of rows in the embedding table (vocabulary size).
        embedding_size: width of each token embedding.
        hidden_size: width of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output logits.
        dropout: probability used on the embeddings and passed to ``nn.LSTM``.
    """

    def __init__(self, embedding_dict, embedding_size, hidden_size, num_layers, num_classes, dropout):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(num_embeddings=embedding_dict, embedding_dim=embedding_size)
        self.dropout = nn.Dropout(p=dropout)
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.attention = nn.Linear(in_features=hidden_size, out_features=1)
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, comment_sequence):
        """Map token ids of shape (batch, seq_len) to logits of shape
        (batch, num_classes)."""
        # (batch, seq_len) -> (batch, seq_len, embedding_size)
        embedded = self.dropout(self.embedding(comment_sequence))
        # per_step: (batch, seq_len, hidden_size)
        per_step, _ = self.lstm(embedded)
        # step_weights: (batch, seq_len, 1), normalised along the sequence axis
        step_weights = F.softmax(self.attention(per_step), dim=1)
        # pooled: (batch, hidden_size)
        pooled = (step_weights * per_step).sum(dim=1)
        return self.fc(pooled)
    
# Extends the AttentionLSTM network with two additional inputs: a separately encoded summary and the like count
class FeatureCombinedLSTM(nn.Module):
    """Two-branch attention LSTM classifier plus the like count.

    The comment and the summary are each encoded by their own
    embedding -> dropout -> stacked LSTM -> attention-pooling branch (no
    weights are shared). The two pooled vectors and the like count are
    concatenated and passed to one linear layer.

    Args:
        embedding_dict: number of rows in each embedding table (vocabulary size).
        embedding_size: width of each token embedding.
        hidden_size: width of each LSTM hidden state.
        num_layers: number of stacked LSTM layers per branch.
        num_classes: number of output logits.
        dropout: probability used on the embeddings and passed to ``nn.LSTM``.
    """

    def __init__(self, embedding_dict, embedding_size, hidden_size, num_layers, num_classes, dropout):
        super(FeatureCombinedLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Comment branch
        self.embedding = nn.Embedding(embedding_dict, embedding_size)
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.attention = nn.Linear(hidden_size, 1)

        # Summary branch
        self.embedding_summary = nn.Embedding(embedding_dict, embedding_size)
        self.dropout_summary = nn.Dropout(dropout)
        self.lstm_summary = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.attention_summary = nn.Linear(hidden_size, 1)

        self.fc = nn.Linear(2 * hidden_size + 1, num_classes)  # +1 for like count

    @staticmethod
    def _attention_pool(sequence_output, scorer):
        """Collapse (batch, seq_len, hidden) to (batch, hidden) using softmax
        weights over the sequence axis produced by ``scorer``."""
        weights = F.softmax(scorer(sequence_output), dim=1)
        # weights shape: (batch_size, max_seq_length, 1)
        return torch.sum(weights * sequence_output, dim=1)

    def forward(self, comment_sequence, like_count, summary_sequence):
        """Compute class logits.

        Args:
            comment_sequence: comment token ids, shape (batch_size, comment_len).
            like_count: one value per sample, shape (batch_size,) or
                (batch_size, 1); any numeric dtype is accepted.
            summary_sequence: summary token ids, shape (batch_size, summary_len).

        Returns:
            Logits of shape (batch_size, num_classes).
        """
        # Evaluate the comment sequence
        comment_embedding = self.dropout(self.embedding(comment_sequence))
        # embedding shape: (batch_size, comment_len, embedding_size)
        output_comments, _ = self.lstm(comment_embedding)
        # output shape: (batch_size, comment_len, hidden_size)
        output_comments = self._attention_pool(output_comments, self.attention)
        # pooled shape: (batch_size, hidden_size)

        # Evaluate the summary sequence
        summary_embedding = self.dropout_summary(self.embedding_summary(summary_sequence))
        # embedding shape: (batch_size, summary_len, embedding_size)
        output_summary, _ = self.lstm_summary(summary_embedding)
        # output shape: (batch_size, summary_len, hidden_size)
        output_summary = self._attention_pool(output_summary, self.attention_summary)
        # pooled shape: (batch_size, hidden_size)

        # The collate step builds like counts with torch.tensor(list_of_ints),
        # i.e. an integer tensor. Normalise it to a float column that matches
        # the pooled features so the concatenation and the linear layer do not
        # depend on implicit type promotion or on the caller's tensor shape.
        like_feature = like_count.reshape(-1, 1).to(dtype=output_comments.dtype)

        # Combine the comment and summary feature outputs with the like count
        out = torch.cat((output_comments, output_summary, like_feature), 1)
        # out shape: (batch_size, 2 * hidden_size + 1)
        out = self.fc(out)
        # out shape: (batch_size, num_classes)
        return out

In [None]:
# Validation
def check_accuracy(loader, model):
    """Evaluate ``model`` on every batch of ``loader``.

    Each batch must be a sequence whose first element is a mapping containing
    ``'input_ids'`` and whose second element is the label tensor. Any further
    elements (for example like counts or summary tokens) are ignored, so both
    the two-item and the four-item loaders in this notebook are accepted.

    Args:
        loader: iterable of batches as described above.
        model: module called as ``model(input_ids)`` that returns per-class
            scores with the class axis at dim 1.

    Returns:
        ``(accuracy, predictions, targets)`` where ``accuracy`` is
        ``num_correct / num_samples`` (a 0-dim tensor) and the other two are
        flat Python lists of predicted and true labels in iteration order.

    Note:
        The model is switched to eval mode and left there; callers that
        continue training must call ``model.train()`` themselves.
    """
    num_correct = 0
    num_samples = 0
    predictions = []
    targets = []
    model.eval()

    # Evaluate on the device the model already lives on, so this function does
    # not depend on a notebook-level `device` variable. A model without
    # parameters falls back to the CPU.
    first_param = next(model.parameters(), None)
    eval_device = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():
        for batch in loader:
            comments, target = batch[0], batch[1]
            # Only the token ids are fed to the model, so only they are moved.
            input_ids = comments['input_ids'].to(eval_device)
            target = target.to(eval_device)

            scores = model(input_ids)
            predicted = scores.argmax(dim=1)
            num_correct += (predicted == target).sum()
            num_samples += predicted.size(0)

            predictions.extend(predicted.tolist())
            targets.extend(target.tolist())

    # model.train()
    accuracy = num_correct / num_samples
    return accuracy, predictions, targets

def results(model, train_loader, val_loader, test_loader=None):
    """Print accuracy, confusion matrix and classification report per split.

    Args:
        model: model forwarded to ``check_accuracy``.
        train_loader: loader for the training split.
        val_loader: loader for the validation split.
        test_loader: loader for the test split. When omitted, the
            notebook-level ``test_loader`` is used, which keeps existing
            three-argument calls working.

    Output order is: the three accuracies, then the three confusion matrices,
    then the three classification reports.
    """
    if test_loader is None:
        # The parameter shadows the global of the same name, so the
        # backward-compatible default has to be looked up explicitly.
        test_loader = globals()['test_loader']

    splits = (
        ('Train', train_loader),
        ('Validation', val_loader),
        ('Test', test_loader),
    )
    # Evaluate every split once; each entry is (name, accuracy, preds, targets).
    evaluated = [(name, *check_accuracy(loader, model)) for name, loader in splits]

    for name, acc, _, _ in evaluated:
        print(f'{name} accuracy: {acc}')

    for name, _, preds, targets in evaluated:
        print(f'{name} Confusion Matrix:\n{confusion_matrix(targets, preds)}')

    # Classification report
    for name, _, preds, targets in evaluated:
        print(f'{name} Classification Report:\n{classification_report(targets, preds)}')

In [None]:
## Common device and criterion
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()


def train_classifier(model, loader, optimizer, loss_fn, num_epochs, device):
    """Train a classifier that takes only token ids as input.

    Each batch must be a sequence whose first element is a mapping containing
    ``'input_ids'`` and whose second element is the label tensor. Further
    elements are ignored, so both the four-item ``train_loader`` batches and
    the two-item fw/bw batches are accepted.

    Args:
        model: module called as ``model(input_ids)``.
        loader: iterable of batches as described above; must support ``len``.
        optimizer: optimizer already bound to ``model``'s parameters.
        loss_fn: callable ``loss_fn(scores, targets)`` returning a scalar loss.
        num_epochs: number of passes over ``loader``.
        device: device the batch tensors are moved to.

    Returns:
        The same ``model`` instance, trained in place.
    """
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        loop = tqdm(loader, total=len(loader), leave=False)
        loop.set_description(f'Epoch [{epoch + 1}/{num_epochs}]')
        for batch in loop:
            input_ids = batch[0]['input_ids'].to(device)
            targets = batch[1].to(device)
            # forward
            scores = model(input_ids)
            loss = loss_fn(scores, targets)
            # backward
            optimizer.zero_grad()
            loss.backward()
            # gradient descent or adam step
            optimizer.step()
            # update progress bar
            batch_loss = loss.item()
            running_loss += batch_loss
            loop.set_postfix(loss=batch_loss)
        # Report the mean over the epoch rather than the last mini-batch only.
        mean_loss = running_loss / max(len(loader), 1)
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {mean_loss}')
    return model


## Original 3 models
embedding_dict = tokenizer.vocab_size
embedding_size = 128
hidden_size = 256
NUM_LAYERS = [4]
num_classes = 2
DROPOUT = [0.4]
LR = [0.0007]
NUM_EPOCH = [40]

# Grid over the hyperparameter lists. Each combination rebinds model_LSTM,
# model_BiLSTM and model_AttentionLSTM, so only the last combination's models
# remain available afterwards.
for num_layers in NUM_LAYERS:
    for dropout in DROPOUT:
        for learning_rate in LR:
            for num_epochs in NUM_EPOCH:
                model_args = (embedding_dict, embedding_size, hidden_size, num_layers, num_classes, dropout)

                # RNN_LSTM
                print('RNN_LSTM Training')
                model_LSTM = RNN_LSTM(*model_args).to(device)
                optimizer_LSTM = optim.Adam(model_LSTM.parameters(), lr=learning_rate)
                train_classifier(model_LSTM, train_loader, optimizer_LSTM, criterion, num_epochs, device)

                # BiLSTM
                print('BiLSTM Training')
                model_BiLSTM = BiLSTM(*model_args).to(device)
                optimizer_BiLSTM = optim.Adam(model_BiLSTM.parameters(), lr=learning_rate)
                train_classifier(model_BiLSTM, train_loader, optimizer_BiLSTM, criterion, num_epochs, device)

                # AttentionLSTM
                print('AttentionLSTM Training')
                model_AttentionLSTM = AttentionLSTM(*model_args).to(device)
                optimizer_AttentionLSTM = optim.Adam(model_AttentionLSTM.parameters(), lr=learning_rate)
                train_classifier(model_AttentionLSTM, train_loader, optimizer_AttentionLSTM, criterion, num_epochs, device)

## 2 Summary Models
embedding_dict = tokenizer.vocab_size
embedding_size = 128
hidden_size = 256
num_layers = 4
num_classes = 2
dropout = 0.4
learning_rate = 0.0007
num_epochs = 40

# Train the forwards model (comment followed by summary)
modelFW = SummaryCombinedLSTM(embedding_dict, embedding_size, hidden_size, num_layers, num_classes, dropout).to(device)
optimizer_FW = optim.Adam(modelFW.parameters(), lr=learning_rate)
train_classifier(modelFW, fw_train_loader, optimizer_FW, criterion, num_epochs, device)

# Train the backwards model (summary followed by comment)
modelBW = SummaryCombinedLSTM(embedding_dict, embedding_size, hidden_size, num_layers, num_classes, dropout).to(device)
optimizer_BW = optim.Adam(modelBW.parameters(), lr=learning_rate)
train_classifier(modelBW, bw_train_loader, optimizer_BW, criterion, num_epochs, device)

# `optimizer` used to end up bound to the backwards model's optimizer; keep
# that name available in case a later cell refers to it.
optimizer = optimizer_BW



In [None]:
# Report train / validation / test metrics, first for the forwards model
# (comment + summary) and then for the backwards model (summary + comment).
for fitted_model, split_loaders in (
    (modelFW, (fw_train_loader, fw_val_loader, fw_test_loader)),
    (modelBW, (bw_train_loader, bw_val_loader, bw_test_loader)),
):
    results(fitted_model, *split_loaders)