# Installing & Importing Dependencies

In [1]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides library deprecation notices — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
!pip install transformers --quiet

In [3]:
# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# SciKit Learn
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix

# Transformers
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from transformers.models.bert.modeling_bert import BertEmbeddings

# Others
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

In [4]:
# Global Config
PRE_TRAINED_MODEL_NAME = 'prajjwal1/bert-tiny'                     # Pre-trained model
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)  # Tokenizer

EPOCHS = 5         # Number of epochs
MAX_LEN = 512      # Length every tokenized message is truncated/padded to
BATCH_SIZE = 16    # Batch size used by every DataLoader

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Reading & Preprocessing Data

In [5]:
### SOURCE, PATH, MSG, SPAM
#   SOURCE - key under which the normalised frame is stored in `data_frames`
#   PATH   - CSV location handed to `pd.read_csv` (local file or URL)
#   MSG    - callable: raw frame -> Series holding the message text
#   SPAM   - callable: raw frame -> Series holding the 0/1 label (1 = spam)
# MSG/SPAM are plain callables rather than code strings passed to `eval`, so the
# extraction logic no longer depends on a local variable being named `df`.
# (On Kaggle the local CSVs live under ../input/cs7650-dataset/Dataset/<Source>/.)
data_configs = [
    ("Youtube", "youtube_train.csv",
     lambda df: df["CONTENT"], lambda df: df["CLASS"]),
    ("SMS", "https://raw.githubusercontent.com/animesharma3/SPAM-SMS-Detection/master/spam_sms_collection.csv",
     lambda df: df["msg"], lambda df: df["spam"]),
    ("Email", "email_spam.csv",
     lambda df: df["text"], lambda df: 1 * (df["label_num"] == 1)),
    ("Twitter", "train.csv",
     lambda df: df["Tweet"], lambda df: 1 * (df["Type"] == "Spam")),
]

# Maps SOURCE -> DataFrame with exactly the columns ['msg', 'spam']
data_frames = {}

for src, path, msg, spam in data_configs:
    df = pd.read_csv(path)
    df['msg'] = msg(df)
    df['spam'] = spam(df)
    df = df[['msg', 'spam']]  # drop every source-specific column
    data_frames[src] = df

In [6]:
# Youtube data: preview the first rows of the normalised (msg, spam) frame
data_frames["Youtube"].head()

Unnamed: 0,msg,spam
0,"Huh, anyway check out this you[tube] channel: ...",1
1,Hey guys check out my new channel and our firs...,1
2,just for test I have to say murdev.com,1
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1


In [7]:
# Twitter data: inspect the single row whose index label is 1
data_frames["Twitter"].loc[1]

msg     Eren sent a glare towards Mikasa then nodded a...
spam                                                    0
Name: 1, dtype: object

In [8]:
# SMS data: preview the first rows of the normalised (msg, spam) frame
data_frames["SMS"].head()

Unnamed: 0,msg,spam
0,go jurong point crazy available bugis n great ...,0
1,ok lar joking wif u oni,0
2,free entry wkly comp win fa cup final tkts st ...,1
3,u dun say early hor u c already say,0
4,nah think go usf life around though,0


In [9]:
# Email data: preview the first rows of the normalised (msg, spam) frame
data_frames["Email"].head()

Unnamed: 0,msg,spam
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0


In [10]:
class SpamDataset(Dataset):
    """Map-style dataset that tokenizes one message per item.

    Args:
        spam: Indexable sequence of integer labels, aligned with ``msgs``.
        msgs: Indexable sequence of messages; each item is cast to ``str``.
        tokenizer: Callable Hugging Face-style tokenizer. It must return a
            mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every message is truncated/padded to.
    """

    def __init__(self, spam, msgs, tokenizer, max_len):
        self.msgs = msgs
        self.spam = spam
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of messages."""
        return len(self.msgs)

    def __getitem__(self, i):
        """Return the raw text, token ids, attention mask and label of item ``i``."""
        msg = str(self.msgs[i])
        spam = self.spam[i]

        # Calling the tokenizer directly with `padding='max_length'` replaces the
        # deprecated `encode_plus(..., pad_to_max_length=True)` spelling.
        encoding = self.tokenizer(
            msg,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'msg': msg,
            # flatten() drops the leading batch axis added by return_tensors='pt'
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'spam': torch.tensor(spam, dtype=torch.long)
        }

In [11]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
    """Wrap a (msg, spam) DataFrame in a batched ``DataLoader``.

    Args:
        df: DataFrame with a ``'msg'`` and a ``'spam'`` column.
        tokenizer: Tokenizer forwarded to ``SpamDataset``.
        max_len: Length every message is truncated/padded to.
        batch_size: Number of items per batch.
        shuffle: Reshuffle the items every epoch. Defaults to ``False``
            (the previous fixed behaviour); pass ``True`` for training data.
        num_workers: Worker processes used for loading. Defaults to 4
            (the previous hard-coded value); use 0 to load in the main process.

    Returns:
        A ``DataLoader`` over a ``SpamDataset`` built from ``df``.
    """
    ds = SpamDataset(
        # to_numpy() gives positional indexing regardless of the frame's index labels
        spam=df['spam'].to_numpy(),
        msgs=df['msg'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers
    )

In [12]:
# Returns train, test, validation data frames
def getDataFrames(source, random_state=None):
    """Split one source's frame into 70% train, 15% test and 15% validation.

    Args:
        source: Key into the module-level ``data_frames`` dict.
        random_state: Optional seed forwarded to both splits so they can be
            reproduced. ``None`` (default) keeps the previous unseeded behaviour.

    Returns:
        Tuple ``(df_train, df_test, df_val)`` — note the order: test before validation.
    """
    df = data_frames[source]
    # First cut: 70% train, 30% held out
    df_train, df_holdout = tts(
        df,
        test_size=0.3,
        shuffle=True,
        random_state=random_state,
    )
    # Second cut: halve the held-out part into validation and test
    df_val, df_test = tts(
        df_holdout,
        test_size=0.5,
        shuffle=True,
        random_state=random_state,
    )
    return df_train, df_test, df_val

In [13]:
# Smoke-test getDataFrames on the Twitter source and report the shape of each split

twitter_train, twitter_test, twitter_val = getDataFrames("Twitter")
for split_name, split_df in (
    ("twitter_train", twitter_train),
    ("twitter_test", twitter_test),
    ("twitter_val", twitter_val),
):
    print(split_name, split_df.shape)

twitter_train (8377, 2)
twitter_test (1796, 2)
twitter_val (1795, 2)


# Models

### 1. CNN

In [14]:
class CNN(nn.Module):
    """1-D convolutional classifier over the pre-trained BERT word embeddings.

    Three convolutions (kernel sizes 2, 3 and 4; 2 output channels each) slide
    along the token axis. Each feature map is max-pooled over its full length,
    and the 6 pooled values feed a linear layer followed by log-softmax.

    Args:
        NUM_CLASSES: Number of output classes.
        DIM_EMB: Kept only for signature compatibility and ignored. The
            convolution's channel count must equal the embedding width, so it
            is read from the embedding table. The old default (``MAX_LEN``)
            made the convolution treat token positions as channels and slide
            along the embedding axis, which only worked for inputs of exactly
            ``MAX_LEN`` tokens.
    """

    def __init__(self, NUM_CLASSES = 2, DIM_EMB = MAX_LEN):
        super(CNN, self).__init__()
        # Both attribute names are kept so existing attribute access and
        # state_dict keys keep working; they refer to the same module.
        self.Embedding = self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME).get_input_embeddings()
        emb_width = self.Embedding.embedding_dim
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=emb_width,
                      out_channels=2,
                      kernel_size=ks)
            for ks in range(2, 5)
        ])
        self.ReLU = nn.ReLU()
        self.MaxPool = nn.MaxPool1d  # the class itself; instantiated per feature map in forward
        # NOTE(review): Dropout is constructed but never applied in forward — confirm intent.
        self.Dropout = nn.Dropout()
        self.Linear = nn.Linear(6, NUM_CLASSES)  # 6 = 3 kernel sizes x 2 output channels
        self.LogSoftmax = nn.LogSoftmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Return per-class log-probabilities; ``attention_mask`` is accepted but unused.

        Assumes ``input_ids`` is (batch, seq_len) — TODO confirm against the data loader.
        """
        E = self.Embedding(input_ids)  # (batch, seq_len, emb_width)
        E = E.permute(0, 2, 1)         # Conv1d expects (batch, channels, length)
        R = [self.ReLU(conv1d(E)) for conv1d in self.conv1d_list]
        # Pool each feature map over its whole length -> (batch, 2, 1)
        M = [self.MaxPool(kernel_size=r.shape[2])(r) for r in R]
        C = torch.cat([m.squeeze(dim=2) for m in M], dim = 1)  # (batch, 6)
        L = self.Linear(C)
        X = self.LogSoftmax(L)
        return X

### 2. BERT

In [15]:
class BERT(nn.Module):
    """Pre-trained BERT encoder with dropout and a linear classification head.

    Args:
        n_classes: Number of output classes produced by the head.
    """

    def __init__(self, n_classes):
        super(BERT, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        hidden_size = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the head's raw outputs computed from element 1 of the encoder output."""
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Element 1 is what the original code names the pooled output
        pooled = encoder_outputs[1]
        return self.out(self.drop(pooled))

### 3. biLSTM

In [16]:
class biLSTM(nn.Module):
    """Bidirectional LSTM classifier over the pre-trained BERT word embeddings.

    The final hidden states of every layer and direction are concatenated and
    mapped to 2 class logits.

    Args:
        h_dim: Hidden size of each LSTM direction.
        max_len: Stored as ``text_len``; not otherwise used by the model.
        emd_dim: Accepted for signature compatibility and unused — the LSTM
            input size is read from the embedding table instead.
        lstm_layers: Number of stacked LSTM layers.
        dropout_rate: Accepted for signature compatibility; currently unused.
    """

    def __init__(self, h_dim=100, max_len=MAX_LEN, emd_dim=100, lstm_layers=1, dropout_rate=0.2):
        super(biLSTM, self).__init__()
        # Both attribute names are kept so existing attribute access and
        # state_dict keys keep working; they refer to the same module.
        self.embeddings = self.bert = BertModel.from_pretrained(
            PRE_TRAINED_MODEL_NAME).get_input_embeddings()

        self.text_len = max_len
        self.lstm_layers = lstm_layers
        self.h_dim = h_dim

        # Input size follows the embedding table instead of a hard-coded 128,
        # so switching PRE_TRAINED_MODEL_NAME cannot cause a shape mismatch.
        self.lstm = nn.LSTM(input_size=self.embeddings.embedding_dim,
                            hidden_size=h_dim,
                            num_layers=lstm_layers,
                            batch_first=True,
                            bidirectional=True)
        self.fc = nn.Linear(2 * lstm_layers * h_dim, 2)
        # Kept for attribute compatibility; no longer applied in forward (see below).
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return raw class logits of shape (batch, 2); ``attention_mask`` is unused.

        The logits are returned unsquashed because the model is trained with
        ``nn.CrossEntropyLoss``, which applies log-softmax itself. The previous
        sigmoid confined the logits to (0, 1) and flattened the loss signal.
        The arg-max prediction is unaffected since sigmoid is monotonic.
        """
        N = input_ids.shape[0]
        embedding_output = self.embeddings(input_ids)
        _, (h, _) = self.lstm(embedding_output)
        # h: (layers * 2, batch, h_dim) -> (batch, layers * 2 * h_dim).
        # reshape() replaces the deprecated Tensor.resize and handles the
        # non-contiguous result of permute().
        h = h.permute(1, 0, 2).reshape(N, 2 * self.lstm_layers * self.h_dim)

        return self.fc(h)

In [17]:
# Registry of candidate models: name -> ModuleDict holding the network ("model")
# and the loss it is trained with ("loss_fn"), both moved to `device`.
_model_specs = [
    ("CNN", CNN(), nn.NLLLoss()),
    ("BERT", BERT(n_classes=2), nn.CrossEntropyLoss()),
    ("biLSTM", biLSTM(h_dim=100, lstm_layers=2), nn.CrossEntropyLoss()),
]

models = {
    name: nn.ModuleDict({
        "model": net.to(device),
        "loss_fn": criterion.to(device)
    })
    for name, net, criterion in _model_specs
}

Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertModel: ['cls.predictions.

# Training Function

In [18]:
def train(
    model,
    loss_fn,
    optimizer,
    scheduler,
    device,
    data_loader,
    n_examples
):
    """Run one training epoch and return ``(accuracy, mean_loss)``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.
        data_loader: Iterable of dicts with ``'input_ids'``, ``'attention_mask'``
            and ``'spam'`` tensors.
        n_examples: Number of examples used as the accuracy denominator.

    Returns:
        ``(train_acc, train_loss)``: accuracy as a double tensor and the mean
        batch loss (``nan`` when ``data_loader`` yields no batch).
    """
    model = model.train() # Setting Model in training mode

    losses = []
    # A tensor accumulator (rather than int 0) keeps `.double()` valid even
    # when the loader yields no batch.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['spam'].to(device)

        # Reset gradients BEFORE the forward/backward pass so stale gradients
        # (e.g. left behind by an interrupted earlier run) never leak into a step.
        optimizer.zero_grad()

        # Forward propagation -> (batch, n_classes)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Calculating Loss
        loss = loss_fn(outputs, targets)

        # Predicted class = index of the largest output per row
        _, preds = torch.max(outputs, dim=1)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        # Backward propagation
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) # Clipping Gradient (Exploding Gradient Problem)
        optimizer.step()
        scheduler.step()

    train_acc = correct_predictions.double() / n_examples
    train_loss = np.mean(losses) if losses else float('nan')

    return train_acc, train_loss

# Evaluation Function

In [19]:
def evaluate_model(
    model,
    loss_fn,
    device,
    data_loader,
    n_examples   
):
    """Evaluate ``model`` without gradient tracking and return ``(accuracy, mean_loss)``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        device: Device the batch tensors are moved to.
        data_loader: Iterable of dicts with ``'input_ids'``, ``'attention_mask'``
            and ``'spam'`` tensors.
        n_examples: Number of examples used as the accuracy denominator.

    Returns:
        ``(eval_acc, eval_loss)``: accuracy as a double tensor and the mean
        batch loss (``nan`` when ``data_loader`` yields no batch).
    """
    model = model.eval() # Setting Model in evaluation mode

    losses = []
    # A tensor accumulator (rather than int 0) keeps `.double()` valid even
    # when the loader yields no batch.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['spam'].to(device)

            # Forward propagation -> (batch, n_classes)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # Calculating Loss
            loss = loss_fn(outputs, targets)

            # Predicted class = index of the largest output per row
            _, preds = torch.max(outputs, dim=1)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    eval_acc = correct_predictions.double() / n_examples
    eval_loss = np.mean(losses) if losses else float('nan')

    return eval_acc, eval_loss

In [20]:
def run_epochs(p):
    """Train for ``EPOCHS`` epochs, validate on every source and checkpoint the best model.

    Args:
        p: Dict with the keys ``"model"``, ``"loss_fn"``, ``"optimizer"``,
            ``"scheduler"``, ``"device"``, ``"train_data_loader"``, ``"df_train"``,
            ``"val_data_loader"`` (source -> loader), ``"df_vals"``
            (source -> frame), ``"modelName"`` and ``"data_source"``
            (the last two only build the checkpoint file name).

    Returns:
        ``history``: dict of per-epoch lists. ``'train_acc'`` / ``'train_loss'``
        hold the training metrics, ``'val_acc_<source>'`` / ``'val_loss_<source>'``
        the per-source validation metrics, and ``'val_acc'`` / ``'val_loss'``
        their unweighted mean over all sources. All values are plain floats.
    """
    history = defaultdict(list)
    best_accuracy = 0
    # Use the model that was passed in, not whatever global `model` happens to exist.
    model = p["model"]

    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 10)

        train_acc, train_loss = train(
            model,
            p["loss_fn"],
            p["optimizer"],
            p["scheduler"],
            p["device"],
            p["train_data_loader"],
            len(p["df_train"])
        )

        print(f'Train loss {train_loss} accuracy {train_acc}')

        epoch_val_accs = []
        epoch_val_losses = []
        for k, v in p['val_data_loader'].items():
            print(k)
            val_acc, val_loss = evaluate_model(
                model,
                p["loss_fn"],
                p["device"],
                v,
                len(p['df_vals'][k])
            )

            print(f'Validation loss {val_loss} accuracy {val_acc}')
            print()

            # float() detaches the metric from its device so it can be plotted later
            epoch_val_accs.append(float(val_acc))
            epoch_val_losses.append(float(val_loss))
            history[f'val_acc_{k}'].append(float(val_acc))
            history[f'val_loss_{k}'].append(float(val_loss))

        # Aggregate over ALL validation sources; previously only the source that
        # happened to be iterated last drove the history and the checkpointing.
        mean_val_acc = float(np.mean(epoch_val_accs)) if epoch_val_accs else float('nan')
        mean_val_loss = float(np.mean(epoch_val_losses)) if epoch_val_losses else float('nan')

        history['train_acc'].append(float(train_acc))
        history['train_loss'].append(float(train_loss))
        history['val_acc'].append(mean_val_acc)
        history['val_loss'].append(mean_val_loss)

        # A nan mean (no validation sources) compares False, so nothing is saved then.
        if mean_val_acc > best_accuracy:
            torch.save(model.state_dict(), f'M_{p["modelName"]}_D_{p["data_source"]}_best_model_state.bin')
            best_accuracy = mean_val_acc

    return history

# Run Models

In [21]:
# Per-source evaluation loaders and the frames they were built from
test_dls = {}
val_dls = {}

df_vals = {}
df_tests = {}

# Training splits are collected here and concatenated once after the loop
train_frames = []

for data_source in data_frames.keys():
    df_train, df_test, df_val = getDataFrames(data_source)
    train_frames.append(df_train)

    test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)
    test_dls[data_source] = test_data_loader
    df_tests[data_source] = df_test

    val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
    val_dls[data_source] = val_data_loader
    df_vals[data_source] = df_val

if train_frames:
    df_combined = pd.concat(train_frames)
else:
    df_combined = pd.DataFrame([], columns=['msg', 'spam'])
df_combined['spam'] = df_combined['spam'].astype(int)
# Shuffle the rows: the concatenated frame is grouped by source, and the loader
# built below iterates in frame order, so without this every epoch would see
# one whole source after the other.
df_combined = df_combined.sample(frac=1, random_state=0).reset_index(drop=True)

df_train = df_combined
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)

In [24]:
%%time

for modelName in models.keys():
    if modelName == 'CNN':
      continue
      
    print(f"________ MODEL: {modelName} ________")
    
    ### Model Unpacking
    m = models[modelName]
    model = m["model"]
    loss_fn = m["loss_fn"]
    # optimizer = AdamW(model.parameters(), lr=2e-3, correct_bias=False)
    optimizer = optim.Adam(model.parameters(), lr = 0.001)

    print(f"________ {modelName}________")

        # Total Steps & Scheduler
    total_steps = len(train_data_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=0,
      num_training_steps=total_steps
    )
    
    # Parameters
    params = {
        "modelName": modelName,
        "data_source": data_source,
        "model": model,
        "loss_fn": loss_fn,
        "optimizer": optimizer,
        "scheduler": scheduler,
        "device": device,
        "train_data_loader": train_data_loader,
        "val_data_loader": val_dls,
        "df_train" :df_train,
        "df_vals": df_vals,
        "df_tests": df_tests
    }

    run_epochs(params)
    
    # test_acc, _ = evaluate_model(
    #     model,
    #     loss_fn,
    #     device,
    #     test_data_loader,
    #     len(df_test)
    # )
    # print(f"TEST ACCURACY: \t {test_acc.item()}")

KeyboardInterrupt: ignored