In [1]:
!pip install transformers tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import torch.nn as nn
from torch import manual_seed, argmax
import torch.optim as optim
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.metrics import confusion_matrix
import gdown
from zipfile import ZipFile

In [3]:
class Sms_dataset(Dataset):
    def __init__(self, dataframe) -> None:
        super().__init__()
        self.X = dataframe["text"].tolist()
        self.Y = dataframe["annotation"].tolist()
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        x = self.X[idx]
        y_text = self.Y[idx]
        y = 1 if y_text == "spam" else 0
        return x, y

In [4]:
class Model(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        
        self.model = AutoModel.from_pretrained("mariagrandury/roberta-base-finetuned-sms-spam-detection").to("cuda")
        self.tokenizer = AutoTokenizer.from_pretrained("mariagrandury/roberta-base-finetuned-sms-spam-detection")
        self.clf = nn.Linear(768, 2).to("cuda")
        
    def forward(self, x):
        tokens = self.tokenizer(x, return_tensors="pt", padding="longest").to("cuda")
        tokens = tokens.to("cuda")
        out = self.model(**tokens).pooler_output
        return self.clf(out)

In [5]:
def train(dataloader):
    model = Model()
    loss_f = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.clf.parameters(), lr=2e-05)
    scheduler = optim.lr_scheduler.LinearLR(optimizer)
    for epoch in range(2):
        for batch in tqdm(dataloader):
            x, y = batch
            
            optimizer.zero_grad()
            
            out = model(x)
            loss = loss_f(out, y.to("cuda"))
            loss.backward()
            optimizer.step()
            scheduler.step()
            
    return model

In [6]:
def test(dataloader, model):
    model.eval()
    
    y_true, y_pred = [], []
    for batch in dataloader:
        x, y = batch
        
        y_true.extend(list(y))
        out = model(x)
        softed = nn.functional.softmax(out, dim=1)
        pred = argmax(softed, dim=1).detach().tolist()
        y_pred.extend(pred)
                
    return confusion_matrix(y_true, y_pred).ravel()

In [7]:
def download_data():
  zipurl = "https://drive.google.com/file/d/1Y5KBhxOXdFs-hjLSNAjjsaC99wEEAM4x/view?usp=share_link"
  output = "SMS_spam_datasplit.zip"
  gdown.download(url=zipurl, output=output, quiet=False, fuzzy=True)
  with ZipFile(output) as zipf:
    train_df = pd.read_csv(zipf.open("sms_spam_train.tsv"), delimiter="\t")
    test_df = pd.read_csv(zipf.open("sms_spam_test.tsv"), delimiter="\t")

  train_data = Sms_dataset(train_df)
  test_data = Sms_dataset(test_df)
  return train_data, test_data

In [8]:
def main():
    manual_seed(42)
    train_data, test_data = download_data()

    loader = DataLoader(train_data, batch_size=16)
    model = train(loader)

    val_loader = DataLoader(test_data, batch_size=16)
    tn, fp, fn, tp = test(val_loader, model)
    print("tn", tn)
    print("fp", fp)
    print("fn", fn)
    print("tp", tp)
    return tn, fp, fn, tp

In [9]:
def precision(tp, fp):
    return tp/(tp+fp)

def recall(tp, fn):
    return tp/(tp+fn)

def accuracy(tp, tn, fp, fn):
    return (tp+tn)/(tp+tn+fp+fn)

def f1(tp, fp, fn):
    prec = precision(tp, fp)
    rec = recall(tp, fn)
    return 2*prec*rec/(prec + rec)

In [10]:
def spam_is_positive(tn1, fp1, fn1, tp1):
    global tp, tn, fn, fp
    tp, tn, fn, fp = tp1, tn1, fn1, fp1
    
def spam_is_negative():
    global tp, tn, fn, fp
    tn, tp = tp, tn
    fn, fp = fp, fn

In [11]:
tn, fp, fn, tp = main()

Downloading...
From: https://drive.google.com/uc?id=1Y5KBhxOXdFs-hjLSNAjjsaC99wEEAM4x
To: /content/SMS_spam_datasplit.zip
100%|██████████| 287k/287k [00:00<00:00, 74.0MB/s]
Some weights of the model checkpoint at mariagrandury/roberta-base-finetuned-sms-spam-detection were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at mariagrandury/roberta-base-finetuned

tn 964
fp 2
fn 1
tp 148


In [12]:
spam_is_positive(tn, fp, fn, tp)
prec_1 = precision(tp, fp)
rec_1 = recall(tp, fn)
f1_1 = f1(tn, fn, fp)

spam_is_negative()
prec_0 = precision(tp, fp)
rec_0 = recall(tp, fn)
f1_0 = f1(tn, fn, fp)

acc = accuracy(tp, tn, fp, fn)

macro_f1 = (f1_0 + f1_1)/2
macro_prec = (prec_0+prec_1)/2
macro_rec = (rec_0+rec_1)/2

In [13]:
print("accuracy", acc)
print("macro")
print("prec", macro_prec)
print("rec", macro_rec)
print("f1", macro_f1)
print("micro")
print("prec", prec_1)
print("rec", rec_1)
print("f1", f1_1)

accuracy 0.9973094170403587
macro
prec 0.9928151986183074
rec 0.9956090986146429
f1 0.9942064780062664
micro
prec 0.9866666666666667
rec 0.9932885906040269
f1 0.9984464008285863
