In [0]:
import torch
dev = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
print(dev)

cuda


In [0]:
# from google.colab import files
# files.upload()
# !pip install -q kaggle
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !kaggle datasets download -d uciml/sms-spam-collection-dataset
# !unzip sms-spam-collection-dataset.zip

In [0]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
nltk.download('punkt')
nltk.download('stopwords')
tokenizer = RegexpTokenizer(r'\w+')
tqdm.pandas()

MAX_LEN = 128
def make_clean(s) :
  tokens = np.array(tokenizer.tokenize(s.lower()))
  tokens = tokens[~np.isin(tokens, stopwords.words())]
  return ' '.join(tokens)

df = pd.read_csv('spam.csv', encoding = "ISO-8859-1")
df['clean'] = df['v2'].progress_apply(make_clean)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


HBox(children=(FloatProgress(value=0.0, max=5572.0), HTML(value='')))




In [0]:
!pip install transformers
from transformers import BertTokenizer
btokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

input_ids = []
attention_masks = []

for sent in tqdm(df['clean']):
    encoded_dict = btokenizer.encode_plus(
                        sent,
                        add_special_tokens = True,
                        max_length = 64,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor((df['v1'] == 'ham').astype(int).tolist())



HBox(children=(FloatProgress(value=0.0, max=5572.0), HTML(value='')))




In [0]:
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
batch_size = 32

dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size
        )

In [0]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
import torch.nn as nn

# class Net(nn.Module):
#     def __init__(self):
#         super(Net, self).__init__()
#         self.bert = BertForSequenceClassification.from_pretrained(
#                       "bert-base-uncased",
#                       num_labels = 2,
#                       output_attentions = False,
#                       output_hidden_states = False,
#                     )

#     def forward(self, x):
#         x = torch.softmax(self.bert(x)[0], dim=1)
#         return x

# model = Net()
model = BertForSequenceClassification.from_pretrained(
                      "bert-base-uncased",
                      num_labels = 2,
                      output_attentions = False,
                      output_hidden_states = False,
                    )
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [0]:
from transformers import get_linear_schedule_with_warmup
epochs = 50

optimizer = AdamW(model.parameters(), lr = 2e-4)
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [0]:
import random
import numpy as np

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def train_epoch(model) :
    total_train_loss = 0
    for batch in train_dataloader :
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        model.zero_grad()        
        loss, logits = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    avg_train_loss = total_train_loss / len(train_dataloader)
    return avg_train_loss

def cal_metrics(y_true, y_pred) :
  fpr, tpr, thresholds = metrics.roc_curve(y_true, y_pred, pos_label=1)
  return {
      "Accuracy": metrics.accuracy_score(y_true, y_pred),
      "AUC" : metrics.auc(fpr, tpr),
      "f1" : metrics.f1_score(y_true, y_pred, average='macro'),
      "Recall" : metrics.recall_score(y_true, y_pred, average='macro'),
      "Precision" : metrics.precision_score(y_true, y_pred, average='macro'),
  }



from sklearn import metrics
from scipy.special import softmax

def test_epoch(model) :
  true_labels, predictions = [], []
  total_eval_loss = 0
  for batch in validation_dataloader :
      b_input_ids = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_labels = batch[2].to(device)
      with torch.no_grad():        
          (loss, logits) = model(b_input_ids, 
                                  token_type_ids=None, 
                                  attention_mask=b_input_mask,
                                  labels=b_labels)
      total_eval_loss += loss.item()
      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()
      true_labels += label_ids.tolist()
      predictions += softmax(logits, axis=1).argmax(axis=1).tolist()
  avg_val_loss = total_eval_loss / len(validation_dataloader)
  return avg_val_loss, cal_metrics(true_labels, predictions)

train_losses = list()
test_losses = list()
test_metrics = list()
for epoch_i in tqdm(range(epochs)):
  model.train()
  train_losses.append(train_epoch(model))
  model.eval()
  test_res = test_epoch(model)
  test_losses.append(test_res[0])
  test_metrics.append(test_res[1])

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
df = pd.DataFrame(test_metrics)
df['loss'] = test_losses
df['train_loss'] = train_losses
df.to_csv('records.csv')
df.head()

Unnamed: 0,Accuracy,AUC,f1,Recall,Precision,loss,train_loss
0,0.978475,0.921569,0.951287,0.921569,0.98783,0.101303,0.133377
1,0.980269,0.933601,0.956141,0.933601,0.982389,0.102615,0.106446
2,0.86278,0.5,0.463168,0.5,0.43139,0.405313,0.370703
3,0.86278,0.5,0.463168,0.5,0.43139,0.404366,0.400007
4,0.86278,0.5,0.463168,0.5,0.43139,0.400633,0.396784
