In [1]:
from trainer import torch_trainer
import torch
import transformers 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# dataset for spam data

In [2]:
class dataset(torch.utils.data.Dataset):    
    
    def __init__(self, pd_data, tokenizer =  transformers.BertTokenizer.from_pretrained("bert-base-uncased")):
        self.df = pd_data
        self.tokenizer = tokenizer
        self.l = len(self.df.index)
        
    def __len__(self):
        return self.l
    
    def __getitem__(self, idx):
        utterance = self.df.iloc[[idx]].values.tolist()[0][0]
        tag = self.df.iloc[[idx]].values.tolist()[0][1]
        utterance = utterance[:20] + utterance[-20:]

        tokens = self.tokenizer.tokenize(utterance)
        ids = self.tokenizer.convert_tokens_to_ids(tokens)
        
        input_tensor = torch.tensor(ids)
        target_tensor = torch.tensor(int(tag))
        
        return input_tensor, target_tensor

# for padding zero and generate attention mask
def create_mini_batch(samples):
        input_tensor = [s[0] for s in samples]
        input_tensor = torch.nn.utils.rnn.pad_sequence(input_tensor, batch_first=True)

        target_tensor = torch.stack([s[1] for s in samples])

        masks_tensors = torch.zeros(input_tensor.shape, dtype=torch.long)
        masks_tensors = masks_tensors.masked_fill(input_tensor != 0, 1)

        return input_tensor.long(), masks_tensors.long(), target_tensor.long() 

# models for spam classify

In [3]:
class spam_classifer1(torch.nn.Module):
    
    def __init__(self, pretrain_model, drop):
        super(spam_classifer1, self).__init__()
        
        self.bert = transformers.BertModel.from_pretrained(pretrain_model)
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, 2)
        self.dropout = torch.nn.Dropout(p=drop)
        self.loss_fn = torch.nn.CrossEntropyLoss()
        self.sfmax = torch.nn.Softmax(dim = 1)
        
    def forward(self, input_tensor, masks_tensors, target_tensor):
        bert_pool_out = self.bert(input_tensor, attention_mask=masks_tensors).last_hidden_state[:, 0]
        droup_out = self.dropout(bert_pool_out)
        linear_out = self.linear(droup_out)
        s_out = self.sfmax(linear_out)
        
        if target_tensor != None:
            return self.loss_fn(s_out, target_tensor), torch.argmax(s_out, 1)
        else:
            return torch.argmax(s_out, 1)
        

# load data

In [4]:
sms_spam = pd.read_csv("sms_spam.csv")
train_df, test_df = train_test_split(sms_spam, test_size=0.2)

train_ds =  dataset(train_df)
test_ds =  dataset(test_df)
train_df

Unnamed: 0,message,IsSpam,url_count,email_count,phone_count,len
1819,dunno dat's wat he told me. ok lor...,0,0,0,0,37
5287,hey ! don't forget ... you are mine ... for me...,0,0,0,0,113
4521,do u want 2 meet up 2morro,0,0,0,0,26
1153,1000's of girls many local 2 u who r virgins 2...,1,0,0,0,140
423,urgent! your mobile number has been awarded wi...,1,0,0,1,125
...,...,...,...,...,...,...
3342,"i haven't forgotten you, i might have a couple...",0,0,0,0,91
98,hi. wk been ok - on hols now! yes on for a bit...,0,0,0,0,184
4695,a guy who gets used but is too dumb to realize...,0,0,0,0,50
4068,hope youåõre not having too much fun without m...,0,0,0,0,76


# set torch trainer

In [5]:
from transformers import logging
logging.set_verbosity_error()


trainer = torch_trainer()
trainer.set_model_cls(spam_classifer1)
trainer.set_model_parameter({"pretrain_model":"bert-base-uncased", "drop":0.3})

Created folder result
will use cuda:0
model is setted
parameter is setted


# training

In [6]:
trainer.train_find_best_epoch(train_ds, test_ds, batch=20, batch_fn=create_mini_batch, epochs=2)

Creating model ...done
Creating dataloader ...done
Starting training ...
--------------------------------------------------------------------------------
Epoch: 1 Train Loss: 0.4504292996207695
Epoch: 1 Val Loss: 0.4132642149925232
Epoch 1 is current best!!!  test acc: 0.9
save model to result/best.pt
--------------------------------------------------------------------------------
Epoch: 2 Train Loss: 0.4492179697939099
Epoch: 2 Val Loss: 0.4132630228996277
Epoch 2 is current best!!!  test acc: 0.9
save model to result/best.pt
--------------------------------------------------------------------------------
...Epoch 2 is best!!! acc: 0.9


# cross validation

In [7]:
trainer.do_cross_validation(train_ds, k=5, batch=20, batch_fn=create_mini_batch, epochs=2)

initaial a model ...done
Fold [2, 3, 4, 5] as traing set
start training...
Epoch: 1 Train Loss: 0.45070363614591247
Epoch: 2 Train Loss: 0.44914005569812965
... done
start testing...done
Fold 1 Val Loss: 0.5132624506950378
Fold 1 Val Acc: 0.8
--------------------------------------------------------------------------------
initaial a model ...done
Fold [1, 3, 4, 5] as traing set
start training...
Epoch: 1 Train Loss: 0.45023693044089413
Epoch: 2 Train Loss: 0.44921866473595656
... done
start testing...done
Fold 2 Val Loss: 0.5132625102996826
Fold 2 Val Acc: 0.8
--------------------------------------------------------------------------------
initaial a model ...done
Fold [1, 2, 4, 5] as traing set
start training...
Epoch: 1 Train Loss: 0.44996533786769405
Epoch: 2 Train Loss: 0.4492179333094524
... done
start testing...done
Fold 3 Val Loss: 0.5132604837417603
Fold 3 Val Acc: 0.8
--------------------------------------------------------------------------------
initaial a model ...done
Fold

0.8