In [1]:
import pickle
import torch
import torch.nn as nn
import numpy as np

from transformers import BertTokenizer

In [2]:
from models import model_1 as mm
import pandas as pd

### Initalizing Model

In [3]:
d_model = 768
max_token = 512
nhead = 12
nlayer = 6
device = "cuda"

In [4]:
with open("bert_embedding.pkl", "rb") as file:
    embd = pickle.load(file)

In [5]:
model = mm.ad_transformer(d_model,nhead, nlayer, max_token, embd, device).to(device)

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#### Initializing Default Weights

In [7]:
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

### Loading Dataset

In [8]:
df = pd.read_csv("dataset/spam_train.csv")

In [9]:
df = df.drop("len", axis = 1)

In [10]:
df.head()

Unnamed: 0,text_type,text
0,spam,perfect visual solution for your business now ...
1,spam,an innovative plan for today s market give you...
2,spam,all graphics software available cheap oem vers...
3,spam,perfect logo charset koi 8 r thinking of breat...
4,spam,back to happy and healthy life we ve created a...


In [11]:
### Removing Sequence more than 500 tokens

In [12]:
tok_threshold = []
for i in range(len(df) - 1, -1, -1):
    tok_len = len(tokenizer(df.iloc[i,1], return_tensors="pt", padding=True)["input_ids"][0])
    if tok_len > 500:
        tok_threshold.append(i)

Token indices sequence length is longer than the specified maximum sequence length for this model (965 > 512). Running this sequence through the model will result in indexing errors


In [13]:
tok_threshold

[859]

In [14]:
df.drop(859, inplace=True)

In [15]:
df = df.iloc[:10000, :]

In [16]:
df = df.sample(frac=1).reset_index(drop=True)

### Initializng Loss and Update Function

In [17]:
optim = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [18]:
criterian = nn.CrossEntropyLoss(reduction='none')

### Training Model

In [21]:
### Assuming [spam, ham]

In [29]:
batch_size = 50
total_datapoint = 10000
num_epoch = 2

In [32]:
total_loss = 0
for epoch in range(num_epoch):

    
    print("### Epoch " + str(epoch) + " Completed. ###")
    print("Avg Loss : " + str(batch_size*total_loss/total_datapoint))
    total_loss = 0
    for index in range(0, total_datapoint, batch_size):
        if index % 1000 == 0:
            print("1000 Datapoint Completed")
        
        optim.zero_grad()
        
        input_list = df.iloc[index:index+batch_size, 1].tolist()
        tok_input = tokenizer(input_list, return_tensors="pt", padding=True).to(device)
        
        out = model(tok_input)
        
        tar_list = []

        for i in range(index, index + batch_size):
            if df.iloc[i,0] == "spam":
                tar_list.append([1,0])
            else:
                tar_list.append([0,1])

        target = torch.tensor(tar_list, dtype = torch.float).to(device)
        
        loss = criterian(out, target)
        avg_loss = loss.sum()/batch_size
        
        total_loss = max(total_loss, avg_loss)   ## recording
        
        avg_loss.backward()
        optim.step()

### Epoch 0 Completed. ###
Avg Loss : 0.0
1000 Datapoint Completed
1000 Datapoint Completed
1000 Datapoint Completed
1000 Datapoint Completed
1000 Datapoint Completed
1000 Datapoint Completed
1000 Datapoint Completed
1000 Datapoint Completed
1000 Datapoint Completed
1000 Datapoint Completed
### Epoch 1 Completed. ###
Avg Loss : tensor(0.0033, device='cuda:0', grad_fn=<DivBackward0>)
1000 Datapoint Completed
1000 Datapoint Completed
1000 Datapoint Completed
1000 Datapoint Completed
1000 Datapoint Completed
1000 Datapoint Completed
1000 Datapoint Completed
1000 Datapoint Completed
1000 Datapoint Completed
1000 Datapoint Completed


In [21]:
df

Unnamed: 0,text_type,text
0,ham,harvard business school pub order confirmation...
1,spam,hello you can make upto $11000 worth of btc di...
2,spam,100 free hardcore megasite 100 free porn what ...
3,spam,𝑰 𝒔𝒂𝒘 𝒕𝒆𝒔𝒕𝒊𝒎𝒐𝒏𝒊𝒆𝒔 𝒐𝒇 𝒉𝒐𝒘 mrs heatherjgilbert 𝒉...
4,spam,the most comprehensive adult match making serv...
...,...,...
9995,ham,new resume dear vince i am so grateful for you...
9996,ham,use perl daily headline mailer announcing url ...
9997,ham,tony hamilton chris e hired tony to support gl...
9998,ham,fw fea announces the release of energy 2 1 chr...


### Testing

In [33]:
df_test = pd.read_csv("dataset/spam_test.csv")

In [57]:
## Actual Value (SPAM, HAM)  ## Y - Predicted Value (SPAM, HAM)
conf_matrix = np.zeros((2,2))

In [58]:
for i in range(len(df_test)):
    input_list = df_test.iloc[i,1]
    tok_input = tokenizer(input_list, return_tensors="pt", padding=True).to(device)
    out = model(tok_input)
    
    index_1 = torch.argmax(out[0]).tolist()
    
    if df_test.iloc[i,0] == "spam":
        index_2 = 0
    else:
        index_2 = 1
        
    conf_matrix[index_1][index_2] += 1

In [59]:
conf_matrix

array([[446.,   8.],
       [ 54., 492.]])

In [60]:
torch.save(model.state_dict(), "weight_trained_1.pth")