In [1]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import re
import pandas as pd
import numpy as np
import string
import time
import math
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader, random_split, Dataset
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import AutoConfig, AutoModel, AutoTokenizer, AutoModelForSequenceClassification

In [2]:
# Preprocessed training frame.
# NOTE(review): read_pickle executes whatever the pickle contains; only load
# files that come from a trusted source.
train = pd.read_pickle('Preprocessing_Train.pkl')

# NLTK's English stop-word list, consulted by `bert_tokenize`.
stop_words = stopwords.words('english')

# Word-piece tokenizer and encoder weights taken from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModel.from_pretrained(
    "bert-base-uncased",
    output_hidden_states=True,   # return the hidden states of every layer
    output_attentions=False,     # do not return attention weights
)

def bert_tokenize(doc, tokenizer=tokenizer):
    """Turn a raw document into a flat list of word-piece ids.

    The text is lower-cased and split on single whitespace characters. A chunk
    is kept only if it is not a stop word, contains at least two ``\\w``
    characters, and contains a "core" of the form ``\\w[a-zA-Z0-9.-]*\\w``; the
    first such core is what gets tokenized, so leading/trailing punctuation is
    stripped. The ids of all kept chunks are concatenated in document order.

    Args:
        doc: The document text (must support ``.lower()``).
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``;
            defaults to the module-level tokenizer.

    Returns:
        list: Word-piece ids, without any special tokens, padding or truncation.
    """
    # `stop_words` is a list; build a set once per call so the per-token
    # membership test is not a linear scan.
    stop_set = set(stop_words)
    # Compiled once and searched once per chunk (the first match is the same
    # value the first element of findall() would give).
    core_pattern = re.compile(r"\w[a-zA-Z0-9.-]*\w")

    result = []
    for chunk in re.split(r'\s', doc.lower()):
        if chunk in stop_set:
            continue
        if len(re.findall(r'\w', chunk)) < 2:
            continue
        core = core_pattern.search(chunk)
        if core is None:
            continue
        result += tokenizer.convert_tokens_to_ids(tokenizer.tokenize(core.group(0)))

    return result

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# One list of word-piece ids per training document.
X = train['document_text'].apply(bert_tokenize)

In [3]:
class dataset_(Dataset):
    """In-memory dataset of fixed-length id rows paired with their labels.

    Each sequence in ``X`` is cut to at most ``max_len`` entries and then
    right-filled with ``0`` until it is exactly ``max_len`` long; the rows are
    stored together as one ``int32`` tensor. ``y`` is materialised through
    ``np.array(list(y))`` and stored as a ``torch.Tensor``.
    """

    def __init__(self, X, y, max_len=1200):
        super().__init__()
        rows = []
        for seq in X:
            head = seq[:max_len]
            # Fill whatever is missing up to max_len with the id 0.
            rows.append(head + [0] * (max_len - len(head)))
        self.X = torch.tensor(rows, dtype=torch.int32)
        self.y = torch.Tensor(np.array(list(y)))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # (ids, label) pair for a single example (or a slice of examples).
        return self.X[idx], self.y[idx]
    
    
# Pad/truncate the tokenised documents and pair them with the label column.
train_ = dataset_(X, train['label'])

NameError: name 'X' is not defined

In [6]:
# Reuse BERT's input embedding table as the first stage of the classifier.
pretrained_layer = bert_model.get_input_embeddings()

# Freeze it: none of its parameters should receive gradients.
for weight in pretrained_layer.parameters():
    weight.requires_grad = False

# class PE(nn.Module): 

#     def __init__(self, dim_emb=256, dropout=0.5, max_len=1024): 
#         super(PE, self).__init__() 
#         self.dropout = nn.Dropout(dropout) 
#         pe = torch.zeros(max_len, dim_emb) 
#         position = torch.arange(0, max_len).unsqueeze(1) 
#         div_term = torch.exp(-math.log(10000) * torch.arange(0, dim_emb, 2)/dim_emb)
#         pe[:, 0::2] = torch.sin(position * div_term)
#         pe[:, 1::2] = torch.cos(position * div_term) 
#         pe = pe.unsqueeze(0) 
#         self.register_buffer('pe', pe) 
        
#     def forward(self, x): 
#         x = x + Variable(self.pe[:, :], requires_grad=False) 
#         return self.dropout(x) 

    
class Att(nn.Module):
    """Attention pooling that maps a sequence to a fixed number of rows.

    The input is projected to keys ``K`` and values ``V`` of width ``dim_emb``.
    ``Wq`` then scores every position against ``n_queries`` learned query
    vectors; the scores are scaled by ``sqrt(dim_emb)``, soft-maxed over the
    sequence axis and used to average ``V``.

    Shapes: ``(..., seq_len, dim_in) -> (..., n_queries, dim_emb)``.

    Note: no mask is applied, so every position of the input (padding
    included) takes part in the softmax.

    Args:
        dim_emb: Width of the key/value projections and of the output rows.
        dim_in: Width of the incoming features (768 by default).
        n_queries: Number of output rows (50 by default).
    """

    def __init__(self, dim_emb=512, dim_in=768, n_queries=50):
        super(Att, self).__init__()
        # Attribute names and registration order are kept so existing
        # state dicts / pickled models remain loadable.
        self.Wq = nn.Linear(dim_emb, n_queries, bias=False)
        self.Wk = nn.Linear(dim_in, dim_emb, bias=False)
        self.Wv = nn.Linear(dim_in, dim_emb, bias=False)

    def forward(self, x):
        K = self.Wk(x)  # (..., seq_len, dim_emb)
        V = self.Wv(x)  # (..., seq_len, dim_emb)
        # (..., seq_len, n_queries) -> (..., n_queries, seq_len), scaled.
        score = self.Wq(K).transpose(-2, -1) / math.sqrt(K.size(-1))
        # Functional softmax: no module is instantiated on every call.
        return torch.matmul(torch.softmax(score, dim=-1), V)
    
    
class LN(nn.Module):
    """Return the input plus its layer-normalised version.

    The normalisation runs over a trailing dimension of size ``dim_emb``.
    """

    def __init__(self, dim_emb=512):
        super().__init__()
        self.feed = nn.LayerNorm(dim_emb)

    def forward(self, x):
        normalised = self.feed(x)
        return x + normalised


class final_model(nn.Module):
    """Classifier stacked on top of the module-level ``pretrained_layer``.

    Pipeline: ``pretrained_layer`` -> ``Att`` -> ``LN`` -> Linear(dim_emb, 256)
    -> ReLU -> Linear(256, 128) -> ReLU -> Linear(128, 1) -> Sigmoid, i.e. one
    sigmoid score per row produced by ``Att``.

    Args:
        dim_emb: Width handed to ``Att``/``LN`` and to the first linear layer.
        dropout: Accepted but not used by any active layer (the positional
            encoding and dropout stages are disabled).
    """

    def __init__(self, dim_emb=512, dropout=0.3):
        super().__init__()
        # The order of the stages fixes the `net.<index>` parameter names.
        stages = [
            pretrained_layer,
            Att(dim_emb),
            LN(dim_emb),
            nn.Linear(dim_emb, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        scores = self.net(x)
        return scores

In [9]:
def train_model(model, train_dataset, device, norm=0.5,
                lr=0.0005, epochs=50, batch_size=256):
    """Fit ``model`` on ``train_dataset`` with binary cross-entropy and Adam.

    Args:
        model: Module whose output, after ``squeeze(-1)``, has the same shape
            as a batch of labels and holds probabilities in ``[0, 1]``.
        train_dataset: Dataset yielding ``(inputs, labels)`` pairs. When it
            exposes a ``y`` tensor of all labels, that tensor is used to derive
            the class weights; otherwise the labels are collected by iterating
            over the dataset once.
        device: Device the model, loss and batches are moved to.
        norm: Exponent of the label re-weighting. When truthy, each label gets
            the weight ``positives ** -norm`` where ``positives`` is the
            column-wise label sum; when falsy the loss is unweighted.
        lr: Adam learning rate.
        epochs: Number of passes over the data.
        batch_size: Mini-batch size (batches are shuffled).

    Returns:
        dict: ``{'train_loss': [...]}`` with the mean batch loss of every epoch
        as Python floats.
    """
    history = {'train_loss': []}

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    model = model.to(device)
    if norm:
        # Derive the weights from the dataset that was passed in, not from a
        # notebook-level global.
        labels = getattr(train_dataset, 'y', None)
        if labels is None:
            labels = torch.stack([torch.as_tensor(y) for _, y in train_dataset])
        positives = labels.sum(dim=0)
        # A label that never occurs would get an infinite weight; treat its
        # count as 1 so the loss stays finite.
        positives = torch.where(positives > 0, positives, torch.ones_like(positives))
        Loss = nn.BCELoss(weight=positives ** -norm).to(device)
    else:
        Loss = nn.BCELoss().to(device)
    op = torch.optim.Adam(model.parameters(), lr=lr)

    print('Training start!')
    start = time.time()

    # Guard against a loader without batches when averaging.
    n_batches = max(len(train_loader), 1)

    for epoch in range(epochs):

        model.train()
        train_loss = 0.0

        for X, y in train_loader:
            X = X.to(device)
            y = y.to(device)
            out = model(X).squeeze(-1)
            loss = Loss(out, y)

            op.zero_grad()
            loss.backward()
            op.step()

            # .item() detaches the value so no autograd history is kept
            # alive across the epoch.
            train_loss += loss.item()

        history['train_loss'].append(train_loss / n_batches)

    print('Training complete!')
    print(f'\nSpent time: {time.time()-start} seconds')

    return history

In [7]:
# Hyper-parameters of this run.
epochs = 150
batch_size = 128
lr = 0.00027
norm = None  # falsy, so train_model builds an unweighted BCELoss
# weight_decay = 0.0003

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
mod = final_model(dim_emb=384)

hist = train_model(
    mod, train_, device, norm,
    lr=lr, epochs=epochs, batch_size=batch_size,
)

Training start!
Training complete!


In [4]:
# Tokenise the test documents and bring every id list to exactly `max_len`
# entries (cut long ones, right-fill short ones with 0), mirroring dataset_.
tst = pd.read_csv('datasolve-us/test.csv')
X_test = tst['document_text'].apply(bert_tokenize)
max_len = 1200
pad = [
    head + [0] * (max_len - len(head))
    for head in (ids[:max_len] for ids in X_test)
]
X_final = torch.tensor(pad, dtype=torch.int32)

In [7]:
# Restore the fully pickled model. map_location='cpu' lets a checkpoint that
# was saved from a GPU load on a machine without CUDA.
# NOTE(review): torch.load unpickles arbitrary objects; only load checkpoints
# you trust. Recent PyTorch releases default to weights_only=True, which
# rejects whole-module pickles like this one; pass weights_only=False there.
mod = torch.load('bert.pt', map_location='cpu')

In [None]:
# Prefer the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
mod = mod.to(device)
# NOTE(review): this rebinds `X`, the name that earlier holds the tokenised
# training documents; the prediction loop iterates over X_final, not X.
X = X_final.to(device)

In [14]:
# Score the test set on the CPU in batches of 100.
mod.cpu()   # Module.cpu() moves the parameters in place
mod.eval()  # inference mode for any train/eval-dependent layers
# (Tensor.cpu() returns a new tensor instead of moving in place, so the former
# bare `X_final.cpu()` statement had no effect and is dropped.)
res = []
with torch.no_grad():
    for batch in DataLoader(X_final, batch_size=100, shuffle=False):
        # Flatten each batch of scores into one 1-D vector.
        res.append(mod(batch).squeeze(-1).view(-1))

In [19]:
# Join the per-batch scores and round them to 0/1 with torch itself instead of
# routing a tensor through NumPy. The guard keeps the cell re-runnable: once
# `res` is already the rounded tensor it is left untouched.
if isinstance(res, list):
    res = torch.round(torch.cat(res))

In [30]:
# torch.save(mod, 'bert.pt')

In [21]:
# Earlier submissions, loaded for comparison with the current predictions.
t1, t2 = (pd.read_csv(path) for path in ('try_1.csv', 'try_2.csv'))

In [25]:
# `predictions` columns of the two earlier submissions as float tensors.
yx, lj = (torch.Tensor(np.array(frame.predictions)) for frame in (t1, t2))

In [27]:
# Fraction of entries on which try_1 and the current predictions agree.
torch.eq(yx, res).float().mean()

tensor(0.9681)

In [28]:
# Fraction of entries on which try_2 and the current predictions agree.
torch.eq(lj, res).float().mean()

tensor(0.9549)

In [29]:
# Fraction of entries on which the two earlier submissions agree.
torch.eq(lj, yx).float().mean()

tensor(0.9474)

In [32]:
# Write the rounded predictions as a one-column CSV (row index included, as in
# the earlier try_*.csv files). The tensor is converted to NumPy explicitly so
# the column holds plain numbers regardless of how pandas treats torch tensors.
pd.DataFrame(res.detach().cpu().numpy(), columns=['predictions']).to_csv('try_3.csv')