In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import re
import pandas as pd
import numpy as np
import string
import time
import math
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader, random_split, Dataset
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
import gc

In [None]:
## Initialize needed variables and set up the tokenization pipeline

# Fixed seed so the train/test split is the same after every kernel restart.
SPLIT_SEED = 42

ori = pd.read_csv('datasolve-us/train.csv')
# NOTE(review): unpickling runs code stored in the file; only load pickles you created yourself.
train = pd.read_pickle('Preprocessing_Train.pkl')

# Stratified split of the raw (id, cat_name) rows; t1 is the 25% side and
# decides which documents are used for training.
t1, t2, _, _ = train_test_split(ori.id, ori.cat_name, test_size=0.75,
                                stratify=ori.cat_name, random_state=SPLIT_SEED)

# A document belongs to the training set when its id occurs in t1; all other
# documents are held out. The vectorized mask replaces a per-row scan of
# t1.unique() and a list-membership test, both of which were quadratic.
in_train = train['id'].isin(t1).to_numpy()
train_idx = np.flatnonzero(in_train).tolist()
test_idx = np.flatnonzero(~in_train).tolist()

# The indices are row positions, so select positionally.
X_train = train['document_text'].iloc[train_idx]
y_train = train['label'].iloc[train_idx]
X_test = train['document_text'].iloc[test_idx]
y_test = train['label'].iloc[test_idx]

stop_words = stopwords.words('english')
# Stop list without negation words, so that negations survive tokenization.
# NOTE(review): `"no" not in i` is a substring test, so every stop word that
# merely contains "no" is dropped from this list as well - confirm intended.
non_neg_stop = [i for i in stop_words if "n't" not in i and "no" not in i]
lemma = WordNetLemmatizer()


def pos(x):
    """Translate a Penn-Treebank-style POS tag into a WordNet POS letter.

    Tags starting with 'J', 'V' or 'R' map to 'a', 'v' and 'r' respectively;
    every other tag falls back to 'n'.
    """
    for prefix, wordnet_tag in (('J', 'a'), ('V', 'v'), ('R', 'r')):
        if x.startswith(prefix):
            return wordnet_tag
    return 'n'
    
    
def tok(doc, non_neg=False, lemmatized=False):
    """Split a document into lower-cased, cleaned tokens.

    Args:
        doc: the raw document text.
        non_neg: if True, filter with ``non_neg_stop`` instead of ``stop_words``.
        lemmatized: if True, POS-tag the tokens and lemmatize each one with
            the WordNet POS derived from its tag before filtering.

    Returns:
        A list of token strings. A token is kept when it is not in the chosen
        stop list, contains at least two word characters, and yields a match
        for ``\\w[a-zA-Z0-9.-]*\\w``; the first such match is what is returned.

    NOTE(review): the stop-word test runs before surrounding punctuation is
    stripped, so a token such as "the," is not recognised as a stop word.
    """
    # A set makes the per-token stop-word test O(1) instead of a list scan.
    stop = set(non_neg_stop if non_neg else stop_words)

    # Split on runs of whitespace and drop empty strings so that consecutive,
    # leading or trailing whitespace never reaches the POS tagger as a token.
    tokens = [piece for piece in re.split(r'\s+', doc.lower()) if piece]

    if lemmatized:
        tokens = [lemma.lemmatize(word, pos(tag)) for word, tag in nltk.pos_tag(tokens)]

    core = re.compile(r"\w[a-zA-Z0-9.-]*\w")
    cleaned = []
    for token in tokens:
        if token in stop:
            continue
        if len(re.findall(r'\w', token)) < 2:
            continue
        # Evaluate the pattern once per token and keep only the first match.
        match = core.search(token)
        if match:
            cleaned.append(match.group(0))

    return cleaned


def get_voc_new(X, non_neg=True, lemmatized=True):
    """Tokenize every document in ``X`` and collect the vocabulary.

    Args:
        X: iterable of document strings.
        non_neg: forwarded to ``tok``.
        lemmatized: forwarded to ``tok``.

    Returns:
        ``(voc, token_list)`` where ``voc`` is the list of distinct tokens in
        sorted order and ``token_list`` holds one token list per document, in
        the order of ``X``.
    """
    vocabulary = set()
    token_list = []
    for doc in X:
        tokens = tok(doc, non_neg, lemmatized)
        token_list.append(tokens)
        # Growing the set directly avoids building a list of every token first.
        vocabulary.update(tokens)

    # Sorting fixes the order, so indices derived from this list are the same
    # on every run (plain set iteration order is not stable for strings).
    voc = sorted(vocabulary)

    return voc, token_list

In [None]:
## Tokenize the train and test sets; the vocabulary-index dictionary is built from the train set only

# Alternative kept for reference: build the vocabulary from the full data set.
# voc, tk_list_train = get_voc_new(train.document_text, False, True)
voc, tk_list_train = get_voc_new(X_train, non_neg=False, lemmatized=True)
voc_test, tk_list_test = get_voc_new(X_test, non_neg=False, lemmatized=True)

# Indices start at 1 so that 0 stays free for padding and unknown tokens.
voc_dic = dict(zip(voc, range(1, len(voc) + 1)))

In [None]:
## Convert tokens to indices using the dictionary created in the previous step.
## Tokens that are not found in the dictionary are mapped to index 0.

def word2idx(token_list, vocab=None):
    """Map tokenized documents to lists of vocabulary indices.

    Args:
        token_list: iterable of documents, each an iterable of token strings.
        vocab: optional token -> index mapping. When omitted, the module-level
            ``voc_dic`` is used, so existing one-argument calls are unchanged.

    Returns:
        A list with one list of indices per document, in input order. Tokens
        that are missing from the mapping get index 0.
    """
    if vocab is None:
        # Looked up at call time so the most recently built dictionary is used.
        vocab = voc_dic

    return [[vocab.get(token, 0) for token in sent] for sent in token_list]


# Encode both splits with the train-set vocabulary.
tk_idx_train, tk_idx_test = map(word2idx, (tk_list_train, tk_list_test))

In [None]:
## Create a PyTorch Dataset class that converts the data to tensors.
## Every token-index list is brought to the same length (1024 by default):
## shorter lists are padded with 0 and longer lists are truncated.

class dataset_(Dataset):
    """Dataset of fixed-length token-index sequences and their labels.

    Args:
        X: iterable of lists of token indices.
        y: iterable of labels, one entry per sequence.
        max_len: length every sequence is cut or zero-padded to.
    """

    def __init__(self, X, y, max_len=1024):
        super().__init__()

        rows = []
        for seq in X:
            clipped = seq[:max_len]
            filler = [0] * (max_len - len(clipped))
            rows.append(clipped + filler)

        self.X = torch.tensor(rows, dtype=torch.int32)
        self.y = torch.Tensor(np.array(list(y)))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        target = self.y[idx]
        return features, target
    
    
# Wrap both splits as tensor datasets (default max_len).
# Alternative kept for reference: use the labels of the full data set.
# train_ = dataset_(tk_idx_train, train.label)
train_ = dataset_(X=tk_idx_train, y=y_train)
test_ = dataset_(X=tk_idx_test, y=y_test)

In [None]:
class Emb(nn.Module):
    """Token embedding followed by a linear projection and ReLU.

    Args:
        voc_size: number of rows in the embedding table.
        dim_emb: size of each embedding vector.
        dim_out: width of the projection applied to each embedding
            (defaults to 256, the value that was previously hard-coded).

    ``forward`` swaps the last two axes of the projected output, so the
    projection width comes before the sequence axis.
    """

    def __init__(self, voc_size, dim_emb, dim_out=256):
        super(Emb, self).__init__()
        self.emb = nn.Sequential(
            # Index 0 is the padding index.
            nn.Embedding(voc_size, dim_emb, padding_idx=0),
            nn.Linear(dim_emb, dim_out),
            nn.ReLU())

    def forward(self, x):
        return self.emb(x).transpose(-2, -1)

    
class Conv(nn.Module):
    """1-D convolution, batch norm, ReLU and max pooling over a sequence.

    Args:
        max_len: accepted but not used by this module.
        dim_emb: number of input channels.
        dim_out: number of output channels.

    ``forward`` swaps the last two axes of the pooled output.
    """

    def __init__(self, max_len, dim_emb, dim_out):
        super().__init__()
        layers = [
            nn.Conv1d(in_channels=dim_emb, out_channels=dim_out, kernel_size=3),
            nn.BatchNorm1d(num_features=dim_out),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=3, stride=2),
        ]
        self.cv = nn.Sequential(*layers)

    def forward(self, x):
        pooled = self.cv(x)
        return pooled.transpose(-2, -1)

            
class Att(nn.Module):
    """Attention pooling that reduces a sequence to ``dim_enc`` vectors.

    Args:
        dim_emb: feature size of the input and of the keys/values.
        dim_enc: number of output vectors per input sequence.
    """

    def __init__(self, dim_emb=256, dim_enc=128):
        super().__init__()
        # All three projections are bias-free.
        self.Wq = nn.Linear(dim_emb, dim_enc, bias=False)
        self.Wk = nn.Linear(dim_emb, dim_emb, bias=False)
        self.Wv = nn.Linear(dim_emb, dim_emb, bias=False)

    def forward(self, x):
        keys = self.Wk(x)
        values = self.Wv(x)
        # Scale by the square root of the key feature size.
        scale = math.sqrt(keys.size(-1))
        # Wq is applied to the keys; after the transpose the last axis is the
        # sequence axis, which is the one the softmax normalises over.
        logits = self.Wq(keys).transpose(-2, -1) / scale
        weights = torch.softmax(logits, dim=-1)
        return weights @ values
    
    
class LN(nn.Module):
    """Residual layer normalisation: returns the input plus its LayerNorm.

    Args:
        dim_emb: size of the last axis, which is the axis that is normalised.
    """

    def __init__(self, dim_emb=256):
        super().__init__()
        self.feed = nn.LayerNorm(dim_emb)

    def forward(self, x):
        normalized = self.feed(x)
        return x + normalized


class final_model(nn.Module):
    """Embedding -> convolution -> attention pooling -> MLP with sigmoid output.

    Args:
        voc_size: number of rows in the embedding table. When omitted it is
            ``len(voc) + 1``, read from the module-level ``voc`` at
            construction time (the extra row is index 0).
        dim_emb: size of the token embeddings.
        dropout: dropout probability applied after the convolution block.
        max_len: forwarded to ``Conv``.

    For a batch of index sequences the network returns a tensor of shape
    ``(batch, 50, 1)`` with values in [0, 1]: one value per attention output.
    """

    def __init__(self, voc_size=None, dim_emb=256, dropout=0.5, max_len=1024):
        super(final_model, self).__init__()

        if voc_size is None:
            # Resolved per instance rather than once at class-definition time,
            # so rebuilding ``voc`` is picked up without re-running this cell.
            voc_size = len(voc) + 1

        # Emb always projects to 256 channels, so the convolution must take
        # 256 input channels even when dim_emb is set to something else.
        hidden = 256

        self.net = nn.Sequential(
            Emb(voc_size, dim_emb),       ## -> (batch, 256, seq_len)
            Conv(max_len, hidden, 256),   ## -> (batch, reduced_len, 256)
            nn.Dropout(dropout),
            Att(256, 50),                 ## -> (batch, 50, 256)
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),             ## -> (batch, 50, 1)
            nn.Sigmoid())

    def forward(self, x):
        return self.net(x)

In [None]:
def train_model(model, train_dataset, eval_dataset, device, norm=0.5,
                lr=0.0005, epochs=50, batch_size=256):
    """Train ``model`` with binary cross-entropy and record per-epoch losses.

    Args:
        model: module whose output, after ``squeeze(-1)``, has the same shape
            as the label batch and holds values in [0, 1].
        train_dataset: dataset yielding ``(inputs, labels)`` pairs for training.
        eval_dataset: dataset yielding ``(inputs, labels)`` pairs for evaluation.
        device: torch device the model and the batches are moved to.
        norm: if truthy, each label's loss is weighted by
            ``positives ** -norm``, where ``positives`` is that label's count
            in ``train_dataset``. If falsy, the loss is unweighted.
        lr: learning rate for Adam.
        epochs: number of passes over ``train_dataset``.
        batch_size: batch size for both data loaders.

    Returns:
        A dict with one entry per epoch under each key: ``train_loss`` and
        ``eval_loss`` are floats (mean batch loss), ``detail_train`` and
        ``detail_eval`` are CPU tensors holding the unweighted loss summed
        over samples, one value per label.
    """
    history = {'train_loss': [], 'eval_loss': [], 'detail_train': [], 'detail_eval': []}

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    eval_loader = DataLoader(eval_dataset, batch_size=batch_size)

    model = model.to(device)
    # Unreduced loss, used only for the per-label bookkeeping.
    save_loss = nn.BCELoss(reduction='none').to(device)

    if norm:
        # Take the label counts from the dataset that was passed in, not from
        # a global. Datasets without a ``y`` attribute are stacked by hand.
        labels = getattr(train_dataset, 'y', None)
        if labels is None:
            labels = torch.stack([train_dataset[i][1] for i in range(len(train_dataset))])
        positives = torch.as_tensor(labels, dtype=torch.float32).sum(dim=0)
        # A label with no positive example would otherwise get an infinite weight.
        positives = positives.clamp(min=1)
        Loss = nn.BCELoss(weight=positives ** -norm).to(device)
    else:
        Loss = nn.BCELoss().to(device)

    op = torch.optim.Adam(model.parameters(), lr=lr)

    print('Training start!')
    start = time.time()

    for epoch in range(epochs):

        # Predictions are only needed on the epochs that print the full report.
        report_now = (epoch + 1) % 50 == 0

        model.train()
        train_loss = 0
        eval_loss = 0
        # Allocated from the first batch so the label count is not hard-coded.
        detail_train = None
        detail_eval = None
        pred = []
        real = []

        for X, y in train_loader:
            X = X.to(device)
            y = y.to(device)
            out = model(X).squeeze(-1)
            loss = Loss(out, y)

            op.zero_grad()
            loss.backward()
            op.step()

            # Detach before accumulating so no autograd history is kept alive
            # across the epoch.
            train_loss += loss.detach()
            save_train = save_loss(out.detach(), y).sum(0)
            detail_train = save_train if detail_train is None else detail_train + save_train

        model.eval()
        with torch.no_grad():
            for X, y in eval_loader:
                X = X.to(device)
                y = y.to(device)
                out = model(X).squeeze(-1)
                eval_loss += Loss(out, y)
                save_eval = save_loss(out, y).sum(0)
                detail_eval = save_eval if detail_eval is None else detail_eval + save_eval
                if report_now:
                    pred.append(out.cpu())
                    real.append(y.cpu())

        train_loss = (train_loss/len(train_loader)).item()
        eval_loss = (eval_loss/len(eval_loader)).item()

        history['train_loss'].append(train_loss)
        history['eval_loss'].append(eval_loss)
        history['detail_train'].append(detail_train.cpu())
        history['detail_eval'].append(detail_eval.cpu())

        if not (epoch+1)%10:
            print(f"epoch {epoch+1}\ntrain loss: {train_loss}\t\teval loss: {eval_loss}")

        if report_now:
            # torch.round and np.round both round halves to even, so the
            # thresholding is the same as before, without the numpy detour.
            res = torch.cat(pred).round().numpy()
            tru = torch.cat(real).numpy()
            print(f'\nepoch {epoch+1}:\n')
            print(f"f1_score for 50 classes: {f1_score(tru, res, average='macro')}")
            print(classification_report(tru, res))
            print(f'\nSpent time: {time.time()-start} seconds')

    print('Training complete!')

    return history

In [None]:
## Free cached GPU memory and collect garbage before each run (optional, can be deleted)
torch.cuda.empty_cache()
gc.collect()

## Run
# Fixed seed so weight initialisation and batch shuffling are repeatable.
TRAIN_SEED = 42
torch.manual_seed(TRAIN_SEED)

mod = final_model()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 200
lr = 3e-4
batch_size = 128
norm = None  # None -> unweighted BCE loss

hist = train_model(mod, train_, test_, device, norm,
                   lr=lr, epochs=epochs, batch_size=batch_size)

# One set of axes for both curves; title and labels are set once.
fig, ax = plt.subplots()
ax.set_title('Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.plot(hist['train_loss'], label='Train')
ax.plot(hist['eval_loss'], label='Eval')
ax.legend();