In [1]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import re
import pandas as pd
import numpy as np
import string
import time
import math
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader, random_split, Dataset
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import gc

In [2]:
# Ask PyTorch to release cached GPU memory it is no longer using.
torch.cuda.empty_cache()
# Run a garbage-collection pass; as the last expression of the cell, its
# return value is what the notebook displays as the cell output.
gc.collect()

0

In [3]:
# Input tables. NOTE(review): read_pickle executes pickle code on load, so
# only use it on files from a trusted source.
train = pd.read_pickle('Preprocessing_Train.pkl')
test = pd.read_csv('datasolve-us/test.csv')
new = pd.read_csv('new.csv')

# Shared NLP resources used by the tokenizer below.
lemma = WordNetLemmatizer()
stop_words = stopwords.words('english')
# Stop words with every entry containing "n't" or the substring "no" removed.
non_neg_stop = [w for w in stop_words if not ("n't" in w or "no" in w)]


def pos(x):
    """Map a POS tag string to a one-letter code by its first character.

    Tags starting with 'J' give 'a', 'V' gives 'v', 'R' gives 'r'; every
    other tag (including the empty string) gives 'n'.
    """
    for prefix, code in (('J', 'a'), ('V', 'v'), ('R', 'r')):
        if x.startswith(prefix):
            return code
    return 'n'
    
    
def tok(doc, non_neg=False, lemmatized=False):
    """Turn a document string into a list of cleaned, lower-cased tokens.

    Args:
        doc: the raw document text.
        non_neg: if True, filter with ``non_neg_stop`` instead of the full
            ``stop_words`` list.
        lemmatized: if True, POS-tag the tokens with NLTK and lemmatize each
            one using the tag mapped through ``pos``.

    Returns:
        A list of strings. Each kept token is the first substring that starts
        and ends with a word character and contains only ASCII letters,
        digits, '.' or '-' in between; tokens without such a substring are
        dropped.
    """
    # A set gives constant-time membership tests in the stop-word filter.
    stop = set(non_neg_stop if non_neg else stop_words)

    # Splitting on single whitespace characters yields '' for leading,
    # trailing or consecutive whitespace. Those carry no content and would be
    # discarded later anyway, so drop them before they reach the POS tagger.
    tokens = [piece for piece in re.split(r'\s', doc.lower()) if piece]

    if lemmatized:
        tokens = [lemma.lemmatize(word, pos(tag))
                  for word, tag in nltk.pos_tag(tokens)]

    # A match of this pattern always contains at least two word characters,
    # so it also covers the "at least two word characters" length check.
    core = re.compile(r"\w[a-zA-Z0-9.-]*\w")

    cleaned = []
    for token in tokens:
        if token in stop:
            continue
        match = core.search(token)
        if match:
            cleaned.append(match.group(0))

    return cleaned


def get_voc_new(X, non_neg=True, lemmatized=True):
    """Tokenize every document in ``X`` and collect the vocabulary.

    Args:
        X: iterable of document strings.
        non_neg: forwarded to ``tok``.
        lemmatized: forwarded to ``tok``.

    Returns:
        ``(voc, token_list)`` where ``token_list`` holds the token list of
        each document in input order and ``voc`` is the list of distinct
        tokens in sorted order.
    """
    token_list = [tok(doc, non_neg, lemmatized) for doc in X]

    vocabulary = set()
    for tokens in token_list:
        vocabulary.update(tokens)

    # Plain set iteration order for strings can differ between interpreter
    # sessions; sorting makes the vocabulary (and any index built from it)
    # reproducible.
    return sorted(vocabulary), token_list

In [4]:
# Tokenize both splits with the full stop-word list and lemmatization on.
voc, tk_list_train = get_voc_new(train.document_text, non_neg=False, lemmatized=True)
voc_test, tk_list_test = get_voc_new(test.document_text, non_neg=False, lemmatized=True)
# Token -> integer id built from the training vocabulary only; ids start at 1
# so that 0 stays free for padding / unknown tokens.
voc_dic = {word: position for position, word in enumerate(voc, start=1)}

def word2idx(token_list):
    """Convert tokenized documents into lists of vocabulary ids.

    Each token is looked up in ``voc_dic``; tokens that are not in the
    dictionary are encoded as 0. The nesting of ``token_list`` (one inner
    list per document) is preserved in the result.
    """
    return [[voc_dic.get(token, 0) for token in sent] for sent in token_list]


# Encode the tokenized train and test documents as vocabulary ids.
tk_idx_train, tk_idx_test = (
    word2idx(tokens) for tokens in (tk_list_train, tk_list_test)
)

In [5]:
class dataset_(Dataset):
    """Fixed-length id sequences paired with a selected subset of label columns.

    Args:
        X: list of id lists, one per document.
        y: iterable of per-document label rows.
        target: column indices of ``y`` to keep.
        max_len: every sequence is truncated or right-padded with 0 to this
            length.
    """

    def __init__(self, X, y, target, max_len=1024):
        super().__init__()
        rows = []
        for seq in X:
            clipped = seq[:max_len]
            rows.append(clipped + [0] * (max_len - len(clipped)))
        self.X = torch.tensor(rows, dtype=torch.int32)

        labels = np.array(list(y))
        self.y = torch.Tensor(labels[:, target])

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
# Label columns handled by the second model; every other column in 0..49
# goes to the first model.
hard = [8, 19, 35, 41]
ez = sorted(set(range(50)).difference(hard))
train_ez = dataset_(tk_idx_train, train.label, target=ez)
train_hard = dataset_(tk_idx_train, train.label, target=hard)

In [12]:
class Emb_1(nn.Module):
    """Embedding lookup followed by one ReLU-activated projection to 256 features.

    Args:
        voc_size: number of rows in the embedding table.
        dim_emb: width of each embedding vector.

    Index 0 is the padding index of the embedding table.
    """

    def __init__(self, voc_size, dim_emb):
        super().__init__()
        lookup = nn.Embedding(voc_size, dim_emb, padding_idx=0)
        project = nn.Linear(dim_emb, 256)
        self.emb = nn.Sequential(lookup, project, nn.ReLU())

    def forward(self, x):
        embedded = self.emb(x)
        return embedded
    
class Emb_2(nn.Module):
    
    def __init__(self, voc_size, dim_emb):
        super(Emb_2, self).__init__()
        self.emb = nn.Sequential(
            nn.Embedding(voc_size, dim_emb, padding_idx=0),
            nn.Linear(dim_emb, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU())

    def forward(self, x):
        return self.emb(x)

    
class Conv1(nn.Module):
    """Three parallel Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d branches.

    The branches use convolution kernel sizes 1, 2 and 3, with max-pool
    windows 3, 3 and 2 respectively (stride 2 everywhere). Their outputs are
    concatenated along the feature axis, so the result has ``3 * dim_out``
    features.

    Input is expected with features on the last axis; it is transposed for
    the convolutions and transposed back afterwards. The three branches only
    yield the same sequence length (required for the concatenation) when the
    input length is even, e.g. 16 -> 7 or 1024 -> 511.
    """

    def __init__(self, dim_emb, dim_out):
        super().__init__()

        def make_branch(kernel_size, pool_size):
            return nn.Sequential(
                nn.Conv1d(dim_emb, dim_out, kernel_size),
                nn.BatchNorm1d(dim_out),
                nn.ReLU(),
                nn.MaxPool1d(pool_size, 2))

        self.cv1 = make_branch(1, 3)
        self.cv2 = make_branch(2, 3)
        self.cv3 = make_branch(3, 2)

    def forward(self, x):
        channels_first = x.transpose(-2, -1)
        pooled = [branch(channels_first).transpose(-2, -1)
                  for branch in (self.cv1, self.cv2, self.cv3)]
        return torch.cat(pooled, dim=-1)
    
            
class Att(nn.Module):
    """Attention pooling that reduces a sequence to ``dim_enc`` weighted sums.

    ``Wk`` and ``Wv`` project the input to keys and values of width
    ``dim_emb``. ``Wq`` maps each key to ``dim_enc`` scores; after
    transposing, the scores are scaled by ``sqrt(dim_emb)`` and soft-maxed
    over the sequence axis. The output holds, for each of the ``dim_enc``
    score rows, the corresponding weighted sum of the values, i.e. shape
    ``(..., dim_enc, dim_emb)`` for an input of shape ``(..., seq, dim_emb)``.
    """

    def __init__(self, dim_emb=256, dim_enc=128):
        super().__init__()
        self.Wq = nn.Linear(dim_emb, dim_enc, bias=False)
        self.Wk = nn.Linear(dim_emb, dim_emb, bias=False)
        self.Wv = nn.Linear(dim_emb, dim_emb, bias=False)

    def forward(self, x):
        keys = self.Wk(x)
        values = self.Wv(x)
        scale = math.sqrt(keys.size(-1))
        # (..., dim_enc, seq): one row of sequence-position scores per output slot.
        logits = self.Wq(keys).transpose(-2, -1) / scale
        weights = torch.softmax(logits, dim=-1)
        return torch.matmul(weights, values)


class ez_model(nn.Module):
    """Classifier producing 46 sigmoid outputs per document.

    Pipeline: ``Emb_1`` (256 features per token) -> ``Conv1`` (3 x 256 = 768
    features) -> dropout -> ``Att(768, 46)`` (46 pooled 768-wide vectors) ->
    a 768-512-256-128-1 ReLU MLP applied to each pooled vector -> sigmoid.

    Args:
        voc_size: embedding table size; defaults to ``len(voc) + 1``,
            evaluated once when the class is defined.
        dim_emb: embedding width passed to ``Emb_1``.
        dropout: dropout probability applied after the convolution block.
        max_len: accepted but not used by this class.
    """

    def __init__(self, voc_size=len(voc)+1, dim_emb=256, dropout=0.5, max_len=1024):
        super().__init__()
        stages = [
            Emb_1(voc_size, dim_emb),
            Conv1(256, 256),
            nn.Dropout(dropout),
            Att(768, 46),
            nn.Linear(768, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        scores = self.net(x)
        return scores
    
class comp_model(nn.Module):
    """Classifier producing 4 sigmoid outputs per document.

    Pipeline: ``Emb_2`` (256 features per token) -> ``Conv1`` (3 x 256 = 768
    features) -> dropout -> ``Att(768, 4)`` (4 pooled 768-wide vectors) ->
    a 768-512-256-128-1 ReLU MLP applied to each pooled vector -> sigmoid.

    Args:
        voc_size: embedding table size; defaults to ``len(voc) + 1``,
            evaluated once when the class is defined.
        dim_emb: embedding width passed to ``Emb_2``.
        dropout: dropout probability applied after the convolution block.
        max_len: accepted but not used by this class.
    """

    def __init__(self, voc_size=len(voc)+1, dim_emb=256, dropout=0.5, max_len=1024):
        super().__init__()
        stages = [
            Emb_2(voc_size, dim_emb),
            Conv1(256, 256),
            nn.Dropout(dropout),
            Att(768, 4),
            nn.Linear(768, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        scores = self.net(x)
        return scores

In [13]:
def train_model(model_A, model_B, train_dataset_A, train_dataset_B, device,
                lr=0.0005, epochs_A=50, epochs_B=50, batch_size=256):
    """Train two models one after the other with Adam and binary cross-entropy.

    Args:
        model_A, model_B: modules whose output, after dropping the last axis,
            matches the label tensor of their dataset.
        train_dataset_A, train_dataset_B: datasets yielding ``(X, y)`` pairs.
        device: device both models and every batch are moved to.
        lr: learning rate shared by both optimizers.
        epochs_A, epochs_B: number of passes over each dataset.
        batch_size: mini-batch size for both loaders (shuffled each epoch).

    Returns:
        None. The models are updated in place and progress is printed; both
        elapsed-time messages are measured from the same start, so the second
        one covers the training of both models.
    """
    train_loader_A = DataLoader(train_dataset_A, batch_size=batch_size, shuffle=True)
    train_loader_B = DataLoader(train_dataset_B, batch_size=batch_size, shuffle=True)

    model_A = model_A.to(device)
    model_B = model_B.to(device)
    Loss = nn.BCELoss().to(device)
    op_A = torch.optim.Adam(model_A.parameters(), lr=lr)
    op_B = torch.optim.Adam(model_B.parameters(), lr=lr)

    def run_epochs(model, loader, optimizer, n_epochs):
        # Shared optimisation loop for both models.
        for _ in range(n_epochs):
            model.train()
            for X, y in loader:
                X, y = X.to(device), y.to(device)
                loss = Loss(model(X).squeeze(-1), y)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

    print('Training start!')
    start = time.time()

    run_epochs(model_A, train_loader_A, op_A, epochs_A)
    print('model A training complete!')
    print(f'Spent time: {time.time()-start} seconds')

    run_epochs(model_B, train_loader_B, op_B, epochs_B)
    print('model B training complete!')
    print(f'Spent time: {time.time()-start} seconds')

    return None

In [None]:
# Run configuration.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lr = 3e-4
batch_size = 128
epochs_A = 200
epochs_B = 200

# One model per label group, each built with its default constructor arguments.
model_A = ez_model()
model_B = comp_model()

train_model(model_A, model_B, train_ez, train_hard, device,
            lr=lr, epochs_A=epochs_A, epochs_B=epochs_B, batch_size=batch_size)

Training start!
model A training complete!
Spent time: 2262.119045495987 seconds


In [None]:
# Truncate or right-pad every test sequence with 0 to exactly 1024 ids.
pad_test = [(seq[:1024] + [0] * 1024)[:1024] for seq in tk_idx_test]
test_ = torch.tensor(pad_test, dtype=torch.int32)

In [None]:
# Run both models over the test set on the CPU, in input order.
test_ = test_.cpu()
model_A = model_A.cpu()
model_B = model_B.cpu()
# train_model only ever calls .train(), so the models must be switched to
# evaluation mode here; otherwise Dropout stays active and BatchNorm
# normalizes with per-batch statistics, making predictions noisy and
# dependent on batch composition.
model_A.eval()
model_B.eval()
res_A = []
res_B = []
with torch.no_grad():
    for batch in DataLoader(test_, batch_size=256, shuffle=False):
        res_A.append(model_A(batch).cpu())
        res_B.append(model_B(batch).cpu())

In [None]:
# Stitch the per-batch outputs back together and drop the trailing axis.
fin_A, fin_B = (
    torch.cat(chunks, dim=0).squeeze(-1) for chunks in (res_A, res_B)
)

In [None]:
# Merge both models' outputs into one (documents x labels) matrix. The row
# count is taken from the predictions instead of being hard-coded, and the
# width is the combined number of label columns in `ez` and `hard`.
final = torch.zeros(fin_A.shape[0], len(ez) + len(hard))
final[:, ez] = fin_A
final[:, hard] = fin_B
# Round to 0/1 with torch and hand pandas a plain NumPy array, rather than
# relying on NumPy's implicit handling of a torch tensor in np.round.
fin = torch.round(final).view(-1).numpy()

In [None]:
# Write the flattened predictions as a single-column CSV (index included).
pd.DataFrame(data=fin, columns=['predictions']).to_csv('bailanle.csv')