In [8]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import re
import pandas as pd
import numpy as np
import string
import time
import math
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader, random_split, Dataset
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [9]:
train = pd.read_pickle('Preprocessing_Train.pkl')
test = pd.read_csv('datasolve-us/test.csv')
new = pd.read_csv('new.csv')

stop_words = stopwords.words('english')
non_neg_stop = [i for i in stop_words if "n't" not in i and "no" not in i]
lemma = WordNetLemmatizer()


def pos(x):
    if x.startswith('J'):
        return 'a'
    elif x.startswith('V'):
        return 'v'
    elif x.startswith('R'):
        return 'r'
    else:
        return 'n'
    
    
def tok(doc, non_neg=False, lemmatized=False):
    
    if non_neg:
        stop = non_neg_stop
    else:
        stop = stop_words
       
    tokens = re.split(r'\s', doc.lower())
    
    if lemmatized:
        pos_list = nltk.pos_tag(tokens)
        tokens = list(map(lambda x: lemma.lemmatize(x[0], pos(x[1])), pos_list))    

    tokens = [i for i in tokens if i not in stop]
    tokens = [i for i in tokens if len(re.findall(r'\w', i)) >= 2]
    tokens = [re.findall(r"\w[a-zA-Z0-9.-]*\w", i)[0] for i in tokens if re.findall(r"\w[a-zA-Z0-9.-]*\w", i)]
    
    return tokens


def get_voc_new(X, non_neg=True, lemmatized=True):
    
    voc = []
    token_list = []
    for i in X:
        tokens = tok(i, non_neg, lemmatized)
        token_list.append(tokens)
        voc += tokens
        
    voc = list(set(voc))
    
    return voc, token_list

In [10]:
voc, tk_list_train = get_voc_new(train.document_text, False, True)
voc_test, tk_list_test = get_voc_new(test.document_text, False, True)
voc_dic = {item: idx+1 for idx, item in enumerate(voc)}

def word2idx(token_list):
    
    token_idx = []
    for sent in token_list:
        sent_list = []
        for token in sent:
            if token in voc_dic:
                idx = voc_dic[token]
            else:
                idx = 0
            sent_list.append(idx)
        token_idx.append(sent_list)

    return token_idx


tk_idx_train = word2idx(tk_list_train)
tk_idx_test = word2idx(tk_list_test)

In [36]:
tfidf = TfidfVectorizer(vocabulary=voc)
vec_train = tfidf.fit_transform(train.document_text)

class _dataset(Dataset):
    
    def __init__(self, X, y):
        super(_dataset, self).__init__()
        self.X = torch.Tensor(X)
        self.y = torch.Tensor(y)
        
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
    def __len__(self):
        return len(self.X)

class dataset_(Dataset):
    
    def __init__(self, X, y, max_len=1024):
        super(dataset_, self).__init__()
        pad = [i[:max_len] + [0]*(max_len-len(i[:max_len])) for i in X]
        self.X = torch.tensor(pad, dtype=torch.int32)
        self.y = torch.Tensor(np.array(list(y)))
        
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
    def __len__(self):
        return len(self.X)
    
    
train_ = dataset_(tk_idx_train, train.label)
_train = _dataset(vec_train.toarray(), np.array(list(train.label)))

In [12]:
class MLP_model(nn.Module):
    
    def __init__(self, dim_target=50):

        super(MLP_model, self).__init__()
        
        self.net = nn.Sequential(
            nn.Linear(vec_train.shape[1], 500),
            nn.ReLU(),
            nn.Linear(500, dim_target),
            nn.Sigmoid())

            
    def forward(self, x):
        return self.net(x)

class Att(nn.Module):
    
    def __init__(self, dim_emb=256, dim_target=50):
        super(Att, self).__init__()
        self.Wq = nn.Linear(dim_emb, dim_target, bias=False)
        self.Wk = nn.Linear(dim_emb, dim_emb, bias=False)
        self.Wv = nn.Linear(dim_emb, dim_emb, bias=False)

    def forward(self, x):
        K = self.Wk(x) 
        V = self.Wv(x)
        temp = self.Wq(K) 
        score = temp.transpose(-2, -1) / math.sqrt(K.size(-1)) ## (50, 1200)
        return torch.matmul(nn.Softmax(dim=-1)(score), V)


class final_model(nn.Module):
    
    def __init__(self, voc_size=len(voc)+1, dim_emb=256, dim_target=50):
        super(final_model, self).__init__()
        self.net = nn.Sequential(
            nn.Embedding(voc_size, dim_emb, padding_idx=0),
            nn.Dropout(0.5),
            Att(dim_emb, dim_target),  ## (50, dim_emb)
            nn.Linear(dim_emb, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid())
            
    def forward(self, x):
        return self.net(x)

In [17]:
def train_model(model_A, model_B, train_dataset_A, train_dataset_B, device, A_idx, B_idx,
                lr=0.0005, epochs_A=50, epochs_B=50, batch_size=256):

    
    train_loader_A = DataLoader(train_dataset_A, batch_size=batch_size, shuffle=True)
    train_loader_B = DataLoader(train_dataset_B, batch_size=batch_size, shuffle=True)

    model_A = model_A.to(device)
    model_B = model_B.to(device)
    Loss = nn.BCELoss().to(device)
    op_A = torch.optim.Adam(model_A.parameters(), lr=lr)
    op_B = torch.optim.Adam(model_B.parameters(), lr=lr)
    
    print('Training start!')
    start = time.time()
    
    for epoch in range(epochs_A):
        
        model_A.train()

        for X, y in train_loader_A:
            X = X.to(device)
            y = y[:, A_idx].to(device)
            out = model_A(X)
            loss = Loss(out, y)
            
            op_A.zero_grad()
            loss.backward()
            op_A.step()
      
    for epoch in range(epochs_B):
        
        model_B.train()

        for X, y in train_loader_B:
            X = X.to(device)
            y = y[:, B_idx].to(device)
            out = model_B(X).squeeze(-1)
            loss = Loss(out, y)
            
            op_B.zero_grad()
            loss.backward()
            op_B.step()
            
            
    print('Training complete!')
    print(f'Spent time: {time.time()-start} seconds')
    
    return None

In [32]:
target = new.sort_values('word_emb_loss', ascending=False).reset_index(drop=True).label_idx[:10]
target = torch.tensor(np.array(list(target)), dtype=torch.long)
normal = [i for i in range(50) if i not in target]

In [33]:
model_A = MLP_model(dim_target=40)
model_B = final_model(dim_target=10)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs_A = 100
epochs_B = 100
A_idx = normal
B_idx = target
lr = 3e-4
batch_size = 128

train_model(model_A, model_B, _train, train_, device, A_idx, B_idx,
                lr, epochs_A, epochs_B, batch_size)

Training start!
Training complete!
Spent time: 272.9622218608856 seconds


In [37]:
pad_test = [i[:1024] + [0]*(1024-len(i[:1024])) for i in tk_idx_test]
test_ = torch.tensor(pad_test, dtype=torch.int32)
vec_test = tfidf.transform(test.document_text)

In [38]:
test_ = test_.cpu()
# model_A = model_A.cpu()
model_B = model_B.cpu()
# res_A = []
res_B = []
with torch.no_grad():
    for i in DataLoader(test_, batch_size=256, shuffle=False):
#         res_A.append(model_A(i).cpu())
        res_B.append(model_B(i).cpu())
    
xxx = torch.Tensor(vec_test.toarray()).to(device)
with torch.no_grad():
    res_A = model_A(xxx)

rr = res_A.cpu()

In [39]:
# fin_A = torch.cat(res_A, dim=0).squeeze(-1)
fin_B = torch.cat(res_B, dim=0).squeeze(-1)

In [40]:
final = torch.zeros(4993,50)
final[:, normal] = rr
final[:, target] = fin_B

fin = np.round(final.view(-1))

In [41]:
tfidf = pd.read_csv('tfidf.csv').predictions
att = pd.read_csv('att_3.csv').predictions

In [42]:
(np.array(tfidf) == np.array(fin)).mean()

0.9863328660124174

In [43]:
(np.array(att) == np.array(fin)).mean()

0.9662247146004406

In [44]:
pd.DataFrame(fin, columns=['predictions']).to_csv('class_2_re.csv')