In [1]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import re
import pandas as pd
import numpy as np
import string
import time
import math
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader, random_split, Dataset
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
## Initialize needed variables and set up pipelines for tokenizations

# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it comes from a trusted source.
train = pd.read_pickle('Preprocessing_Train.pkl')
tst = pd.read_csv('datasolve-us/test.csv')
# X_train, X_test, y_train, y_test = train_test_split(train.document_text, train.label, test_size=0.25)

stop_words = stopwords.words('english')
# Stop-word list without any entry containing "n't" or "no", so that those
# words are NOT removed when this list is used for filtering.
non_neg_stop = [word for word in stop_words
                if not any(marker in word for marker in ("n't", "no"))]
lemma = WordNetLemmatizer()


def pos(x):
    """Translate a POS tag string into the POS letter passed to the lemmatizer.

    Tags starting with 'J' map to 'a', 'V' to 'v', 'R' to 'r'; any other tag
    (including the empty string) maps to 'n'.
    """
    for prefix, letter in (('J', 'a'), ('V', 'v'), ('R', 'r')):
        if x.startswith(prefix):
            return letter
    return 'n'
    
    
def tok(doc, non_neg=False, lemmatized=False):
    """Tokenize a document into cleaned, lower-cased tokens.

    Steps, in order: lower-case and split on whitespace; optionally POS-tag
    and lemmatize; drop stop words; drop tokens with fewer than two word
    characters; trim each remaining token to its first match of
    ``\\w[a-zA-Z0-9.-]*\\w`` (tokens without a match are dropped).

    Parameters
    ----------
    doc : str
        Raw document text.
    non_neg : bool
        If true, filter with ``non_neg_stop`` instead of ``stop_words``.
    lemmatized : bool
        If true, lemmatize each token using the POS tag from ``nltk.pos_tag``.

    Returns
    -------
    list of str
        The cleaned tokens in document order.
    """
    # Build a set once per call so the membership test below is O(1) per token
    # instead of a linear scan of the stop-word list.
    stop = set(non_neg_stop if non_neg else stop_words)

    # Consecutive, leading or trailing whitespace makes re.split emit empty
    # strings; drop them so they are not handed to the POS tagger as words.
    tokens = [piece for piece in re.split(r'\s+', doc.lower()) if piece]

    if lemmatized:
        tokens = [lemma.lemmatize(word, pos(tag))
                  for word, tag in nltk.pos_tag(tokens)]

    cleaned = []
    for token in tokens:
        if token in stop:
            continue
        if len(re.findall(r'\w', token)) < 2:
            continue
        # First match only (same as findall(...)[0]), evaluated once per token.
        core = re.search(r"\w[a-zA-Z0-9.-]*\w", token)
        if core:
            cleaned.append(core.group(0))

    return cleaned


def get_voc_new(X, non_neg=True, lemmatized=True):
    """Tokenize every document in ``X`` and collect the vocabulary.

    Parameters
    ----------
    X : iterable of str
        Documents to tokenize with ``tok``.
    non_neg, lemmatized : bool
        Forwarded unchanged to ``tok``.

    Returns
    -------
    voc : list of str
        The distinct tokens, sorted. Sorting makes the order (and therefore
        any feature-column order derived from it) identical across
        interpreter sessions; plain set iteration order is not.
    token_list : list of list of str
        The tokens of each document, in input order.
    """
    token_list = [tok(doc, non_neg, lemmatized) for doc in X]

    # Deduplicate with a set directly rather than concatenating every token
    # into one large list first.
    voc = sorted({token for tokens in token_list for token in tokens})

    return voc, token_list

In [3]:
# Tokenize both corpora once with the custom tokenizer and keep the per-document
# token lists so the vectorizer can count exactly those tokens.
voc_final, train_tokens = get_voc_new(train.document_text, False, True)
voc_tst, test_tokens = get_voc_new(tst.document_text, False, True)

# The documents are already tokenized, so the analyzer just passes the token
# list through. Previously the raw text was re-tokenized with the vectorizer's
# default tokenizer, which does not lemmatize and does not produce the dotted or
# hyphenated tokens present in voc_final, so many vocabulary entries could
# never be matched.
# min_df is no longer passed: scikit-learn ignores it when a fixed vocabulary
# is supplied, so it never had an effect here.
tfidf = TfidfVectorizer(vocabulary=voc_final, analyzer=lambda tokens: tokens)
vec_train = tfidf.fit_transform(train_tokens)
vec_test = tfidf.transform(test_tokens)

In [43]:
# Keep only the vocabulary columns whose largest TF-IDF weight over all training
# documents exceeds `threshold`.
threshold = 0.3
# Take the column-wise maximum on the sparse matrix and densify only that 1-D
# result, instead of materialising the whole documents x vocabulary matrix.
# The cast to float32 keeps the comparison at the precision torch.Tensor(...)
# used before, so the mask is unchanged.
ok_idx = torch.from_numpy(vec_train.max(axis=0).toarray().ravel()).float() > threshold
# Number of selected columns; used as the input width of the model.
advoc_size = ok_idx.int().sum().item()

In [44]:
class _dataset(Dataset):
    """Torch dataset over the selected feature columns and their targets.

    Parameters
    ----------
    X : sparse matrix
        Feature matrix; only the columns flagged in the module-level boolean
        mask ``ok_idx`` are kept.
    y : array-like
        Targets, converted with ``torch.Tensor``.
    """

    def __init__(self, X, y):
        super().__init__()
        # Select the columns while the matrix is still sparse and densify only
        # the kept ones; densifying everything first needs far more memory.
        keep = ok_idx.cpu().numpy()
        self.X = torch.from_numpy(X.tocsr()[:, keep].toarray()).float()
        self.y = torch.Tensor(y)

    def __getitem__(self, idx):
        """Return the (features, targets) pair at position ``idx``."""
        return self.X[idx], self.y[idx]

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)
    
# Training set: TF-IDF features of the training documents plus their labels.
_train = _dataset(X=vec_train, y=np.array(train.label.tolist()))

In [45]:
class MLP_model(nn.Module):
    """Two-layer perceptron with a sigmoid output per label.

    Parameters
    ----------
    in_features : int, optional
        Width of the input vectors. Defaults to the module-level
        ``advoc_size`` so that ``MLP_model()`` behaves as before.
    hidden_size : int
        Width of the hidden layer (default 500).
    n_labels : int
        Number of output units (default 50).
    """

    def __init__(self, in_features=None, hidden_size=500, n_labels=50):

        super().__init__()

        if in_features is None:
            # Looked up at construction time, not at class-definition time.
            in_features = advoc_size

        # The Sequential container stays named `net` with the same layer
        # positions, so parameter names are net.0.* and net.2.*.
        self.net = nn.Sequential(
            nn.Linear(in_features, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_labels),
            nn.Sigmoid())

    def forward(self, x):
        """Return the sigmoid outputs of the network for input ``x``."""
        return self.net(x)

In [46]:
def gg(model, train_dataset, device, norm=0.5,
                lr=0.0005, epochs=50, batch_size=256):
    """Train ``model`` in place with Adam and binary cross-entropy.

    Parameters
    ----------
    model : nn.Module
        Network whose outputs are probabilities (``nn.BCELoss`` is applied
        directly to them). It is moved to ``device`` and updated in place.
    train_dataset : Dataset
        Yields ``(features, targets)`` pairs. When ``norm`` is truthy it must
        also expose the full target tensor as ``train_dataset.y``.
    device : torch.device
        Device used for the model, the loss and every batch.
    norm : float or None
        If truthy, each target column is weighted by
        ``(number of positives in that column) ** -norm``. If falsy, the loss
        is unweighted.
    lr : float
        Adam learning rate.
    epochs : int
        Number of passes over ``train_dataset``.
    batch_size : int
        Mini-batch size; batches are reshuffled every epoch.

    Returns
    -------
    None
    """
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    model = model.to(device)

    if norm:
        # Use the dataset that was passed in (the original referenced an
        # undefined name here). Clamp the counts to at least 1 so a column
        # with no positives gets weight 1 instead of an infinite weight.
        label_counts = train_dataset.y.float().sum(dim=0).clamp(min=1)
        Loss = nn.BCELoss(weight=label_counts.pow(-norm)).to(device)
    else:
        Loss = nn.BCELoss().to(device)

    op = torch.optim.Adam(model.parameters(), lr=lr)

    print('Training start!')

    for epoch in range(epochs):

        model.train()

        for X, y in train_loader:
            X = X.to(device)
            y = y.to(device)
            out = model(X)
            loss = Loss(out, y)

            op.zero_grad()
            loss.backward()
            op.step()

    print('Training complete!')

    return None

In [47]:
# Seed torch's RNG before the model is built so that weight initialisation and
# the DataLoader's shuffling start from the same state on every run.
SEED = 42
torch.manual_seed(SEED)

model_A = MLP_model()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 100
lr = 3e-4
batch_size = 128
norm = None  # falsy, so gg trains with an unweighted BCELoss
# gg trains model_A in place and returns None, so `hist` carries no history.
hist = gg(model_A, _train, device, norm,
                lr=lr, epochs=epochs, batch_size=batch_size)

Training start!
Training complete!


In [48]:
# Test features restricted to the same columns the model was trained on.
xxx = torch.Tensor(vec_test.toarray())[:, ok_idx].to(device)

# Inference mode: no gradient tracking, and eval() so that any train-only
# layer behaviour is switched off.
model_A.eval()
with torch.no_grad():
    res = model_A(xxx)

# Round to 0/1 in torch and convert to a NumPy array explicitly, instead of
# relying on NumPy and pandas to coerce a torch tensor implicitly.
# NOTE(review): view(-1) flattens every (document, label) output row-major into
# one column -- confirm this is the layout the consumer of the CSV expects.
rr = torch.round(res).view(-1).cpu().numpy()
pd.DataFrame(rr, columns=['predictions']).to_csv('tfidf_my.csv')