This notebook contains the code for Model6, which uses an RNN.

In [221]:
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from typing import List

# seed passed to torch.manual_seed, train_test_split, resample and SMOTE further down
RANDOM_STATE = 42069
# vocabulary size: width of the one-hot vectors built by textToTensor and the RNN input size
NUM_WORDS = 5000
# NOTE(review): not referenced in the visible part of the notebook; presumably the
# number of distinct author ids in the data - confirm before relying on it
NUM_AUTHORS = 21246
# maximum title+abstract length (in tokens); same value as the length cut-off applied in preprocess
MAX_LEN = 491

In [114]:
def load_data_set(path: str):
    """
    Read the JSON data set stored at `path` and return it as a pandas data frame.

    The number of top-level records is printed, then the records are
    flattened into columns with `pd.json_normalize`.
    """
    with open(path) as handle:
        records = json.load(handle)

    print(f"loaded {len(records)} instances")
    return pd.json_normalize(records)

In [115]:
# load the raw training set (path is relative to the notebook's working directory)
# NOTE: the name `train` is rebound further down to the training-step function,
# so this cell has to be re-run before `preprocess(train, ...)` can be called again
path = "../data/train.json"
train = load_data_set(path)
train.head()

loaded 25793 instances


Unnamed: 0,authors,year,abstract,venue,title
0,"[42, 13720, 36]",9,"[2455, 1858, 2335, 1543, 1800, 1860, 2000, 286...",20.0,"[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1..."
1,"[1359, 15881, 45]",15,"[40, 1542, 1691, 2449, 1535, 3616, 2206, 1904,...",2.0,"[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3..."
2,"[19166, 17763]",17,"[40, 1542, 1691, 2449, 1535, 2610, 1543, 1535,...",,"[2085, 1719, 1846, 1745, 2243, 1553, 1606, 159..."
3,[97],10,"[46, 1624, 1547, 56, 1687, 1644, 6, 7, 3386, 1...",4.0,"[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5..."
4,"[19617, 2]",10,"[37, 3709, 3836, 1586, 2151, 1727, 3021, 1860,...",9.0,"[38, 1592, 2088, 1543, 1574, 1727, 1597, 1813,..."


## Preprocessing


First, we perform the text transformation efficiently using pandas. Once that is complete, we convert the data into the format needed for training with `pytorch`: a dictionary whose keys are the class labels and whose values are lists of examples.

In [116]:
def preprocess(df: pd.DataFrame, train=True, drop_samples=False):
    """
    Build the modelling frame: a `text` column (title tokens followed by
    abstract tokens) plus, for training data, a `target authors` column.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame with `title` and `abstract` columns holding lists of token
        ids. When `train` is True an `authors` column is required as well.
    train : bool
        If True, `authors` is split with `filter_authors` into
        `target authors` (kept) and `coauthors` (dropped again below), and
        rows whose text is longer than MAX_LEN tokens are removed.
    drop_samples : bool
        If True, keep only rows with at least one target author. This needs
        the `target authors` column, i.e. `train=True`; otherwise a KeyError
        is raised.

    Returns
    -------
    pd.DataFrame
        A new frame (the input is not modified) without the raw columns
        `abstract`, `title`, `year`, `coauthors` and `venue`. The original
        row index is preserved.
    """
    df = df.copy(deep=True)

    if train:
        df["target authors"] = df["authors"].apply(filter_authors)
        df["coauthors"] = df["authors"].apply(lambda ids: filter_authors(ids, prolifics=False))
        df = df.drop(columns=["authors"])

    # drop samples containing no prolific authors; together with the length
    # filter below this leaves 7356 of the 25793 training rows in this notebook
    if drop_samples:
        has_target = df["target authors"].apply(len) > 0
        df = df[has_target].copy()

    # text transformation: list concatenation, title first
    df["text"] = df["title"] + df["abstract"]

    # drop unnecessarily long examples (training data only)
    if train:
        within_limit = df["text"].apply(len) <= MAX_LEN
        df = df[within_limit]

    # errors="ignore": `coauthors` only exists when it was created above or
    # shipped with the input, so the train=False path must not fail on it
    return df.drop(
        columns=["abstract", "title", "year", "coauthors", "venue"],
        errors="ignore",
    )

In [117]:
# feature transformation

def filter_authors(authors: List[int], prolifics=True):
    """
    Split an author list at id 100.

    Returns the ids below 100 (the prolific authors) when `prolifics` is
    truthy, otherwise the ids of 100 and above (the co-authors). The input
    order is kept.
    """
    if prolifics:
        return [author for author in authors if author < 100]
    return [author for author in authors if author >= 100]

In [118]:
# training-mode preprocessing; drop_samples=True keeps only rows with at least one target author
df = preprocess(train, drop_samples=True)
df.head()

Unnamed: 0,target authors,text
0,"[42, 36]","[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1..."
1,[45],"[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3..."
3,[97],"[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5..."
4,[2],"[38, 1592, 2088, 1543, 1574, 1727, 1597, 1813,..."
9,"[44, 2]","[1560, 1694, 11, 1546, 11, 3066, 1728, 47, 160..."


In [119]:
# sanity check: (rows, columns) remaining after preprocessing
df.shape

(7356, 2)

In [187]:
def torch_preprocessing(author: int, df:pd.DataFrame):
    """
    Split the `text` examples of `df` into positives and negatives for `author`.

    Returns a dictionary with key 1 -> list of texts whose `target authors`
    contain `author`, and key 0 -> list of all remaining texts. `df` itself
    is left unchanged.
    """
    is_author = df["target authors"].apply(lambda ids: author in ids).astype(bool)
    return {
        1: df.loc[is_author, "text"].tolist(),
        0: df.loc[~is_author, "text"].tolist(),
    }

In [188]:
# convert to OHE tensors
def textToTensor(text):
    """
    One-hot encode a sequence of word ids.

    Returns a tensor of shape (len(text), 1, NUM_WORDS) that is zero
    everywhere except for a single 1 per position, placed at the index given
    by that position's word id.
    """
    one_hot = torch.zeros(len(text), 1, NUM_WORDS)
    for position, word_id in enumerate(text):
        one_hot[position, 0, word_id] = 1
    return one_hot

In [189]:
# we test results for a single author; `samples` maps label (1 / 0) -> list of texts
# and is read as a global by randomTrainingSample
test_author = 69
samples     = torch_preprocessing(test_author, df)

## Model

In [229]:
# Seed every random number generator the notebook draws from so a fresh run is repeatable.
# torch: weight initialisation; numpy: randomChoice and the smoke-test input use np.random.
torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# NOTE(review): NUM_EPOCHS, BATCH_SIZE, DEVICE and EMBEDDING_DIM are not referenced
# in the visible part of the notebook - confirm whether they are still needed
NUM_EPOCHS = 128
NUM_ITERS  = 5000    # number of single-example training steps in the training loop
BATCH_SIZE = 64
# 'cuda:1' addresses the GPU with index 1 whenever CUDA is available
DEVICE     = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
LEARNING_R = 0.005   # step size of the manual SGD update in train()

EMBEDDING_DIM = 64
HIDDEN_DIM    = 128  # size of the RNN hidden state
NUM_CLASSES   = 2    # binary task: target author vs. not


PRINT_EVERY = 50000 # used for printing
PLOT_EVERY  = 100   # the loss is averaged over this many iterations per plot point

In [144]:
# RNN Classifier from workshop 9
class RNN(nn.Module):
    """
    Single-layer recurrent classifier that is called once per time step.

    Each call concatenates the current input with the previous hidden state,
    maps the result through a linear layer and tanh to obtain the new hidden
    state, and maps that hidden state to log-probabilities over the classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        # (input, previous hidden) -> new hidden
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        # new hidden -> class scores
        self.h2o = nn.Linear(hidden_size, output_size)
        self.activation = nn.Tanh()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """
        Process one time step.

        `input` and `hidden` are concatenated along dimension 1. Returns the
        pair (log-probabilities, new hidden state).
        """
        step_input = torch.cat([input, hidden], dim=1)
        new_hidden = self.activation(self.i2h(step_input))
        log_probs = self.softmax(self.h2o(new_hidden))
        return log_probs, new_hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, hidden_size)."""
        return torch.zeros((1, self.hidden_size))

In [145]:
# one-hot word vectors in, NUM_CLASSES log-probabilities out; no device is applied here
rnn = RNN(NUM_WORDS, HIDDEN_DIM, NUM_CLASSES)

In [152]:
# test a sample instance that the model is feeding correctly:
# 50 random word ids drawn from the vocabulary, one-hot encoded
test_word_ids = np.random.randint(NUM_WORDS, size=50)  # upper bound tied to the vocabulary size
test = textToTensor(test_word_ids)
hidden = rnn.initHidden()  # same zero state that train() starts from

In [151]:
# feed only the first time step; the second return value is the updated hidden state
output, next_input = rnn(test[0], hidden)
print(output)

tensor([[-0.7226, -0.6645]], grad_fn=<LogSoftmaxBackward>)


### Training

In [194]:
def classification(model_output):
    """
    Return the position of the largest entry of `model_output` as a Python int.

    No dimension is given to argmax, so the position refers to the flattened
    tensor.
    """
    predicted = model_output.argmax()
    return predicted.item()


def randomChoice(l):
    """
    Return one element of the non-empty sequence `l`, chosen uniformly at random.

    `np.random.randint` excludes its upper bound, so the bound has to be
    `len(l)` for the last element to be reachable (with `len(l) - 1` a
    two-element list would always yield its first element).

    Raises ValueError (from numpy) when `l` is empty.
    """
    return l[np.random.randint(low=0, high=len(l))]


def randomTrainingSample():
    """
    Draw one training example from the global `samples` dictionary.

    A class label is picked with `randomChoice([0, 1])` (binary task), then
    one text of that class is picked with `randomChoice`. Returns the pair
    (label tensor holding the single label, one-hot tensor of the text).
    """
    label = randomChoice([0, 1])
    word_ids = randomChoice(samples[label])
    encoded_text = textToTensor(word_ids)
    return torch.tensor([label]), encoded_text

In [202]:
# NLLLoss consumes log-probabilities, which is what RNN.forward produces via LogSoftmax
criterion = nn.NLLLoss()

# NOTE: this definition rebinds the name `train`, which earlier in the notebook
# refers to the raw training data frame.
def train(label_tensor, sample_tensor):
    """
    Run one SGD step of the global `rnn` on a single example.

    Parameters
    ----------
    label_tensor : tensor holding the class index of the example
        (as built by randomTrainingSample).
    sample_tensor : one-hot encoded text; it is consumed one slice at a time
        along its first dimension.

    Returns
    -------
    (output, loss) : the model output after the last time step and the loss
        of this example as a Python float.

    Raises
    ------
    ValueError
        If `sample_tensor` has no time steps (there would be no output to
        compute a loss from).
    """
    seq_len = sample_tensor.size(0)
    if seq_len == 0:
        raise ValueError("sample_tensor must contain at least one time step")

    hidden = rnn.initHidden()
    rnn.zero_grad()

    # only the output of the final time step is used for the loss
    for i in range(seq_len):
        output, hidden = rnn(sample_tensor[i], hidden)

    loss = criterion(output, label_tensor)
    loss.backward()

    # plain SGD update: p <- p - LEARNING_R * grad; no_grad keeps the in-place
    # step out of the autograd graph
    with torch.no_grad():
        for p in rnn.parameters():
            p.add_(p.grad, alpha=-LEARNING_R)

    return output, loss.item()

In [None]:
# sample of training
import math
import time

# running sum of the losses since the last plot point (reset in the training loop)
current_loss = 0
# one averaged loss per PLOT_EVERY iterations, appended by the training loop
all_losses = []

def timeSince(since):
    """
    Format the wall-clock time elapsed since `since` (a `time.time()` stamp)
    as '<minutes>m <seconds>s', both truncated to whole numbers.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

start = time.time()


# one randomly drawn example per iteration; each call to train() is a full SGD step
for iteration in tqdm(range(1, NUM_ITERS + 1)):
    category_tensor, line_tensor = randomTrainingSample()

    category = category_tensor.item()
    output, loss = train(category_tensor, line_tensor)
    current_loss += loss

    # Print iteration number, progress in percent, elapsed time, loss and guess
    if iteration % PRINT_EVERY == 0:
        guess  = classification(output)
        correct = '✓' if guess == category else '✗ (%s)' % category
        print('%d %d%% (%s) %.4f / %s %s' % (iteration, iteration / NUM_ITERS * 100, timeSince(start), loss, guess, correct))

    # Add current loss avg to list of losses
    if iteration % PLOT_EVERY == 0:
        all_losses.append(current_loss / PLOT_EVERY)
        current_loss = 0

 12%|█▏        | 605/5000 [01:29<15:46,  4.64it/s]

In [None]:
fig, ax = plt.subplots()
ax.set_title('Training loss')
ax.set_xlabel('Iterations')
ax.set_ylabel('Loss (Train)')
# every entry of all_losses is the mean over the PLOT_EVERY iterations ending at
# that point, so the x values are derived from what was actually recorded; this
# also keeps the plot working when training was stopped before NUM_ITERS
iterations_logged = [PLOT_EVERY * (k + 1) for k in range(len(all_losses))]
ax.plot(iterations_logged, all_losses);

**Training - Test Split & Resampling**

As we are building 100 classifiers, one for each author `0, ..., 99`, we use the following function to build the data set for each author.

In [23]:
def get_train_test_split(author: int, df:pd.DataFrame, resampling=None):
    """
    Build a binary train/validation split for a single author.

    A row is labelled 1 when `author` appears in its `target authors` entry
    and 0 otherwise. The features are all columns of `df` except
    `target authors`. 20% of the rows are held out for validation, with a
    fixed random state for reproducibility.

    `resampling`, when given, is called as `resampling(X_train, y_train)` and
    must return the replacement `(X_train, y_train)`; the validation part is
    never resampled.

    Returns `X_train, X_val, y_train, y_val`.
    """
    labelled = df.copy(deep=True)
    labelled["label"] = labelled["target authors"].apply(lambda targets: int(author in targets))

    y = labelled["label"]
    X = labelled.drop(columns=["label", "target authors"])

    split = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
    X_train, X_val, y_train, y_val = split

    if resampling:
        X_train, y_train = resampling(X_train, y_train)

    return X_train, X_val, y_train, y_val

In [15]:
def upsample_training(X_train, y_train):
    """
    Balance the classes by oversampling the label-1 rows.

    Rows with label 1 are drawn with replacement until there are as many of
    them as rows with label 0. `y_train` must be a Series named "label".
    Returns the resampled `(X_train, y_train)`, label-0 rows first.
    """
    combined = pd.concat([X_train, y_train], axis=1)

    positives = combined.loc[combined["label"] == 1]
    negatives = combined.loc[combined["label"] == 0]

    positives_upsampled = resample(
        positives,
        replace=True,
        n_samples=len(negatives),
        random_state=RANDOM_STATE,
    )

    balanced = pd.concat([negatives, positives_upsampled])
    return balanced.drop(columns=["label"]), balanced["label"]


def downsample_training(X_train, y_train):
    """
    Balance the classes by downsampling the larger class.

    The larger of the two classes (label 1 vs. label 0) is reduced to the
    size of the smaller one by sampling WITHOUT replacement, so no row is
    duplicated and no more rows are discarded than necessary. `y_train` must
    be a Series named "label".

    Returns the resampled `(X_train, y_train)`, label-1 rows first.
    """
    X = pd.concat([X_train, y_train], axis=1)

    pos = X[X["label"] == 1]
    neg = X[X["label"] == 0]

    # shrink whichever class is larger; replace=False is valid because the
    # requested size never exceeds the size of the class being sampled
    if len(neg) >= len(pos):
        neg = resample(neg, replace=False, n_samples=len(pos), random_state=RANDOM_STATE)
    else:
        pos = resample(pos, replace=False, n_samples=len(neg), random_state=RANDOM_STATE)

    resampled = pd.concat([pos, neg])

    y_train = resampled["label"]
    X_train = resampled.drop(columns=["label"])
    return X_train, y_train


def resample_training(X_train, y_train):
    """
    Balance the classes with SMOTE, seeded with RANDOM_STATE.

    Reference:
    https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html

    Returns the resampled `(X_train, y_train)`.
    """
    sampler = SMOTE(random_state=RANDOM_STATE)
    X_balanced, y_balanced = sampler.fit_resample(X_train, y_train)
    return X_balanced, y_balanced