Contains code for Model6, which uses an RNN

In [252]:
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from typing import List

# vocabulary size: width of the one-hot vectors built by textToTensor
NUM_WORDS = 5000
# NOTE(review): not referenced in the visible cells — presumably the total number of author ids; confirm
NUM_AUTHORS = 21246
# NOTE(review): not referenced in the visible cells — presumably the longest token sequence; confirm
MAX_LEN = 491

In [253]:
def load_data_set(path: str):
    """
    Reads the JSON file at ``path`` and returns its records as a pandas data frame.

    The number of loaded records is printed as a side effect.
    """
    with open(path) as handle:
        records = json.load(handle)

    print(f"loaded {len(records)} instances")
    return pd.json_normalize(records)

In [254]:
# load the raw training records; the path is relative to the notebook's working directory
path = "../data/train.json"
train = load_data_set(path)
train.head()

loaded 25793 instances


Unnamed: 0,authors,year,abstract,venue,title
0,"[42, 13720, 36]",9,"[2455, 1858, 2335, 1543, 1800, 1860, 2000, 286...",20.0,"[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1..."
1,"[1359, 15881, 45]",15,"[40, 1542, 1691, 2449, 1535, 3616, 2206, 1904,...",2.0,"[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3..."
2,"[19166, 17763]",17,"[40, 1542, 1691, 2449, 1535, 2610, 1543, 1535,...",,"[2085, 1719, 1846, 1745, 2243, 1553, 1606, 159..."
3,[97],10,"[46, 1624, 1547, 56, 1687, 1644, 6, 7, 3386, 1...",4.0,"[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5..."
4,"[19617, 2]",10,"[37, 3709, 3836, 1586, 2151, 1727, 3021, 1860,...",9.0,"[38, 1592, 2088, 1543, 1574, 1727, 1597, 1813,..."


## Preprocessing


First, we efficiently perform the text transformation using pandas. Once completed, we convert the data into the format required for training with `pytorch`: a dictionary whose keys are the labels and whose values are lists of examples.

In [255]:
def preprocess(df: pd.DataFrame, train=True, drop_samples=False, max_len=200):
    """
    Builds the model-ready frame from the raw paper records.

    Parameters
    ----------
    df : raw data frame; must contain "title" and "abstract" columns holding
        lists of token ids, plus an "authors" column when ``train`` is True.
    train : when True, "authors" is split into "target authors" (ids < 100)
        and "coauthors" via ``filter_authors``, and overly long texts are dropped.
    drop_samples : when True, rows without any target author are removed.
        Ignored if the frame has no "target authors" column (e.g. unlabeled
        data processed with ``train=False``).
    max_len : maximum number of tokens in title + abstract that a training
        example may have (only applied when ``train`` is True).

    Returns
    -------
    A new data frame (the input is not modified) with a "text" column holding
    title tokens followed by abstract tokens, plus "target authors" when it
    was built. All other known feature columns are removed.
    """
    df = df.copy(deep=True)

    if train:
        df["target authors"] = df["authors"].apply(lambda x: filter_authors(x))
        df["coauthors"]      = df["authors"].apply(lambda x: filter_authors(x, prolifics=False))
        df = df.drop(["authors"], axis=1)

    # drops samples containing no prolific authors; skipped when no targets exist
    if drop_samples and "target authors" in df.columns:
        has_target = df["target authors"].apply(lambda x: len(x) > 0).astype(bool)
        # explicit copy so the column assignment below does not write to a slice
        df = df[has_target].copy()

    # text transformation: list concatenation, title tokens first
    df["text"] = df["title"] + df["abstract"]

    # drop unnecessarily long examples
    if train:
        short_enough = df["text"].apply(lambda x: len(x)) <= max_len
        df = df[short_enough]

    # drop the columns that are no longer needed; some of them only exist on
    # certain code paths, so missing ones are ignored instead of raising KeyError
    df = df.drop(
        ["abstract", "title", "year", "coauthors", "venue", "len"],
        axis=1,
        errors="ignore",
    )
    return df

In [256]:
# feature transformation

def filter_authors(authors: List[int], prolifics=True):
    """
    Splits an author list by id.

    With ``prolifics`` truthy, returns the ids below 100 (the prolific authors);
    otherwise returns the ids that are 100 or above (the coauthors).
    Input order is preserved.
    """
    if prolifics:
        return [author for author in authors if author < 100]
    return [author for author in authors if author >= 100]

In [257]:
# keep only papers with at least one target author (id < 100) and build the combined text column
df = preprocess(train, drop_samples=True)
df.head()

Unnamed: 0,target authors,text
0,"[42, 36]","[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1..."
1,[45],"[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3..."
3,[97],"[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5..."
4,[2],"[38, 1592, 2088, 1543, 1574, 1727, 1597, 1813,..."
9,"[44, 2]","[1560, 1694, 11, 1546, 11, 3066, 1728, 47, 160..."


In [258]:
df.shape

(5339, 2)

In [259]:
def training_eval_split(author: int, df: pd.DataFrame, resampling=None):
    """
    Builds the binary train / validation sets for a single author.

    Each row gets label 1 when ``author`` appears in its "target authors" list
    and 0 otherwise. The rows are then split 70 / 30 with a fixed random state.
    If ``resampling`` is given, it is called as ``resampling(X_train, y_train)``
    and its result replaces the training split (the validation split is untouched).

    Returns ``(train, val)``, each a data frame holding the text and label columns.
    """
    labelled = df.copy(deep=True)
    labelled["label"] = labelled["target authors"].apply(
        lambda targets: 1 if author in targets else 0
    )

    X_train, X_val, y_train, y_val = train_test_split(
        labelled["text"],
        labelled["label"],
        test_size=0.3,
        random_state=42069,
    )

    if resampling:
        X_train, y_train = resampling(X_train, y_train)

    train_frame = pd.concat([X_train, y_train], axis=1)
    val_frame = pd.concat([X_val, y_val], axis=1)
    return train_frame, val_frame

In [260]:
def torch_preprocessing(df:pd.DataFrame):
    """
    Groups the "text" entries of ``df`` by their "label" value.

    Returns a dictionary ``{1: [...], 0: [...]}`` where each value is the list
    of texts carrying that label, in the frame's row order.
    """
    return {
        label: df.loc[df['label'] == label, 'text'].tolist()
        for label in (1, 0)
    }

In [261]:
# we test results for a single author
test_author = 10
# NOTE: this rebinds `train`, which until now held the raw data frame loaded from disk
train, val = training_eval_split(test_author, df)

In [262]:
train["label"].value_counts()

0    3644
1      93
Name: label, dtype: int64

In [263]:
val["label"].value_counts()

0    1570
1      32
Name: label, dtype: int64

In [264]:
# group the token sequences by label: {1: [...], 0: [...]}
train_samples = torch_preprocessing(train)
val_samples   = torch_preprocessing(val)

## Model

In [287]:
# seeding is disabled, so weight initialisation and sampling differ between runs
# torch.manual_seed(RANDOM_STATE)

# NOTE(review): NUM_EPOCHS, BATCH_SIZE, DEVICE and EMBEDDING_DIM are not
# referenced by any of the visible cells — confirm whether they are still needed
NUM_EPOCHS = 10
NUM_ITERS  = 15000  # number of single-sample updates performed by the training loop
BATCH_SIZE = 64
DEVICE     = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

LEARNING_RATE = 0.001  # step size of the manual gradient update in train()
EMBEDDING_DIM = 64
HIDDEN_DIM    = 32     # size of the RNN hidden state
NUM_CLASSES   = 2      # output size of the classifier
CLASSES = [0, 1]       # label values; index i of the model output maps to CLASSES[i]

PLOT_EVERY  = 500      # interval (in iterations) for printing progress and recording the average loss

In [288]:
class RNNClassifier(nn.Module):
    """
    Single-step recurrent classifier.

    Each call consumes one input vector together with the previous hidden
    state and yields class log-probabilities plus the next hidden state.
    Callers unroll it over a sequence themselves.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        # input and previous hidden state are concatenated before this layer
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        # the output is read from the freshly computed hidden state
        self.h2o = nn.Linear(hidden_size, output_size)
        self.activation = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """
        Runs one recurrence step.

        ``input`` and ``hidden`` are concatenated along dim 1, so both need the
        same size along dim 0. Returns ``(log_probs, next_hidden)``.
        """
        joined = torch.cat((input, hidden), 1)
        next_hidden = self.activation(self.i2h(joined))
        log_probs = self.softmax(self.h2o(next_hidden))
        return log_probs, next_hidden

    def initHidden(self):
        """Returns the all-zero initial hidden state of shape (1, hidden_size)."""
        return torch.zeros(1, self.hidden_size)

In [289]:
# convert to OHE tensors
def textToTensor(text):
    """
    One-hot encodes a sequence of token ids.

    Returns a float tensor of shape (len(text), 1, NUM_WORDS) in which, for
    every position, the entry at that position's token id is 1 and all other
    entries are 0.
    """
    one_hot = torch.zeros(len(text), 1, NUM_WORDS)
    for position, token_id in enumerate(text):
        one_hot[position][0][token_id] = 1
    return one_hot

In [290]:
# test a sample instance that the model is feeding correctly 
# draw random token ids from the vocabulary that textToTensor one-hot encodes
# over, so the sample stays valid if NUM_WORDS is changed
test = np.random.randint(NUM_WORDS, size=50)
test = textToTensor(test)
hidden = torch.zeros(1, HIDDEN_DIM)

In [291]:
# the model must exist before it can be probed: without this line the cell
# only works if the training cell further down has already been executed
rnn = RNNClassifier(NUM_WORDS, HIDDEN_DIM, NUM_CLASSES)

# feed the first one-hot token; an untrained model should give roughly equal log-probabilities
output, next_input = rnn(test[0], hidden)
print(output)

tensor([[-0.6902, -0.6961]], grad_fn=<LogSoftmaxBackward>)


### Training

We now perform training for this specific `test_author` and print training loss to confirm that the model is learning 

In [292]:
import random

def categoryFromOutput(output):
    """
    Maps a model output to its predicted class.

    Takes the index of the largest entry along the last dimension of the first
    row of ``output`` and returns ``(CLASSES[index], index)``.
    """
    _, best_indices = output.topk(1)
    category_i = best_indices[0].item()
    return CLASSES[category_i], category_i


def randomChoice(l):
    """Returns one element of the sequence ``l``, picked uniformly at random."""
    position = random.randrange(0, len(l))
    return l[position]


def randomTrainingSample(samples: dict):
    """
    Draws one random training example.

    A label is chosen uniformly from CLASSES first and a sample is then chosen
    uniformly from ``samples[label]``, so both classes are drawn equally often
    regardless of how many examples each one holds.

    Returns ``(label, sample, label_tensor, sample_tensor)`` where the tensors
    are the label wrapped in a 1-element tensor and the one-hot encoded sample.
    """
    label = randomChoice(CLASSES)
    sample = randomChoice(samples[label])
    return label, sample, torch.tensor([label]), textToTensor(sample)

In [293]:
# draw a few random samples to sanity-check the sampler; only the first four token ids are printed
for i in range(3):
    category, line, category_tensor, line_tensor = randomTrainingSample(train_samples)
    print('category =', category, '/ line =', line[:4])
    # print(line_tensor)

category = 1 / line = [53, 1584, 2135, 1784]
category = 0 / line = [1564, 1661, 2193, 1745]
category = 0 / line = [2600, 3057, 1578, 1528]


In [None]:
import time
import math


# running sum of the loss since the last report, and the reported averages
current_loss = 0
all_losses = []

rnn = RNNClassifier(NUM_WORDS, HIDDEN_DIM, NUM_CLASSES)
# RNNClassifier already ends in LogSoftmax, so the matching criterion is NLLLoss.
# CrossEntropyLoss would apply log-softmax a second time; that is a no-op on
# log-probabilities, so the loss values are unchanged.
criterion = nn.NLLLoss()

def timeSince(since):
    """
    Formats the wall-clock time elapsed since ``since`` (a ``time.time()``
    timestamp) as ``'<minutes>m <seconds>s'`` with whole numbers.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)
start = time.time()

# training algorithm, which takes one instance and performs single SGD update
def train(category_tensor, line_tensor):
    """
    Performs one SGD update on the global ``rnn`` from a single example.

    The network is unrolled over the first dimension of ``line_tensor``; the
    output of the final step is scored against ``category_tensor`` with the
    global ``criterion``. Returns ``(last_output, loss_value)``.
    """
    rnn.zero_grad()
    hidden = rnn.initHidden()

    # feed the sequence one symbol at a time, carrying the hidden state forward
    for step in range(line_tensor.size(0)):
        output, hidden = rnn(line_tensor[step], hidden)

    # only the last step's output is used as the prediction
    loss = criterion(output, category_tensor)
    loss.backward()

    # plain gradient descent: param <- param - LEARNING_RATE * grad
    for param in rnn.parameters():
        param.data.add_(param.grad.data, alpha=-LEARNING_RATE)

    return output, loss.item()

# `step` is used instead of `iter` so the builtin iter() is not shadowed for the rest of the notebook
for step in range(1, NUM_ITERS + 1):
    category, line, category_tensor, line_tensor = randomTrainingSample(train_samples)
    output, loss = train(category_tensor, line_tensor)
    current_loss += loss

    if step % PLOT_EVERY == 0:
        # Print step number, progress, elapsed time, loss, guess and whether it was correct
        guess, guess_i = categoryFromOutput(output)
        correct = '✓' if guess == category else '✗ (%s)' % category
        print('%d %d%% (%s) %.4f %s / %s' % (step, step / NUM_ITERS * 100, timeSince(start), loss, guess, correct))

        # Record the average loss over the last PLOT_EVERY steps and restart the running sum
        all_losses.append(current_loss / PLOT_EVERY)
        current_loss = 0

500 3% (0m 12s) 0.7382 1 / ✗ (0)
1000 6% (0m 25s) 0.7137 1 / ✗ (0)
1500 10% (0m 37s) 0.7273 1 / ✗ (0)
2000 13% (0m 50s) 0.6612 1 / ✓
2500 16% (1m 3s) 0.6809 1 / ✓
3000 20% (1m 16s) 0.7139 1 / ✗ (0)
3500 23% (1m 28s) 0.6758 1 / ✓
4000 26% (1m 41s) 0.6736 1 / ✓
4500 30% (1m 54s) 0.7067 1 / ✗ (0)
5000 33% (2m 6s) 0.6618 1 / ✓
5500 36% (2m 19s) 0.7174 1 / ✗ (0)
6000 40% (2m 32s) 0.7100 1 / ✗ (0)
6500 43% (2m 45s) 0.6775 1 / ✓
7000 46% (2m 59s) 0.6912 0 / ✓
7500 50% (3m 13s) 0.6997 0 / ✗ (1)
8000 53% (3m 26s) 0.6893 0 / ✓
8500 56% (3m 39s) 0.7084 0 / ✗ (1)
9000 60% (3m 52s) 0.6742 0 / ✓
9500 63% (4m 7s) 0.6739 0 / ✓
10000 66% (4m 22s) 0.6981 0 / ✗ (1)
10500 70% (4m 36s) 0.6873 1 / ✓
11000 73% (4m 50s) 0.7053 0 / ✗ (1)
11500 76% (5m 4s) 0.6883 0 / ✓
12000 80% (5m 18s) 0.7082 0 / ✗ (1)
12500 83% (5m 33s) 0.6789 0 / ✓


In [None]:
# Each average was recorded at iterations PLOT_EVERY, 2 * PLOT_EVERY, ...
# Deriving x from the recorded losses keeps both axes the same length even
# when training was stopped before NUM_ITERS.
loss_iters = [PLOT_EVERY * (k + 1) for k in range(len(all_losses))]

plt.figure()
plt.xlabel('Iterations')
plt.ylabel('Loss (Train)')
plt.plot(loss_iters, all_losses);

## Evaluation

We now evaluate the trained model's performance on the validation set.

In [248]:
def prediction(sample_tensor):
    """
    Classifies one sequence with the global ``rnn``.

    Parameters
    ----------
    sample_tensor : either an already one-hot encoded tensor (first dimension
        is the sequence position) or any sequence of token ids, which is
        encoded with ``textToTensor`` first.

    Returns
    -------
    The index of the largest entry of the final step's output, as a Python int.

    Raises
    ------
    ValueError : if the sequence is empty, since there is no output to classify.
    """
    # anything that is not yet a tensor (list, tuple, numpy array, ...) gets encoded
    if not torch.is_tensor(sample_tensor):
        sample_tensor = textToTensor(sample_tensor)

    if sample_tensor.size()[0] == 0:
        raise ValueError("cannot classify an empty sequence")

    hidden = rnn.initHidden()

    # inference only: skip building the autograd graph for every step
    with torch.no_grad():
        for i in range(sample_tensor.size()[0]):
            output, hidden = rnn(sample_tensor[i], hidden)

    classification = torch.argmax(output).item()
    return classification

In [249]:
def evaluate_model(validation_samples):
    """
    Scores the model on a validation set and prints F1, recall and precision.

    ``validation_samples`` is a dictionary with the positive examples under
    key 1 and the negative examples under key 0. Every example is classified
    with ``prediction``. Nothing is returned.
    """
    positives = validation_samples[1]
    negatives = validation_samples[0]
    samples = positives + negatives

    # ground truth follows the concatenation order: positives first, then negatives
    labels = np.concatenate([np.ones(len(positives)), np.zeros(len(negatives))])

    preds = np.zeros(len(samples))
    for idx in tqdm(range(len(samples))):
        preds[idx] = prediction(samples[idx])

    print(f"f1 : {f1_score(labels, preds)}")
    print(f"recall : {recall_score(labels, preds)}")
    print(f"precision: {precision_score(labels, preds)}")

In [250]:
evaluate_model(val_samples)

100%|██████████| 2207/2207 [00:25<00:00, 86.40it/s] 

f1 : 0.04128440366972477
recall : 0.9782608695652174
precision: 0.021087160262417994





**Training - Test Split & Resampling**

As we are building 100 classifiers, one for each author `0, ..., 99`, we use the following functions to build data sets for each author.

In [11]:
RANDOM_STATE = 42069

def upsample_training(X_train, y_train):
    """
    Balances the classes by oversampling the positive (label 1) rows.

    Positives are drawn with replacement until there are as many of them as
    negatives (label 0). Returns ``(X_train, y_train)`` for the balanced set,
    where ``X_train`` is a data frame without the "label" column and
    ``y_train`` is the "label" column.
    """
    combined = pd.concat([X_train, y_train], axis=1)

    minority = combined[combined["label"] == 1]
    majority = combined[combined["label"] == 0]

    minority_upsampled = resample(
        minority,
        replace=True,
        n_samples=len(majority),
        random_state=RANDOM_STATE,
    )

    balanced = pd.concat([majority, minority_upsampled])
    return balanced.drop(["label"], axis=1), balanced["label"]


def downsample_training(X_train, y_train):
    """
    Downsamples the negative (label 0) class until class balance is achieved.

    As many negatives as there are positives (label 1) are kept. They are
    drawn without replacement so that no negative example appears twice.
    Replacement is only used in the degenerate case where there are fewer
    negatives than positives, because sampling without replacement cannot
    return more rows than exist.

    Returns ``(X_train, y_train)`` for the balanced set, where ``X_train`` is
    a data frame without the "label" column and ``y_train`` is the "label" column.
    """
    X = pd.concat([X_train, y_train], axis=1)

    pos = X[X["label"] == 1]
    neg = X[X["label"] == 0]

    neg_downsample = resample(
        neg,
        replace=len(pos) > len(neg),
        n_samples=len(pos),
        random_state=RANDOM_STATE,
    )

    resampled = pd.concat([pos, neg_downsample])

    y_train = resampled["label"]
    X_train = resampled.drop(["label"], axis=1)
    return X_train, y_train


def resample_training(X_train, y_train):
    """
    Rebalances the training data with SMOTE and returns the resampled
    ``(X_train, y_train)`` pair:
    https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html
    """
    # NOTE(review): the other cells pass a column of token-id lists as X_train;
    # confirm that SMOTE accepts that input before using this strategy
    sampler = SMOTE(random_state=RANDOM_STATE)
    return sampler.fit_resample(X_train, y_train)