The previous two models have been more *classical* feature-based machine learning models. This newly proposed model takes advantage of the fact that the text encoding is informative, and better addresses the variable input size of the text with an RNN model.

The only feature for this model is the text data. Its predictions can be ensembled with those of other feature-based models to recover the information lost by dropping coauthors and venue.

We also (finally) get around to properly performing training/validation splits.

In [1]:
import json
import csv
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from typing import List
from rnn import RNN

# Seed for the train/validation split; it is the default `random_state`
# of split_train_data.
RANDOM_STATE = 69

In [4]:
def load_data_set(path: str) -> pd.DataFrame:
    """
    Load the JSON data set located at ``path`` and return it as a pandas
    data frame.

    The file is parsed with ``json.load`` and flattened with
    ``pd.json_normalize``. The number of loaded instances is printed.

    Args:
        path: location of the JSON file to read.

    Returns:
        The data frame produced by ``pd.json_normalize`` on the parsed JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # default encoding, which differs between operating systems.
    with open(path, encoding="utf-8") as file:
        data = json.load(file)

    print(f"loaded {len(data)} instances")
    return pd.json_normalize(data)

In [5]:
def preprocess(df: pd.DataFrame, train=True) -> pd.DataFrame:
    """
    Reduce a raw data frame to the columns used by the text model.

    In training mode the "authors" column is split with ``filter_authors``
    into "target authors" and "coauthors", rows without any target author
    are discarded, and "authors" is dropped. In both modes a "text" column
    is built as ``title + abstract`` and the "year", "abstract", "title",
    "venue" and "coauthors" columns are dropped.

    Args:
        df: raw data frame. Must contain "title", "abstract", "year",
            "venue", plus "authors" when ``train`` is true or "coauthors"
            when it is false.
        train: whether ``df`` is training data (has an "authors" column).

    Returns:
        A new data frame; the frame passed in is left unmodified.
    """
    # Work on a copy: assigning new columns directly would silently add
    # them to the caller's frame as well.
    df = df.copy()

    if train:
        df["target authors"] = df["authors"].apply(filter_authors)
        df["coauthors"] = df["authors"].apply(
            lambda x: filter_authors(x, prolifics=False)
        )
        # Keep only the rows that have at least one target author.
        has_target = df["target authors"].apply(len) > 0
        df = df[has_target].drop(["authors"], axis=1)

    df["text"] = df["title"] + df["abstract"]
    return df.drop(["year", "abstract", "title", "venue", "coauthors"], axis=1)

In [6]:
# feature transformation

def filter_authors(authors: List[int], prolifics=True):
    """
    Select one of the two author groups from a list of author ids.

    Ids below 100 are treated as prolific authors and ids of 100 or more
    as coauthors. With ``prolifics`` true the prolific ids are returned,
    otherwise the coauthor ids; the original order is kept.
    """
    if prolifics:
        return [author_id for author_id in authors if author_id < 100]
    return [author_id for author_id in authors if author_id >= 100]

**Training - Validation Split**

In [10]:
def split_train_data(author: int, df: pd.DataFrame, random_state=RANDOM_STATE):
    """
    Perform the training - validation split for a given author.

    Each row is labelled 1 if ``author`` appears in its "target authors"
    entry and 0 otherwise; the rows are then split 80/20 with
    ``train_test_split``.

    WARNING:
    The split is not stratified, so nothing here guarantees that both sets
    contain positive and negative instances. There is also a massive class
    imbalance. We may need to address this by splitting and sampling
    manually so as to reduce the number of negative instances.

    Args:
        author: id of the author to build the binary label for.
        df: data frame with a "target authors" column of author id lists.
        random_state: seed forwarded to ``train_test_split``.

    Returns:
        ``X_train, X_val, y_train, y_val`` as returned by
        ``train_test_split``. ``df`` itself is not modified.
    """
    # Build the label as a standalone Series so that no "label" column is
    # written into the caller's frame.
    y = (
        df["target authors"]
        .apply(lambda targets: 1 if author in targets else 0)
        .rename("label")
    )
    # NOTE(review): X still contains "target authors", the column the label
    # is derived from - exclude it before fitting a model on X.
    X = df.drop(columns=["label"], errors="ignore")

    # Forward the caller's seed (previously the global constant was always
    # used and the parameter was ignored).
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )
    return X_train, X_val, y_train, y_val

In [8]:
# Load the raw training set and run the training-mode preprocessing on it.
path = "../data/train.json"
df = load_data_set(path)
df = preprocess(df)

loaded 25793 instances


In [9]:
# Preview the first rows of the preprocessed frame.
df.head()

Unnamed: 0,target authors,text
0,"[42, 36]","[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1..."
1,[45],"[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3..."
3,[97],"[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5..."
4,[2],"[38, 1592, 2088, 1543, 1574, 1727, 1597, 1813,..."
9,"[44, 2]","[1560, 1694, 11, 1546, 11, 3066, 1728, 47, 160..."


**Model Training**

In [None]:
def evaluate_classifier(author: int, df: pd.DataFrame):
    """
    Fit a logistic regression for one author and return its validation F1.

    The data is split with ``split_train_data``, a ``LogisticRegression``
    is fitted on the training part, and its predictions on the validation
    part are scored with ``f1_score``.

    Args:
        author: id of the author to train the binary classifier for.
        df: preprocessed data frame accepted by ``split_train_data``.

    Returns:
        The F1 score of the predictions against the validation labels.
    """
    # Imported here because the notebook's import cell does not bring
    # LogisticRegression into scope.
    from sklearn.linear_model import LogisticRegression

    # split training and validation - the split uses a fixed random state
    # by default, for reproducibility
    X_train, X_val, y_train, y_val = split_train_data(author, df)

    # fit the model
    # NOTE(review): the frame previewed in this notebook holds lists of ints
    # per cell; it presumably has to be vectorised before this fit can work
    # - confirm.
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)

    # validate the model: score the predictions (not the training labels)
    # against the held-out labels
    y_pred = clf.predict(X_val)
    return f1_score(y_val, y_pred)
    

In [24]:
# Run the train/validation split once for each author id 0..99.
# The returned splits are overwritten on every iteration and are not used
# any further inside this cell.
authors = np.arange(0,100)
for author in tqdm(authors):
    X_train, X_val, y_train, y_val = split_train_data(author, df)

100%|██████████| 100/100 [00:00<00:00, 190.79it/s]
