The first model builds classifiers for each of the prolific authors with `id` equal to `0,...,99` by using a 'vectorised' representation of the text. 

This second model builds the feature space further, by using venue as a secondary feature. 

We also formalise a training / validation split between our data sets.

The idea behind this model is to capture the vocabularly of each author, and the resulting high dimensional feature space should result in near-linear separability. 

In [1]:
import json
import csv
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from typing import List

In [2]:
def load_data_set(path: str):
    """
    loads data set located at path and returns as pandas data frame
    """
    with open(path) as file:
        data = json.load(file)
    
    print(f"loaded {len(data)} instances")
    data = pd.json_normalize(data)
    return data

In [3]:
# preprocessing

def pre_processing(df: pd.DataFrame, train=True):
    """
    performs initial preprocessing to base data frame
    """
    # preprocessing for authors
    if train:
        df["target authors"] = df["authors"].apply(lambda x: filter_authors(x))
        df["coauthors"]      = df["authors"].apply(lambda x: filter_authors(x, prolifics=False))
        df = df.drop(["authors"], axis=1)
    
    # preprocessing for text - expend text out over separate columns
    df["abstract"] = df["abstract"].apply(lambda x: text_to_vector(x))
    df["title"]    = df["title"].apply(lambda x: text_to_vector(x))
    df["text"]     = df["title"] + df["abstract"]
    text_df = pd.DataFrame(df.text.tolist(), index=df.index)
    
    # preprocessing for venue. We use minmax scaling as a matter of best-practice. 
    # as we require all rows to have integer values, we give blank venues a dummy value of 465
    scalar = MinMaxScaler()
    df.loc[df.venue == "", "venue"] = 465
    df["venue"] = scalar.fit_transform(df["venue"].to_numpy().reshape(-1, 1))
    
    # prepocessing for coauthors
    # we use a discretised binning strategy, with n=10 bins by default. 
    df["coauthors"] = df["coauthors"].apply(lambda x: build_bins(x, n_bins=10))
    coauth_df = pd.DataFrame(df.coauthors.tolist(), index=df.index)
    
    # dropping irrelivent columns & concat with 5000-column text_df
    df = df.drop(["abstract", "title", "text", "year", "coauthors"], axis=1)
    df = pd.concat([df, text_df, coauth_df], axis=1)
    
    # and drop row identifier if test set
    if not train:
        df = df.drop(["identifier"], axis=1)

    return df

In [4]:
# Feature transformations

def filter_authors(authors: List[int], prolifics=True):
    """
    filters authors between prolific and coauthors
    """
    if prolifics:
        prolifics = filter(lambda x: x < 100, authors)
        return list(prolifics)
    else:
        coauthors = filter(lambda x: x>=100, authors)
        return list(coauthors)
    
    
def text_to_vector(text: List[int]):
    """
    Converts text to sparse matrix representation
    text: List of integers between 1, 4999
    """
    word_vec = np.zeros(5000, dtype=int)
    for word in text:
        word_vec[word] += 1
    return word_vec


def build_bins(coauthors: List[int], n_bins=10):
    """
    takes a list of coauthors and returns 10-column data frame
    
    This might be some of the uggliest code I have ever written, though
    sklearn's discrete bins didn't really give what I wanted
    """
    width = np.ceil(21246/n_bins)
    bins  = np.zeros(n_bins)
    for author in coauthors:
        i = 0
        while not (max(0,(i-1))*width <= author <= i*width):
            i += 1
        bins[i-1] += 1
    return bins

In [5]:
# training

def train_classifier(author: int, df: pd.DataFrame, debug=False):
    """
    Trains a classifier for author i. Assumes text-vectorisaiton has occured.
    
    Model Features:
    text vectorisation
    """
    # create copy and set up label
    df = df.copy(deep=True)
    df["label"] = df["target authors"].apply(lambda x: 1 if author in x else 0)
    df = df.drop(["target authors"], axis=1)
    
    # split up positive and negative instances so as to ensure a balanced training set 
    # if we don't do this, we end up with a very imbalanced training set 
    # however, if we don't include enough negative samples, we tend to "overclassify". 
    # we can tune out performance with the below 'neg sample factor'
    
    neg_sample_factor = 10
    
    pos = df[df['label'] == 1] 
    neg = df[df['label'] == 0]
    
    n_pos_samples = pos.shape[0]
    n_tot_samples = df.shape[0]
    
    # takes a sample of the negative instances to train on
    neg = neg.sample(frac=neg_sample_factor*(n_pos_samples/n_tot_samples)) 
    
    if debug:
        print(f"training on {pos.shape[0]} postitive instances")
        print(f"training on {neg.shape[0]} negative  instances")
    
    df = pd.concat([pos, neg])
    X_train = df.loc[:, df.columns != "label"]
    y_train = df["label"]
    
    if debug:
        print(f"training on {X_train.shape[0]} instances")
    
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    
    if debug:
        y_train_pred = clf.predict(X_train) 
        acc = accuracy_score(y_train, y_train_pred) 
        f1  = f1_score(y_train, y_train_pred)
        print(f"Accuracy: {acc}")
        print(f"f1 score: {f1}")

    return clf

**Training**

In [6]:
path = "../data/train.json"
df = load_data_set(path)
df = pre_processing(df)

loaded 25793 instances


In [7]:
df.head()

Unnamed: 0,venue,target authors,0,1,2,3,4,5,6,7,...,0.1,1.1,2.1,3.1,4.1,5.1,6.1,7.1,8,9
0,0.043011,"[42, 36]",0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.004301,[45],0,0,0,0,1,0,2,2,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,[],0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,0.008602,[97],0,0,0,0,0,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.019355,[2],0,0,0,0,0,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [8]:
authors = np.arange(0, 100)
models  = []
for i in tqdm(authors):
    model = train_classifier(i, df)
    models.append(model)

100%|██████████| 100/100 [05:01<00:00,  3.02s/it]


**Model Validation**

In [9]:
def validate_model(author: int, df: pd.DataFrame, classifier):
    # simple function to assess model performance
    
    # create copy and set up label
    df = df.copy(deep=True)
    df["label"] = df["target authors"].apply(lambda x: 1 if author in x else 0)
    
    # split up positive and negative instances so as to ensure a balanced training set 
    # if we don't do this, we end up with a very imbalanced training set 
    pos = df[df['label'] == 1] 
    neg = df[df['label'] == 0]
    
    # takes a sample of the instances to test on
    pos = pos.sample(frac=(1/2))
    neg = neg.sample(frac=(1/10))
    
    # recombine 
    df = pd.concat([pos, neg])
    X_test = pd.DataFrame(df.text.tolist(), index= df.index)
    y_test = df["label"]
    
    # perform predictions 
    y_pred = classifier.predict(X_test)
    
    acc = accuracy_score(y_pred, y_test) 
    f1  = f1_score(y_pred, y_test) 
    print(f"Accuracy: {acc}")
    print(f"f1 score: {f1}")
    return

**Build Predictions**

In [10]:
path = "../data/test.json"
df_test = load_data_set(path)
df_test = pre_processing(df_test, train=False)

loaded 800 instances


In [11]:
df_test.head()

Unnamed: 0,venue,0,1,2,3,4,5,6,7,8,...,0.1,1.1,2.1,3.1,4.1,5.1,6.1,7.1,8.1,9
0,0.47957,0,0,0,0,0,0,1,1,0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.47957,0,0,0,0,0,0,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.015054,0,0,0,0,0,0,1,1,0,...,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.045161,0,0,0,0,0,0,1,1,0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,1.0,0,0,0,0,0,0,0,0,0,...,0.0,4.0,0.0,1.0,4.0,4.0,2.0,4.0,2.0,2.0


In [14]:
def make_predictions(test_df: pd.DataFrame):
    """
    function for writing predictions to output file. 
    WARNING: Deletes predictions.csv if present in working directory
    """
    if os.path.exists("predictions.csv"):
        os.remove("predictions.csv")
        print("removed previous predictions")
    
    
    with open("predictions.csv", mode='w') as f:    
        writer = csv.writer(f)
        
        header = ['Id','Predicted']
        writer.writerow(header)
        
        
        X_test = test_df
        n      = X_test.shape[0]
        
        # loop over each training sample and write to necessary format
        for Id in tqdm(range(n)):
            x   = np.array(X_test.iloc[Id]).reshape(1, -1)
            row = [Id]
            authors = []
            for author, model in enumerate(models):
                if np.array(model.predict(x)).item() == 1:
                    authors.append(author)

            # to match the output requirement 
            if len(authors) == 0: row.append(-1)
            else: row += authors
            
            writer.writerow(row)
    return

In [15]:
make_predictions(df_test)

  2%|▏         | 13/800 [00:00<00:06, 122.69it/s]

removed previous predictions


100%|██████████| 800/800 [00:05<00:00, 149.94it/s]
