The first model builds classifiers for each of the prolific authors with `id` equal to `0,...,99` by using a 'vectorised' representation of the text. 

This second model builds the feature space further, by using venue as a secondary feature. 

We also formalise a training / validation split between our data sets.

The idea behind this model is to capture the vocabularly of each author, and the resulting high dimensional feature space should result in near-linear separability. 

In [1]:
import json
import csv
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC 
from typing import List

RANDOM_STATE = 69

In [2]:
import warnings
warnings.simplefilter(action='ignore')
import xgboost as xgb


In [3]:
#

In [4]:
def load_data_set(path: str):
    """
    loads data set located at path and returns as pandas data frame
    """
    with open(path) as file:
        data = json.load(file)
    
    print(f"loaded {len(data)} instances")
    data = pd.json_normalize(data)
    return data

In [5]:
# preprocessing

def pre_processing(df: pd.DataFrame, train=True):
    """
    performs initial preprocessing to base data frame
    drop_blanks: drop instances with no target authors. reduces training set by ~60%
    """
    # preprocessing for authors
    if train:
        df["target authors"] = df["authors"].apply(lambda x: filter_authors(x))
        df["coauthors"]      = df["authors"].apply(lambda x: filter_authors(x, prolifics=False))
        df["has target"]     = df["target authors"].apply(lambda x: len(x)>0)
        df = df[df["has target"] == True]
        df = df.drop(["authors", "has target"], axis=1)
    
    # preprocessing for text - expend text out over separate columns
    df["abstract"] = df["abstract"].apply(lambda x: text_to_vector(x))
    df["title"]    = df["title"].apply(lambda x: text_to_vector(x))
    df["text"]     = df["title"] + df["abstract"]
    text_df = pd.DataFrame(df.text.tolist(), index=df.index, columns=[str(i) for i in range(5000)])
    
    # preprocessing for venue. We use minmax scaling as a matter of best-practice. 
    # as we require all rows to have integer values, we give blank venues a dummy value of 465
    scalar = MinMaxScaler()
    df.loc[df.venue == "", "venue"] = 465
    df["venue"] = scalar.fit_transform(df["venue"].to_numpy().reshape(-1, 1))
    
    # prepocessing for coauthors
    # we use a discretised binning strategy, with n=10 bins by default. 
    df["coauthors"] = df["coauthors"].apply(lambda x: build_bins(x, n_bins=10))
    coauth_df = pd.DataFrame(df.coauthors.tolist(), index=df.index, columns=["bin "+str(i) for i in range(10)])
    
    # dropping irrelivent columns & concat with 5000-column text_df
    df = df.drop(["abstract", "title", "text", "year", "coauthors"], axis=1)
    df = pd.concat([df, text_df, coauth_df], axis=1)
    
    # and drop row identifier if test set
    if not train:
        df = df.drop(["identifier"], axis=1)

    return df

In [6]:
# Feature transformations

def filter_authors(authors: List[int], prolifics=True):
    """
    filters authors between prolific and coauthors
    """
    if prolifics:
        prolifics = filter(lambda x: x < 100, authors)
        return list(prolifics)
    else:
        coauthors = filter(lambda x: x>=100, authors)
        return list(coauthors)
    
    
def text_to_vector(text: List[int]):
    """
    Converts text to sparse matrix representation
    text: List of integers between 1, 4999
    """
    word_vec = np.zeros(5000, dtype=int)
    for word in text:
        word_vec[word] += 1
    return word_vec


def build_bins(coauthors: List[int], n_bins=10):
    """
    takes a list of coauthors and returns 10-column data frame
    
    This might be some of the uggliest code I have ever written, though
    sklearn's discrete bins didn't really give what I wanted
    """
    width = np.ceil(21246/n_bins)
    bins  = np.zeros(n_bins)
    for author in coauthors:
        i = 0
        while not (max(0,(i-1))*width <= author <= i*width):
            i += 1
        bins[i-1] += 1
    return bins

In [7]:
path = "train.json"
df = load_data_set(path)
df = pre_processing(df)
df.head()

**Model Validation**

In [None]:
# resampling techiques to address label imbalance

def upsample_training(X_train, y_train):
    """
    upsamples the minority class until class balance is achieved
    """
    X = pd.concat([X_train, y_train], axis=1)
    
    
    pos = X[X["label"] == 1]
    neg = X[X["label"] == 0]
    
    pos_upsample = resample(pos, replace=True, n_samples=len(neg), random_state=RANDOM_STATE)
    
    resampled = pd.concat([neg, pos_upsample])

    y_train = resampled["label"]
    X_train = resampled.drop(["label"], axis=1)
    return X_train, y_train


def downsample_training(X_train, y_train):
    """
    downasamples majority class until class balance is achieved 
    """
    X = pd.concat([X_train, y_train], axis=1)
    
    
    pos = X[X["label"] == 1]
    neg = X[X["label"] == 0]
    
    neg_downsample = resample(neg, replace=True, n_samples=len(pos), random_state=RANDOM_STATE)
    
    resampled = pd.concat([pos, neg_downsample])

    y_train = resampled["label"]
    X_train = resampled.drop(["label"], axis=1)
    return X_train, y_train


def resample_training(X_train, y_train):
    """
    resamples class imbalance using SMOTE: 
    https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html
    """
    sm = SMOTE(random_state=RANDOM_STATE)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    return X_train, y_train
    
    

In [None]:
def build_evaluate_classifier(author: int, df:pd.DataFrame):
    
    
    # take copy and prepare label
    df = df.copy(deep=True)
    df["label"] = df["target authors"].apply(lambda x: 1 if author in x else 0)
    X = df.drop(["label", "target authors"], axis=1)
    y = df["label"]
    # split training and validation - we have fixed random state for reproducability
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)
    
    # upsample to deal with class imbalance
    X_train, y_train = resample_training(X_train, y_train)
    
    # fit to model
    #clf = LogisticRegression(max_iter=1000)
    #clf = LinearSVC(max_iter=1000)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_train)#, label=y_train)
    #param
    param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic' }
    num_round=2
    bst = xgb.train(param, dtrain, num_round)
    #clf.fit(X_train, y_train)
    
    # validatite model
    y_pred = bst.predict(dtest)
    #y_pred = clf.predict(X_val)
    print(y_pred)
    f1 = f1_score(y_pred, y_val)
    precision = precision_score(y_pred, y_val)
    recall = recall_score(y_pred, y_val)
    return f1, precision, recall

In [None]:
def validate_to_csv(df: pd.DataFrame):
    """
    As we are building 100 classifiers, printing f1 scores within a notebook is impractical. 
    following function writes results to csv. 
    """
    
    with open("validation.csv", mode='w') as f:    
        writer = csv.writer(f)
        
        header = ['Author Id','F1 score']
        writer.writerow(header)
        
        # loop over each author, build classifier and write to output
        authors = np.arange(100)
        
        for author in tqdm(authors):
            f1, precision, recall = build_evaluate_classifier(author, df)
            writer.writerow([author, f1, precision, recall])
    return

In [None]:
def validate_to_csv(df: pd.DataFrame):
    """
    As we are building 100 classifiers, printing f1 scores within a notebook is impractical. 
    following function writes results to csv. 
    """
    
    with open("Model 2 validation - resample.csv", mode='w') as f:    
        writer = csv.writer(f)
        
        header = ['Author Id','F1 score', 'Precision', 'Recall']
        writer.writerow(header)
        
        # loop over each author, build classifier and write to output
        authors = np.arange(100)
        avg_f1, avg_recall, avg_precision = 0, 0, 0
        
        for author in tqdm(authors):
            f1, precision, recall = build_evaluate_classifier(author, df)
            writer.writerow([author, f1, precision, recall])
            avg_f1 += f1
            avg_precision += precision
            avg_recall += recall 
            
    print(f"average f1:        {avg_f1/100}")
    print(f"average recall:    {avg_recall/100}")
    print(f"average precision: {avg_precision/100}")
    return

In [None]:
# perform model validation checking
validate_to_csv(df)

  0%|          | 0/100 [00:00<?, ?it/s]



  0%|          | 0/100 [00:07<?, ?it/s]


ValueError: Found input variables with inconsistent numbers of samples: [10272, 2238]

**Training**

In [None]:
# training

def train_classifier(author: int, df: pd.DataFrame, debug=False):
    """
    Trains a classifier for author i. Assumes text-vectorisaiton has occured.
    
    Model Features:
    text vectorisation
    """
    df = df.copy(deep=True)
    df["label"] = df["target authors"].apply(lambda x: 1 if author in x else 0)
    X_train = df.drop(["label", "target authors"], axis=1)
    y_train = df["label"]
    
    # upsample to deal with class imbalance
    X_train, y_train = upsample_training(X_train, y_train)
    
    # fit to model
    #clf = LogisticRegression(max_iter=1000)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_train)#, label=y_train)
    #param
    param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic' }
    num_round=2
    bst = xgb.train(param, dtrain, num_round)
    #clf.fit(X_train, y_train)
    
    # validatite model
    #y_pred = bst.predict(dtest)
    #clf = LinearSVC(max_iter=1000) #LogisticRegression(max_iter=1000)
    #clf.fit(X_train, y_train)
    

    return bst

In [None]:
path = "train.json"
df = load_data_set(path)
df = pre_processing(df)

loaded 25793 instances


In [None]:
df.head()

Unnamed: 0,venue,target authors,0,1,2,3,4,5,6,7,...,bin 0,bin 1,bin 2,bin 3,bin 4,bin 5,bin 6,bin 7,bin 8,bin 9
0,0.043011,"[42, 36]",0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.004301,[45],0,0,0,0,1,0,2,2,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.008602,[97],0,0,0,0,0,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.019355,[2],0,0,0,0,0,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,0.0,"[44, 2]",0,0,0,0,0,0,0,0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# build classifiers for each author
authors = np.arange(0, 100)
models  = []
for i in tqdm(authors):
    model = train_classifier(i, df)
    models.append(model)

100%|██████████| 100/100 [10:59<00:00,  6.60s/it]


**Build Predictions**

In [None]:
# load in test data
path = "test.json"
df_test = load_data_set(path)
df_test = pre_processing(df_test, train=False)

loaded 800 instances


In [None]:
df_test.head()

Unnamed: 0,venue,0,1,2,3,4,5,6,7,8,...,bin 0,bin 1,bin 2,bin 3,bin 4,bin 5,bin 6,bin 7,bin 8,bin 9
0,0.47957,0,0,0,0,0,0,1,1,0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.47957,0,0,0,0,0,0,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.015054,0,0,0,0,0,0,1,1,0,...,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.045161,0,0,0,0,0,0,1,1,0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,1.0,0,0,0,0,0,0,0,0,0,...,0.0,4.0,0.0,1.0,4.0,4.0,2.0,4.0,2.0,2.0


In [None]:
def make_predictions(test_df: pd.DataFrame):
    """
    function for writing predictions to output file. 
    WARNING: Deletes predictions.csv if present in working directory
    """
    if os.path.exists("predictions.csv"):
        os.remove("predictions.csv")
        print("removed previous predictions")
    
    
    with open("predictions.csv", mode='w') as f:    
        writer = csv.writer(f)
        
        header = ['Id','Predict']
        writer.writerow(header)
        
        
        X_test = test_df
        n      = X_test.shape[0]
        
        # loop over each training sample and write to necessary format
        for Id in tqdm(range(n)):
            x   = xgb.DMatrix(np.array(X_test.iloc[Id]).reshape(1, -1))

            authors = ""
            for author, model in enumerate(models):
                if np.array(model.predict(x)).item() == 1:
                    authors += str(author) + " "

            # to match the output requirement 
            if len(authors) == 0: row = [Id, -1]
            else: row = [Id, authors]

            writer.writerow(row)
    return

In [None]:
make_predictions(df_test)

removed previous predictions


100%|██████████| 800/800 [00:08<00:00, 90.83it/s]
