The first model builds classifiers for each of the prolific authors with `id` equal to `0,...,99` by using a 'vectorised' representation of the text. 

This second model builds the feature space further, by using venue as a secondary feature. 

We also formalise a training / validation split between our data sets.

The idea behind this model is to capture the vocabularly of each author, and the resulting high dimensional feature space should result in near-linear separability. 

In [1]:
import json
import csv
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from typing import List

RANDOM_STATE = 69

In [2]:
def load_data_set(path: str):
    """
    loads data set located at path and returns as pandas data frame
    """
    with open(path) as file:
        data = json.load(file)
    
    print(f"loaded {len(data)} instances")
    data = pd.json_normalize(data)
    return data

In [3]:
# preprocessing

def pre_processing(df: pd.DataFrame, train=True):
    """
    performs initial preprocessing to base data frame
    drop_blanks: drop instances with no target authors. reduces training set by ~60%
    """
    # preprocessing for authors
    if train:
        df["target authors"] = df["authors"].apply(lambda x: filter_authors(x))
        df["coauthors_lst"]      = df["authors"].apply(lambda x: filter_authors(x, prolifics=False))
        df["has target"]     = df["target authors"].apply(lambda x: len(x)>0)
        df = df[df["has target"] == True]
        df = df.drop(["authors", "has target"], axis=1)
    
    elif train == False:
        df["coauthors_lst"]  = df["coauthors"].apply(lambda x: filter_authors(x, prolifics=False))
        df = df.drop(["coauthors"], axis=1)
    
    # preprocessing for text - replace the text with the word count. 
    df["text"]     = df["title"].apply(lambda x: len(x)) + df["abstract"].apply(lambda x: len(x))
    #text_df = pd.DataFrame(df.text.tolist(), index=df.index, columns=[str(i) for i in range(5000)])
    
    # preprocessing for venue. We use minmax scaling as a matter of best-practice. 
    # as we require all rows to have integer values, we give blank venues a dummy value of 465
    scalar = MinMaxScaler()
    df.loc[df.venue == "", "venue"] = 465
    df["venue"] = scalar.fit_transform(df["venue"].to_numpy().reshape(-1, 1))
    
    # prepocessing for coauthors
    # we use a discretised binning strategy, with n=10 bins by default. 
    df["coauthors"] = df["coauthors_lst"].apply(lambda x: len(x))

    #coauth_df = pd.DataFrame(df.coauthors.tolist(), index=df.index, columns=["bin "+str(i) for i in range(10)])
    
    # dropping irrelivent columns & concat with 5000-column text_df
    df = df.drop(["abstract", "title", "coauthors_lst"], axis=1)
    
    # and drop row identifier if test set
    if not train:
        df = df.drop(["identifier"], axis=1)

    return df

In [4]:
# Feature transformations

def filter_authors(authors: List[int], prolifics=True):
    """
    filters authors between prolific and coauthors
    """
    if prolifics:
        prolifics = filter(lambda x: x < 100, authors)
        return list(prolifics)
    else:
        coauthors = filter(lambda x: x>=100, authors)
        return list(coauthors)

In [5]:
path = "train.json"
df = load_data_set(path)
df = pre_processing(df)
df.head()

loaded 25793 instances


Unnamed: 0,year,venue,target authors,text,coauthors
0,9,0.043011,"[42, 36]",103,1
1,15,0.004301,[45],123,2
3,10,0.008602,[97],119,0
4,10,0.019355,[2],148,1
9,18,0.0,"[44, 2]",145,2


In [6]:
df.head()

Unnamed: 0,year,venue,target authors,text,coauthors
0,9,0.043011,"[42, 36]",103,1
1,15,0.004301,[45],123,2
3,10,0.008602,[97],119,0
4,10,0.019355,[2],148,1
9,18,0.0,"[44, 2]",145,2


**Model Validation**

In [7]:
# resampling techiques to address label imbalance

def upsample_training(X_train, y_train):
    """
    upsamples the minority class until class balance is achieved
    """
    X = pd.concat([X_train, y_train], axis=1)
    
    
    pos = X[X["label"] == 1]
    neg = X[X["label"] == 0]
    
    pos_upsample = resample(pos, replace=True, n_samples=len(neg), random_state=RANDOM_STATE)
    
    resampled = pd.concat([neg, pos_upsample])

    y_train = resampled["label"]
    X_train = resampled.drop(["label"], axis=1)
    return X_train, y_train


def downsample_training(X_train, y_train):
    """
    downasamples majority class until class balance is achieved 
    """
    X = pd.concat([X_train, y_train], axis=1)
    
    
    pos = X[X["label"] == 1]
    neg = X[X["label"] == 0]
    
    neg_downsample = resample(neg, replace=True, n_samples=len(pos), random_state=RANDOM_STATE)
    
    resampled = pd.concat([pos, neg_downsample])

    y_train = resampled["label"]
    X_train = resampled.drop(["label"], axis=1)
    return X_train, y_train


def resample_training(X_train, y_train):
    """
    resamples class imbalance using SMOTE: 
    https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html
    """
    sm = SMOTE(random_state=RANDOM_STATE)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    return X_train, y_train
    
    

In [8]:
def build_evaluate_classifier(author: int, df:pd.DataFrame):
    
    
    # take copy and prepare label
    df = df.copy(deep=True)
    df["label"] = df["target authors"].apply(lambda x: 1 if author in x else 0)
    X = df.drop(["label", "target authors"], axis=1)
    y = df["label"]
    # split training and validation - we have fixed random state for reproducability
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)
    
    # upsample to deal with class imbalance
    X_train, y_train = resample_training(X_train, y_train)
    
    # fit to model
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    
    # validatite model
    y_pred = clf.predict(X_val)
    f1 = f1_score(y_pred, y_val)
    precision = precision_score(y_pred, y_val)
    recall = recall_score(y_pred, y_val)
    return f1, precision, recall

In [9]:
def validate_to_csv(df: pd.DataFrame):
    """
    As we are building 100 classifiers, printing f1 scores within a notebook is impractical. 
    following function writes results to csv. 
    """
    
    with open("Model2NoText.csv", mode='w') as f:    
        writer = csv.writer(f)
        
        header = ['Author Id','F1 score', 'Precision', 'Recall']
        writer.writerow(header)
        
        # loop over each author, build classifier and write to output
        authors = np.arange(100)
        avg_f1, avg_recall, avg_precision = 0, 0, 0
        
        for author in tqdm(authors):
            f1, precision, recall = build_evaluate_classifier(author, df)
            writer.writerow([author, f1])
            avg_f1 += f1
            avg_precision += precision
            avg_recall += recall 
        
        writer.writerow(["average", avg_f1/100])
            
    print(f"average f1:        {avg_f1/100}")
    print(f"average recall:    {avg_recall/100}")
    print(f"average precision: {avg_precision/100}")
    return

In [10]:
# perform model validation checking
validate_to_csv(df)

100%|██████████| 100/100 [00:04<00:00, 22.73it/s]

average f1:        0.03627721551660119
average recall:    0.01882055524054555
average precision: 0.5967826391204196





In [11]:
df.head()


Unnamed: 0,year,venue,target authors,text,coauthors
0,9,0.043011,"[42, 36]",103,1
1,15,0.004301,[45],123,2
3,10,0.008602,[97],119,0
4,10,0.019355,[2],148,1
9,18,0.0,"[44, 2]",145,2


In [12]:
df.dtypes

year                int64
venue             float64
target authors     object
text                int64
coauthors           int64
dtype: object

**Training**

In [13]:
# training

def train_classifier(author: int, df: pd.DataFrame, debug=False):
    """
    Trains a classifier for author i. Assumes text-vectorisaiton has occured.
    
    Model Features:
    text vectorisation
    """
    df = df.copy(deep=True)
    df["label"] = df["target authors"].apply(lambda x: 1 if author in x else 0)
    X_train = df.drop(["label", "target authors"], axis=1)
    y_train = df["label"]
    
    # upsample to deal with class imbalance
    X_train, y_train = upsample_training(X_train, y_train)
    
    # fit to model
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    

    return clf

In [14]:
df.head()

Unnamed: 0,year,venue,target authors,text,coauthors
0,9,0.043011,"[42, 36]",103,1
1,15,0.004301,[45],123,2
3,10,0.008602,[97],119,0
4,10,0.019355,[2],148,1
9,18,0.0,"[44, 2]",145,2


In [15]:
# build classifiers for each author
authors = np.arange(0, 100)
models  = []
for i in tqdm(authors):
    model = train_classifier(i, df)
    models.append(model)

100%|██████████| 100/100 [00:04<00:00, 21.04it/s]


**Build Predictions**

In [16]:
# load in test data
path = "test.json"
df_test = load_data_set(path)
df_test = pre_processing(df_test, train=False)

loaded 800 instances


In [17]:
df_test.head()

Unnamed: 0,year,venue,text,coauthors
0,19,0.47957,177,4
1,19,0.47957,283,2
2,19,0.015054,246,6
3,19,0.045161,104,4
4,19,1.0,186,23


In [18]:
def make_predictions(test_df: pd.DataFrame):
    """
    function for writing predictions to output file. 
    WARNING: Deletes predictions.csv if present in working directory
    """
    if os.path.exists("predictionsNoText.csv"):
        os.remove("predictionsNoText.csv")
        print("removed previous predictions")
    
    
    with open("predictionsNoText.csv", mode='w') as f:    
        writer = csv.writer(f)
        
        header = ['Id','Predict']
        writer.writerow(header)
        
        
        X_test = test_df
        n      = X_test.shape[0]
        
        # loop over each training sample and write to necessary format
        for Id in tqdm(range(n)):
            x   = test_df.iloc[Id].to_frame().T
            authors = ""
            for author, model in enumerate(models):
                if np.array(model.predict(x)).item() == 1:
                    authors += str(author) + " "

            # to match the output requirement 
            if len(authors) == 0: row = [Id, -1]
            else: row = [Id, authors]

            writer.writerow(row)
    return

In [19]:
make_predictions(df_test)

removed previous predictions


100%|██████████| 800/800 [00:53<00:00, 14.84it/s]
