This first model buidls classifiers for each of the prolific authors with `id` equal to `0,...,99` by using a 'vectorised' representation of the text. 

The idea behind this model is to capture the vocabularly of each author, and the resulting high dimensional feature space should result in near-linear separability. 

In [1]:
import json
import csv
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from typing import List

In [2]:
def load_data_set(path: str):
    """
    loads data set located at path and returns as pandas data frame
    """
    with open(path) as file:
        data = json.load(file)
    
    print(f"loaded {len(data)} instances")
    data = pd.json_normalize(data)
    return data

In [3]:
# preprocessing

def pre_processing(df: pd.DataFrame, train=True):
    """
    performs initial preprocessing to base data frame
    """
    if train:
        # preprocessing for authors
        df["target authors"] = df["authors"].apply(lambda x: filter_authors(x))
        df["coauthors"]      = df["authors"].apply(lambda x: filter_authors(x, prolifics=False))
        df = df.drop(["authors"], axis=1)
    
    # preprocessing for text
    df["abstract"] = df["abstract"].apply(lambda x: text_to_vector(x))
    df["title"]    = df["title"].apply(lambda x: text_to_vector(x))
    df["text"]     = df["title"] + df["abstract"]
    
    # dropping irrelivent columns
    df = df.drop(["abstract", "title"], axis=1)
    return df



In [4]:
# Feature transformations

def filter_authors(authors: List[int], prolifics=True):
    if prolifics:
        prolifics = filter(lambda x: x < 100, authors)
        return list(prolifics)
    else:
        coauthors = filter(lambda x: x>=100, authors)
        return list(coauthors)
    
    
def text_to_vector(text: List[int]):
    """
    Converts text to sparse matrix representation
    text: List of integers between 1, 4999
    """
    word_vec = np.zeros(5000, dtype=int)
    for word in text:
        word_vec[word] += 1
    return word_vec

In [5]:
# training

def train_classifier(author: int, df: pd.DataFrame, debug=False):
    """
    Trains a classifier for author i. Assumes text-vectorisaiton has occured.
    
    Model Features:
    text vectorisation
    """
    # create copy and set up label
    df = df.copy(deep=True)
    df["label"] = df["target authors"].apply(lambda x: 1 if author in x else 0)
    
    # split up positive and negative instances so as to ensure a balanced training set 
    # if we don't do this, we end up with a very imbalanced training set 
    # however, if we don't include enough negative samples, we tend to "overclassify". 
    # we can tune out performance with the below 'neg sample factor'
    
    neg_sample_factor = 10
    
    pos = df[df['label'] == 1] 
    neg = df[df['label'] == 0]
    
    n_pos_samples = pos.shape[0]
    n_tot_samples = df.shape[0]
    
    # takes a sample of the negative instances to train on
    neg = neg.sample(frac=neg_sample_factor*(n_pos_samples/n_tot_samples)) 
    
    if debug:
        print(f"training on {pos.shape[0]} postitive instances")
        print(f"training on {neg.shape[0]} negative  instances")
    
    df = pd.concat([pos, neg])
    X_train = pd.DataFrame(df.text.tolist(), index= df.index)
    y_train = df["label"]
    
    if debug:
        print(f"training on {X_train.shape[0]} instances")
    
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    
    if debug:
        y_train_pred = clf.predict(X_train) 
        acc = accuracy_score(y_train, y_train_pred) 
        f1  = f1_score(y_train, y_train_pred)
        print(f"Accuracy: {acc}")
        print(f"f1 score: {f1}")

    return clf

**Training**

In [6]:
path = "../data/train.json"
df = load_data_set(path)
df = pre_processing(df)

loaded 25793 instances


In [7]:
author = 42
model  = train_classifier(author, df)

In [8]:
authors = np.arange(0, 100)
models  = []
for i in tqdm(authors):
    model = train_classifier(i, df)
    models.append(model)

100%|██████████| 100/100 [27:58<00:00, 16.79s/it]  


**Model Validation**

In [9]:
def validate_model(author: int, df: pd.DataFrame, classifier):
    # simple function to assess model performance
    
    # create copy and set up label
    df = df.copy(deep=True)
    df["label"] = df["target authors"].apply(lambda x: 1 if author in x else 0)
    
    # split up positive and negative instances so as to ensure a balanced training set 
    # if we don't do this, we end up with a very imbalanced training set 
    pos = df[df['label'] == 1] 
    neg = df[df['label'] == 0]
    
    # takes a sample of the instances to test on
    pos = pos.sample(frac=(1/2))
    neg = neg.sample(frac=(1/10))
    
    # recombine 
    df = pd.concat([pos, neg])
    X_test = pd.DataFrame(df.text.tolist(), index= df.index)
    y_test = df["label"]
    
    # perform predictions 
    y_pred = classifier.predict(X_test)
    
    acc = accuracy_score(y_pred, y_test) 
    f1  = f1_score(y_pred, y_test) 
    print(f"Accuracy: {acc}")
    print(f"f1 score: {f1}")
    return

**Build Predictions**

In [10]:
path = "../data/test.json"
df_test = load_data_set(path)
df_test = pre_processing(df_test, train=False)

loaded 800 instances


In [22]:
def make_predictions(test_df: pd.DataFrame):
    """
    function for writing predictions to output file. 
    WARNING: Deletes predictions.csv if present in working directory
    """
    if os.path.exists("predictions.csv"):
        os.remove("predictions.csv")
        print("removed previous predictions")
    
    
    with open("predictions.csv", mode='w') as f:    
        writer = csv.writer(f)
        
        header = ['Id','Predicted']
        writer.writerow(header)
        
        
        X_test = pd.DataFrame(test_df.text.tolist(), index=test_df.index)
        n      = X_test.shape[0]
        
        # loop over each training sample and write to necessary format
        for Id in tqdm(range(n)):
            x   = np.array(X_test.iloc[Id]).reshape(1, -1)
            row = [Id]
            authors = []
            for author, model in enumerate(models):
                if np.array(model.predict(x)).item() == 1:
                    authors.append(author)

            # to match the output requirement 
            if len(authors) == 0: row.append(-1)
            else: row += authors
            
            writer.writerow(row)
    return

In [23]:
make_predictions(df_test)

removed previous predictions


100%|██████████| 800/800 [00:02<00:00, 279.45it/s]
