In [1]:
import json
import os
import csv
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from typing import List

NUM_WORDS = 5000
NUM_AUTHORS = 21246
MAX_LEN = 250
RANDOM_STATE = 42069

In [2]:
def load_data_set(path: str):
    """
    loads data set located at path and returns as pandas data frame
    """
    with open(path) as file:
        data = json.load(file)
    
    print(f"loaded {len(data)} instances")
    data = pd.json_normalize(data)
    return data

In [3]:
path = "../data/train.json"
train = load_data_set(path)
train.head()

loaded 25793 instances


Unnamed: 0,authors,year,abstract,venue,title
0,"[42, 13720, 36]",9,"[2455, 1858, 2335, 1543, 1800, 1860, 2000, 286...",20.0,"[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1..."
1,"[1359, 15881, 45]",15,"[40, 1542, 1691, 2449, 1535, 3616, 2206, 1904,...",2.0,"[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3..."
2,"[19166, 17763]",17,"[40, 1542, 1691, 2449, 1535, 2610, 1543, 1535,...",,"[2085, 1719, 1846, 1745, 2243, 1553, 1606, 159..."
3,[97],10,"[46, 1624, 1547, 56, 1687, 1644, 6, 7, 3386, 1...",4.0,"[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5..."
4,"[19617, 2]",10,"[37, 3709, 3836, 1586, 2151, 1727, 3021, 1860,...",9.0,"[38, 1592, 2088, 1543, 1574, 1727, 1597, 1813,..."


## Preprocessing

In [8]:
def preprocess(df: pd.DataFrame, train=True, drop_samples=False):
    
    df = df.copy(deep=True)
   
    if train:
        df["target authors"] = df["authors"].apply(lambda x: filter_authors(x))
        df["coauthors"]      = df["authors"].apply(lambda x: filter_authors(x, prolifics=False))
        df = df.drop(["authors"], axis=1)
    
    # drops samples containing no prolific authors, Reduces training set by ~60% to 7000 samples
    if drop_samples:
        df["has target"] = df["target authors"].apply(lambda x: len(x)>0)
        df = df[df["has target"] == True]
        df = df.drop(["has target"], axis=1)
        
    # text transormation
    # we stringify the list of int's to be used as inputs to the TF-IDF vectoriser
    df["text"] = df["title"] + df["abstract"]
    df["str text"] = df["text"].apply(lambda xs: ''.join(str(x)+' ' for x in xs))
    
    # preprocessing for venue. We use minmax scaling as a matter of best-practice. 
    # as we require all rows to have integer values, we give blank venues a dummy value of 465
    scalar = MinMaxScaler()
    df.loc[df.venue == "", "venue"] = 465
    df["venue"] = scalar.fit_transform(df["venue"].to_numpy().reshape(-1, 1))

    # drop
    df = df.drop(["abstract", "title", "year"], axis=1)
    return df

In [9]:
def filter_authors(authors: List[int], prolifics=True):
    """
    filters authors between prolific and coauthors
    """
    if prolifics:
        prolifics = filter(lambda x: x < 100, authors)
        return list(prolifics)
    else:
        coauthors = filter(lambda x: x>=100, authors)
        return list(coauthors)

In [16]:
df = preprocess(train, train=True, drop_samples=True)
df.head()

Unnamed: 0,venue,target authors,coauthors,text,str text
0,0.043011,"[42, 36]",[13720],"[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1...",41 1550 1563 1594 1544 1919 1644 37 1539 1715 ...
1,0.004301,[45],"[1359, 15881]","[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3...",1731 47 11 57 4624 1525 1535 47 11 3522 2223 1...
3,0.008602,[97],[],"[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5...",40 1733 1735 1540 1655 46 1624 1547 56 1687 16...
4,0.019355,[2],[19617],"[38, 1592, 2088, 1543, 1574, 1727, 1597, 1813,...",38 1592 2088 1543 1574 1727 1597 1813 1926 152...
9,0.0,"[44, 2]","[9641, 5623]","[1560, 1694, 11, 1546, 11, 3066, 1728, 47, 160...",1560 1694 11 1546 11 3066 1728 47 1603 1553 11...


## TF-IDF Preprocessing

TF-IDF is a nlp preprocessing method to map text input to a vector of reals. TF-IDF is an improvement upon previous feature engineering that we have performed as it adjusts the value of each word, relative to how freuently it occurs. Stop words such as *the, it, how* have relatively low weightings, so the resulting vector only captures the most *important* words within the input. This (typically) leads to TF-IDF representations outperforming word-count representations for most tasks

We use `sklearn`'s feature extraction to automate this process. 

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf   = TfidfVectorizer()
vectors = tfidf.fit_transform(df["str text"])

In [13]:
feature_names = tfidf.get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()
df_text = pd.DataFrame(denselist, columns=feature_names)
df_text.head()

Unnamed: 0,10,100,1005,1006,1007,1009,101,1014,1016,1022,...,962,965,968,970,973,977,98,980,987,998
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.090374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.018773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.130861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
df_auth_venue = df[["target authors", "venue", "coauthors"]]
df_full       = pd.concat([df_text.reset_index(drop=True), df_auth_venue.reset_index(drop=True)], axis=1)
df_full.head()

Unnamed: 0,10,100,1005,1006,1007,1009,101,1014,1016,1022,...,970,973,977,98,980,987,998,target authors,venue,coauthors
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[42, 36]",0.043011,[13720]
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[45],0.004301,"[1359, 15881]"
2,0.090374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[97],0.008602,[]
3,0.018773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[2],0.019355,[19617]
4,0.130861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[44, 2]",0.0,"[9641, 5623]"


## Author Preprocessing

We previously used a binning strategy of dealing with coauthors. This time, we instead use a simple count of how many coauthors (with `id>100`) they have previously colaborated with appear in the paper. To do this, we build a dictionary of lists of coauthors for each prolific author.

In [48]:
def construct_collaborator_dictionary(df: pd.DataFrame):
    """
    Constructs a database of collobaroter for given author id key. 
    """
    
    collaboraters = {}
    authors       = np.arange(100)
    
    for author in authors:
        df_auth = df.copy(deep=True)
        df_auth["label"] = df_auth["target authors"].apply(lambda x: 1 if author in x else 0)
        df_auth = df_auth[df_auth["label"] == 1]
        coauths = list(set(df_auth["coauthors"].sum()))
        collaboraters[author] = coauths
    
    return collaboraters

In [52]:
collaborator_db = construct_collaborator_dictionary(df_full)

## Model Validation

We now build and validate a model. We use a `RANDOM_STATE` seed to ensure we generate the same training/evaluation split

In [53]:
def upsample_training(X_train, y_train):
    """
    upsamples the minority class until class balance is achieved
    """
    X = pd.concat([X_train, y_train], axis=1)
    
    
    pos = X[X["label"] == 1]
    neg = X[X["label"] == 0]
    
    pos_upsample = resample(pos, replace=True, n_samples=len(neg), random_state=RANDOM_STATE)
    
    resampled = pd.concat([neg, pos_upsample])

    y_train = resampled["label"]
    X_train = resampled.drop(["label"], axis=1)
    return X_train, y_train


def resample_training(X_train, y_train):
    """
    resamples class imbalance using SMOTE: 
    https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html
    """
    sm = SMOTE(random_state=RANDOM_STATE)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    return X_train, y_train

In [67]:
def get_train_val_data(author: int, df:pd.DataFrame):
    # take copy and prepare label
    df = df.copy(deep=True)
    df["label"] = df["target authors"].apply(lambda x: 1 if author in x else 0)
    
    # map number of collaborators for this given instance 
    collabs = collaborator_db[author]
    df["num collaborators"] = df["coauthors"].apply(lambda x: len(set(x).intersection(collabs)))
    
    # drop irrelevant columns
    X = df.drop(["label", "target authors", "coauthors"], axis=1)
    y = df["label"]
    
    # split training and validation - we have fixed random state for reproducability
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=RANDOM_STATE)
    
    # upsample to deal with class imbalance
    X_train, y_train = upsample_training(X_train, y_train)
    
    # we convert to numpy arrays for fitting to sklearn models
    # the reason for this is that sklearn throws annoying warnings otherwise
    return np.array(X_train), np.array(X_val), np.array(y_train), np.array(y_val)

In [76]:
def validate_to_csv(df: pd.DataFrame):
    """
    As we are building 100 classifiers, printing f1 scores within a notebook is impractical. 
    following function writes results to csv. 
    """
    
    with open("validation.csv", mode='w') as f:    
        writer = csv.writer(f)
        
        header = ['Author Id','F1 score']
        writer.writerow(header)
        
        # loop over each author, build classifier and write to output
        authors = np.arange(100)
        
        avg = 0
        for author in tqdm(authors):
            X_train, X_val, y_train, y_val = get_train_val_data(author, df)
            
            clf = LogisticRegression()
            clf.fit(X_train, y_train)
            
            y_pred = clf.predict(X_val)
            
            f1 = f1_score(y_pred, y_val)
            avg += f1
            writer.writerow([author, f1])
        writer.writerow(["average", avg])
    return

In [77]:
validate_to_csv(data)

100%|██████████| 100/100 [06:05<00:00,  3.65s/it]


## Build Binary Classifiers

we now build the 100 binary classifers, one for each author

In [None]:
models = []
authors = np.arange(100)

for author in tqdm(authors):
    X_train, X_val, y_train, y_val = get_train_val_data(author, data)
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    models.append(clf)

 13%|█▎        | 13/100 [00:48<05:35,  3.86s/it]

## Build Predictions