In [7]:
import json
import os
import csv
import numpy as np
import pandas as pd
from tqdm import tqdm
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
from typing import List

NUM_WORDS = 5000
NUM_AUTHORS = 21246
MAX_LEN = 250
RANDOM_STATE = 42069

In [8]:
def load_data_set(path: str):
    """
    loads data set located at path and returns as pandas data frame
    """
    with open(path) as file:
        data = json.load(file)
    
    print(f"loaded {len(data)} instances")
    data = pd.json_normalize(data)
    return data

In [9]:
path = "data/train.json"
train = load_data_set(path)
train.head()

loaded 25793 instances


Unnamed: 0,authors,year,abstract,venue,title
0,"[42, 13720, 36]",9,"[2455, 1858, 2335, 1543, 1800, 1860, 2000, 286...",20.0,"[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1..."
1,"[1359, 15881, 45]",15,"[40, 1542, 1691, 2449, 1535, 3616, 2206, 1904,...",2.0,"[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3..."
2,"[19166, 17763]",17,"[40, 1542, 1691, 2449, 1535, 2610, 1543, 1535,...",,"[2085, 1719, 1846, 1745, 2243, 1553, 1606, 159..."
3,[97],10,"[46, 1624, 1547, 56, 1687, 1644, 6, 7, 3386, 1...",4.0,"[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5..."
4,"[19617, 2]",10,"[37, 3709, 3836, 1586, 2151, 1727, 3021, 1860,...",9.0,"[38, 1592, 2088, 1543, 1574, 1727, 1597, 1813,..."


### Preprocessing

In [10]:
def preprocess(df: pd.DataFrame, train=True, drop_samples=False):
    
    df = df.copy(deep=True)
    
    # filter year
    df = df[df["year"] > 13]
    
    if train:
        df["target authors"] = df["authors"].apply(lambda x: filter_authors(x))
        df["coauthors"]      = df["authors"].apply(lambda x: filter_authors(x, prolifics=False))
        df = df.drop(["authors"], axis=1)
    
    # drops samples containing no prolific authors, Reduces training set by ~60% to 7000 samples
    if drop_samples:
        df["has target"] = df["target authors"].apply(lambda x: len(x)>0)
        df = df[df["has target"] == True]
        df = df.drop(["has target"], axis=1)
        
    # text transormation
    # we stringify the list of int's to be used as inputs to the TF-IDF vectoriser
    df["text"] = df["title"] + df["abstract"]
    df["str text"] = df["text"].apply(lambda xs: ''.join(str(x)+' ' for x in xs))
    
    # preprocessing for venue. We use minmax scaling as a matter of best-practice. 
    # as we require all rows to have integer values, we give blank venues a dummy value of 465
    scalar = MinMaxScaler()
    df.loc[df.venue == "", "venue"] = 465
    df["venue"] = scalar.fit_transform(df["venue"].to_numpy().reshape(-1, 1))

    # drop
    df = df.drop(["abstract", "title", "year"], axis=1)
    return df

In [11]:
def filter_authors(authors: List[int], prolifics=True):
    """
    filters authors between prolific and coauthors
    """
    if prolifics:
        prolifics = filter(lambda x: x < 100, authors)
        return list(prolifics)
    else:
        coauthors = filter(lambda x: x>=100, authors)
        return list(coauthors)

In [12]:
df = preprocess(train, train=True, drop_samples=True)
df.head()

Unnamed: 0,venue,target authors,coauthors,text,str text
1,0.004301,[45],"[1359, 15881]","[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3...",1731 47 11 57 4624 1525 1535 47 11 3522 2223 1...
9,0.0,"[44, 2]","[9641, 5623]","[1560, 1694, 11, 1546, 11, 3066, 1728, 47, 160...",1560 1694 11 1546 11 3066 1728 47 1603 1553 11...
22,0.015054,[49],"[14738, 14099, 3438]","[45, 1559, 2179, 1623, 1621, 3390, 1538, 47, 1...",45 1559 2179 1623 1621 3390 1538 47 1574 1530 ...
40,0.021505,[95],"[511, 18249]","[37, 47, 3363, 1553, 55, 2793, 24, 1637, 1538,...",37 47 3363 1553 55 2793 24 1637 1538 1539 2712...
50,0.090323,"[9, 30]",[15587],"[37, 1740, 1529, 1589, 1629, 47, 1603, 1553, 5...",37 1740 1529 1589 1629 47 1603 1553 57 2103 24...


### TD-IDF Preprocessing

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf   = TfidfVectorizer()
vectors = tfidf.fit_transform(df["str text"])

In [14]:
feature_names = tfidf.get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()
df_text = pd.DataFrame(denselist, columns=feature_names)
df_text.head()

Unnamed: 0,10,100,1005,1007,1016,1027,103,1033,1034,1044,...,952,953,954,962,965,970,977,98,980,987
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.13284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.188034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.065414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.052196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
df_auth_venue = df[["target authors", "venue", "coauthors"]]
df_full       = pd.concat([df_text.reset_index(drop=True), df_auth_venue.reset_index(drop=True)], axis=1)
df_full.head()

Unnamed: 0,10,100,1005,1007,1016,1027,103,1033,1034,1044,...,962,965,970,977,98,980,987,target authors,venue,coauthors
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[45],0.004301,"[1359, 15881]"
1,0.13284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[44, 2]",0.0,"[9641, 5623]"
2,0.188034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[49],0.015054,"[14738, 14099, 3438]"
3,0.065414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[95],0.021505,"[511, 18249]"
4,0.052196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[9, 30]",0.090323,[15587]


### Author Preprocessing

In [16]:
def construct_collaborator_dictionary(df: pd.DataFrame):
    """
    Constructs a database of collobaroter for given author id key. 
    """
    
    collaboraters = {}
    authors       = np.arange(100)
    
    for author in authors:
        df_auth = df.copy(deep=True)
        df_auth["label"] = df_auth["target authors"].apply(lambda x: 1 if author in x else 0)
        df_auth = df_auth[df_auth["label"] == 1]
        coauths = list(set(df_auth["coauthors"].sum()))
        collaboraters[author] = coauths
    
    return collaboraters

In [17]:
collaborator_db = construct_collaborator_dictionary(df_full)

### Model Validation

In [18]:
def upsample_training(X_train, y_train):
    """
    upsamples the minority class until class balance is achieved
    """
    X = pd.concat([X_train, y_train], axis=1)
    
    
    pos = X[X["label"] == 1]
    neg = X[X["label"] == 0]
    
    pos_upsample = resample(pos, replace=True, n_samples=len(neg), random_state=RANDOM_STATE)
    
    resampled = pd.concat([neg, pos_upsample])

    y_train = resampled["label"]
    X_train = resampled.drop(["label"], axis=1)
    return X_train, y_train

In [143]:
def get_train_val_data(author: int, df:pd.DataFrame, split: float):
    # take copy and prepare label
    df = df.copy(deep=True)
    df["label"] = df["target authors"].apply(lambda x: 1 if author in x else 0)
    
    # map number of collaborators for this given instance 
    collabs = collaborator_db[author]
    df["num collaborators"] = df["coauthors"].apply(lambda x: len(set(x).intersection(collabs)))
    
    # drop irrelevant columns
    X = df.drop(["label", "target authors", "coauthors"], axis=1)
    y = df["label"]
    
    # split training and validation - we have fixed random state for reproducability
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=split, random_state=RANDOM_STATE)
    
    # upsample to deal with class imbalance
    X_train, y_train = upsample_training(X_train, y_train)
    
    # we convert to numpy arrays for fitting to sklearn models
    # the reason for this is that sklearn throws annoying warnings otherwise
    return np.array(X_train), np.array(X_val), np.array(y_train), np.array(y_val)

In [20]:
def validate_to_csv(df: pd.DataFrame, n_epochs):
    """
    As we are building 100 classifiers, printing f1 scores within a notebook is impractical. 
    following function writes results to csv. 
    """
    
    with open("validation-XGB.csv", mode='w') as f:    
        writer = csv.writer(f)
        
        header = ['Author Id','F1 score']
        writer.writerow(header)
        
        # loop over each author, build classifier and write to output
        authors = np.arange(100)
        
        avg = 0
        for author in tqdm(authors):
            X_train, X_val, y_train, y_val = get_train_val_data(author, df)
            
            # tree method enables use of gpu to reduce time of training by 70% 
            clf = XGBClassifier(eval_metric='logloss', tree_method='gpu_hist')
            clf.fit(X_train, y_train)
            
            y_pred = clf.predict(X_val)
            
            f1 = f1_score(y_pred, y_val)
            avg += f1
            writer.writerow([author, f1])
        writer.writerow(["average", avg])
    return

In [17]:
data = df_full
validate_to_csv(data, 2)

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
100%|██████████| 100/100 [03:27<00:00,  2.08s/it]


### Author Postprocessing
We output a list of authors as our predictions, postprocessing verifies that these predictions make sense, and only ensures that we keep predictions that are within a given degree of separation of one another. This is not applied until after predictions. Given we already have computed our author DB, we can easily compute degree of separation using an Online implementation of Dijkstra's algorithm

In [21]:
from collections import defaultdict
from dijkstra import Dijkstra

In [54]:
path = "data/train.json"
train = load_data_set(path)
train.head()

loaded 25793 instances


Unnamed: 0,authors,year,abstract,venue,title
0,"[42, 13720, 36]",9,"[2455, 1858, 2335, 1543, 1800, 1860, 2000, 286...",20.0,"[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1..."
1,"[1359, 15881, 45]",15,"[40, 1542, 1691, 2449, 1535, 3616, 2206, 1904,...",2.0,"[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3..."
2,"[19166, 17763]",17,"[40, 1542, 1691, 2449, 1535, 2610, 1543, 1535,...",,"[2085, 1719, 1846, 1745, 2243, 1553, 1606, 159..."
3,[97],10,"[46, 1624, 1547, 56, 1687, 1644, 6, 7, 3386, 1...",4.0,"[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5..."
4,"[19617, 2]",10,"[37, 3709, 3836, 1586, 2151, 1727, 3021, 1860,...",9.0,"[38, 1592, 2088, 1543, 1574, 1727, 1597, 1813,..."


In [55]:
def filter_authors_postprocess(authors: List[int], author: int):
    return list(filter(lambda x: x!= author, authors))

def build_full_colaborator_db(df):
    """
    Constructs a database of collobaroter for given author id key. 
    """
    
    collaboraters = {}
    authors       = np.arange(21246)
    
    for author in tqdm(authors):
        df_auth               = df.copy(deep=True)
        df_auth["label"]      = df_auth["authors"].apply(lambda x: 1 if author in x else 0)
        df_auth               = df_auth[df_auth["label"] == 1]
        df_auth["coauthors"]  = df["authors"].apply(lambda x: filter_authors_postprocess(x, author))
        coauths               = list(set(df_auth["coauthors"].sum()))
        collaboraters[author] = coauths
    
    return collaboraters

In [56]:
full_colaborator_db = build_full_colaborator_db(train)

100%|██████████| 21246/21246 [3:00:12<00:00,  1.96it/s]   


In [154]:
from sys import maxsize
from ast import literal_eval

def open_author_graph(path: str):
    """
    reads in author graph. 
    
    WARNING: Takes a long time to execute and can't slap a tqdm ~ 7.5 mins 
    """
    
    
    # pythons csv library sets a max size for the columns, which we exceed
    # this loop overwrites this so we can read in the columns
    """
    while True:
        try:
            csv.field_size_limit(maxsize)
        except OverflowError:
            field_size_limit = int(maxsize / 10)
    """
    # now load in the author graph
    author_graph = {}
    
    with open('author_graph.csv') as csv_file:
        reader = csv.reader(csv_file)
        auths = []
        for row in reader:
            author = int(row[0])
            edges  = literal_eval(row[1]) 
            author_graph[author] = edges
        
    return author_graph

### Build Binary Classifiers

In [248]:
data.head()

Unnamed: 0,10,100,1005,1007,1016,1027,103,1033,1034,1044,...,962,965,970,977,98,980,987,target authors,venue,coauthors
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[45],0.004301,"[1359, 15881]"
1,0.13284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[44, 2]",0.0,"[9641, 5623]"
2,0.188034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[49],0.015054,"[14738, 14099, 3438]"
3,0.065414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[95],0.021505,"[511, 18249]"
4,0.052196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[9, 30]",0.090323,[15587]


In [141]:
data = df_full

In [144]:
models = []
authors = np.arange(100)

for author in tqdm(authors):
    X_train, X_val, y_train, y_val = get_train_val_data(author, data, split=0.05)
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    models.append(clf)

100%|██████████| 100/100 [02:17<00:00,  1.38s/it]


### Build Predications

In [174]:
path = "data/test.json"
df_test = load_data_set(path)
df_test = preprocess(df_test, train=False)
df_test.head()

loaded 800 instances


Unnamed: 0,identifier,coauthors,venue,text,str text
0,0,"[16336, 1762, 4357, 12564]",0.47957,"[3207, 24, 1798, 1738, 37, 2375, 1568, 11, 53,...",3207 24 1798 1738 37 2375 1568 11 53 1584 1903...
1,1,"[21189, 14088]",0.47957,"[40, 1560, 1536, 1544, 1609, 1705, 1658, 1543,...",40 1560 1536 1544 1609 1705 1658 1543 52 11 33...
2,2,"[3625, 1198, 19889, 794, 2749, 7801]",0.015054,"[47, 1574, 1729, 1641, 11, 37, 2533, 2015, 47,...",47 1574 1729 1641 11 37 2533 2015 47 1930 1549...
3,3,"[19810, 15173, 5876, 111]",0.045161,"[1770, 53, 2054, 1549, 1529, 1723, 2796, 1547,...",1770 53 2054 1549 1529 1723 2796 1547 1543 47 ...
4,4,"[10932, 7668, 11907, 19601, 15307, 10492, 1049...",1.0,"[18, 1924, 23, 1544, 3927, 2686, 1543, 1535, 1...",18 1924 23 1544 3927 2686 1543 1535 1660 1548 ...


In [175]:
# this complicated looking lambda simply removes any words that were not part of our preprocessing. 
# failing to do so, would pass an unseen word to our tf-idf vectoriser and would crash our program
tfidf_features  = tfidf.get_feature_names_out()
df_test['text'] = df_test['text'].apply(lambda xs: list(filter((lambda x: str(x) in tfidf_features), xs)))
df_test["text"] = df_test["text"].apply(lambda xs: ''.join(str(x)+' ' for x in xs))

In [176]:
# now apply the tf-idf transformation to the text component
X_test = tfidf.transform(df_test['text'])
X_test = pd.DataFrame((X_test.todense().tolist()), columns=tfidf_features)

In [177]:
# now put everything back together
test = pd.concat([X_test, df_test["venue"], df_test["coauthors"]], axis=1)

In [243]:
def make_predictions(test_df: pd.DataFrame):
    """
    function for writing predictions to output file. 
    WARNING: Deletes predictions.csv if present in working directory
    """
    if os.path.exists("predictions.csv"):
        os.remove("predictions.csv")
        print("removed previous predictions")
    
    
    with open("predictions.csv", mode='w', newline='') as f:    
        writer = csv.writer(f)
        
        header = ['Id','Predict']
        writer.writerow(header)
        n      = X_test.shape[0]
        
        # loop over each training sample and write to necessary format
        for Id in tqdm(range(n)):
            
            # we need to keep x as a dataframe for this model so we can apply the collobartor mapping easily
            x   = test_df.iloc[Id].to_frame().T
            row = [Id]
            authors = ""
            candidates = []
            
            
            for author, model in enumerate(models):
                # map number of collaborators for this given instance 
                X = x.copy(deep = True)
                collabs = collaborator_db[author]
                X["num collaborators"] = X["coauthors"].apply(lambda x: len(set(x).intersection(collabs)))
                X = X.drop(["coauthors"], axis=1)
                X = np.array(X).reshape(1, -1)
               
                if np.array(model.predict(X)).item() == 1:
                    authors += str(author) + " "
                    prob     = model.predict_proba(X).tolist()[0][1]  # returns probability of success
                    candidates.append((author, prob))

            # to match the output requirement 
            if len(authors) == 0: row = [Id, -1]
            else: 
                authors = postprocess(candidates)
                row = [Id, authors]
            
            writer.writerow(row)
    return

In [251]:
def postprocess(predictions: List):
    """
    post processes predictions
    """
    
    # if we make a single prediction, just return it as there is no postprocessing necessary
    if len(predictions) == 1: return str(predictions[0][0])
    
    # max degree of separation we allow between lead authors
    max_degree_sep = 1
    
    predictions.sort(key=lambda x: x[1], reverse=True)
    authors = [author[0] for author in predictions]
    
    # author with the highes probability of being in the paper
    lead_author = authors[0]
    
    # degrees of separation between lead author and other authors if other authors 
    final_preds = []
    degrees_sep = Dijkstra(author_graph, lead_author)
    
    for author in authors:
        if degrees_sep[author] <= max_degree_sep:
            final_preds.append(author)
    
    return ''.join(str(x)+" " for x in final_preds)


In [252]:
make_predictions(test)

removed previous predictions


100%|██████████| 800/800 [02:33<00:00,  5.20it/s]
