In [1]:
import json
import os
import csv, math
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from collections import defaultdict as dd
from typing import List

NUM_WORDS = 5000
NUM_AUTHORS = 21246
MAX_LEN = 250
RANDOM_STATE = 42069

In [2]:
def load_data_set(path: str):
    """
    loads data set located at path and returns as pandas data frame
    """
    with open(path) as file:
        data = json.load(file)
    
    print(f"loaded {len(data)} instances")
    data = pd.json_normalize(data)
    return data

In [3]:
path = "train.json"
train = load_data_set(path)
train.head()

loaded 25793 instances


Unnamed: 0,authors,year,abstract,venue,title
0,"[42, 13720, 36]",9,"[2455, 1858, 2335, 1543, 1800, 1860, 2000, 286...",20.0,"[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1..."
1,"[1359, 15881, 45]",15,"[40, 1542, 1691, 2449, 1535, 3616, 2206, 1904,...",2.0,"[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3..."
2,"[19166, 17763]",17,"[40, 1542, 1691, 2449, 1535, 2610, 1543, 1535,...",,"[2085, 1719, 1846, 1745, 2243, 1553, 1606, 159..."
3,[97],10,"[46, 1624, 1547, 56, 1687, 1644, 6, 7, 3386, 1...",4.0,"[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5..."
4,"[19617, 2]",10,"[37, 3709, 3836, 1586, 2151, 1727, 3021, 1860,...",9.0,"[38, 1592, 2088, 1543, 1574, 1727, 1597, 1813,..."


In [4]:
def refine(text_ser):

    count = dd(int)

    #Performs feature selection on items.
    #print(list(text_ser["text"]))
    for i in list(text_ser.index):
        #obtain the list
        #print(text_ser["text"])
        id_lst = list(text_ser.loc[i, "text"])
        for item in id_lst:
            count[item] += 1
    #Then do the filtering:
    #lst1 = sorted(lst1, reverse=True)
    lim = math.ceil(np.quantile(list(count.values()), 0.99))
    print("The 95 percent frequency quantile is", lim)
    restr = []
    for item, val in count.items():
        if val <= lim:
            restr.append(item)

    text_ser["text"] = text_ser["text"].apply(lambda x: filter_text(x, restr))
    ##Repeat going through the lists:
    #for i in range(len(text_ser.index)):
    #    #obtain the list
    #    id_lst = list(text_ser["text"])[i]
    #    text_ser.loc[i, "text"] = [x for x in id_lst if x <= lim]
    #print("max", text_ser["text"].max(), len(text_ser["text"].max()))
    return

def filter_text(lst, restr):
    lst_filter = filter(lambda x: x in restr, lst)
    return list(lst_filter)


In [5]:
#refine(train)


## Preprocessing

In [6]:
def preprocess(df: pd.DataFrame, train=True, drop_samples=False):
    
    df = df.copy(deep=True)
   
    if train:
        df["target authors"] = df["authors"].apply(lambda x: filter_authors(x))
        df["coauthors"]      = df["authors"].apply(lambda x: filter_authors(x, prolifics=False))
        df = df.drop(["authors"], axis=1)
    
    # drops samples containing no prolific authors, Reduces training set by ~60% to 7000 samples
    if drop_samples:
        df["has target"] = df["target authors"].apply(lambda x: len(x)>0)
        df = df[df["has target"] == True]
        df = df.drop(["has target"], axis=1)
        
    # text transormation
    # we stringify the list of int's to be used as inputs to the TF-IDF vectoriser
    #But beforehand, perform feature selection.
    df["text"] = df["title"] + df["abstract"]
    #Do a word count:
    refine(df)
    #print("max", df["text"].max(), len(df["text"].max()))
    df["str text"] = df["text"].apply(lambda xs: ''.join(str(x)+' ' for x in xs))
    
    # preprocessing for venue. We use minmax scaling as a matter of best-practice. 
    # as we require all rows to have integer values, we give blank venues a dummy value of 465
    scalar = MinMaxScaler()
    df.loc[df.venue == "", "venue"] = 465
    df["venue"] = scalar.fit_transform(df["venue"].to_numpy().reshape(-1, 1))

    # drop
    df = df.drop(["abstract", "title", "year"], axis=1)
    return df

In [7]:
def filter_authors(authors: List[int], prolifics=True):
    """
    filters authors between prolific and coauthors
    """
    if prolifics:
        prolifics = filter(lambda x: x < 100, authors)
        return list(prolifics)
    else:
        coauthors = filter(lambda x: x>=100, authors)
        return list(coauthors)

In [8]:
df = preprocess(train, train=True, drop_samples=True)
df.head()

The 95 percent frequency quantile is 3827


Unnamed: 0,venue,target authors,coauthors,text,str text
0,0.043011,"[42, 36]",[13720],"[1550, 1563, 1594, 1544, 1919, 1644, 1539, 171...",1550 1563 1594 1544 1919 1644 1539 1715 1541 1...
1,0.004301,[45],"[1359, 15881]","[57, 4624, 3522, 2223, 1653, 1691, 2449, 3616,...",57 4624 3522 2223 1653 1691 2449 3616 2206 190...
3,0.008602,[97],[],"[1733, 1735, 1540, 1624, 56, 1687, 1644, 1624,...",1733 1735 1540 1624 56 1687 1644 1624 4226 162...
4,0.019355,[2],[19617],"[38, 1592, 2088, 1574, 1727, 1597, 1813, 1926,...",38 1592 2088 1574 1727 1597 1813 1926 1623 162...
9,0.0,"[44, 2]","[9641, 5623]","[1560, 1694, 3066, 1728, 1603, 1594, 1531, 153...",1560 1694 3066 1728 1603 1594 1531 1532 2021 1...


## TF-IDF Preprocessing

TF-IDF is a nlp preprocessing method to map text input to a vector of reals. TF-IDF is an improvement upon previous feature engineering that we have performed as it adjusts the value of each word, relative to how freuently it occurs. Stop words such as *the, it, how* have relatively low weightings, so the resulting vector only captures the most *important* words within the input. This (typically) leads to TF-IDF representations outperforming word-count representations for most tasks

We use `sklearn`'s feature extraction to automate this process. 

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf   = TfidfVectorizer()
vectors = tfidf.fit_transform(df["str text"])

In [10]:
feature_names = tfidf.get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()
df_text = pd.DataFrame(denselist, columns=feature_names)
df_text.head()

Unnamed: 0,100,1005,1006,1007,1009,101,1014,1016,1022,1024,...,962,965,968,970,973,977,98,980,987,998
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
df_auth_venue = df[["target authors", "venue", "coauthors"]]
df_full       = pd.concat([df_text.reset_index(drop=True), df_auth_venue.reset_index(drop=True)], axis=1)
df_full.head()

Unnamed: 0,100,1005,1006,1007,1009,101,1014,1016,1022,1024,...,970,973,977,98,980,987,998,target authors,venue,coauthors
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[42, 36]",0.043011,[13720]
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[45],0.004301,"[1359, 15881]"
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[97],0.008602,[]
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[2],0.019355,[19617]
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[44, 2]",0.0,"[9641, 5623]"


## Author Preprocessing

We previously used a binning strategy of dealing with coauthors. This time, we instead use a simple count of how many coauthors (with `id>100`) they have previously colaborated with appear in the paper. To do this, we build a dictionary of lists of coauthors for each prolific author.

In [12]:
def construct_collaborator_dictionary(df: pd.DataFrame):
    """
    Constructs a database of collobaroter for given author id key. 
    """
    
    collaboraters = {}
    authors       = np.arange(100)
    
    for author in authors:
        df_auth = df.copy(deep=True)
        df_auth["label"] = df_auth["target authors"].apply(lambda x: 1 if author in x else 0)
        df_auth = df_auth[df_auth["label"] == 1]
        coauths = list(set(df_auth["coauthors"].sum()))
        collaboraters[author] = coauths
    
    return collaboraters

In [13]:
collaborator_db = construct_collaborator_dictionary(df_full)

## Model Validation

We now build and validate a model. We use a `RANDOM_STATE` seed to ensure we generate the same training/evaluation split. We write our results to a csv file to avoid an unecessarily long notebook

In [14]:
def upsample_training(X_train, y_train):
    """
    upsamples the minority class until class balance is achieved
    """
    X = pd.concat([X_train, y_train], axis=1)
    
    
    pos = X[X["label"] == 1]
    neg = X[X["label"] == 0]
    
    pos_upsample = resample(pos, replace=True, n_samples=len(neg), random_state=RANDOM_STATE)
    
    resampled = pd.concat([neg, pos_upsample])

    y_train = resampled["label"]
    X_train = resampled.drop(["label"], axis=1)
    return X_train, y_train


def resample_training(X_train, y_train):
    """
    resamples class imbalance using SMOTE: 
    https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html
    """
    sm = SMOTE(random_state=RANDOM_STATE)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    return X_train, y_train

In [15]:
def get_train_val_data(author: int, df:pd.DataFrame):
    # take copy and prepare label
    df = df.copy(deep=True)
    df["label"] = df["target authors"].apply(lambda x: 1 if author in x else 0)
    
    # map number of collaborators for this given instance 
    collabs = collaborator_db[author]
    df["num collaborators"] = df["coauthors"].apply(lambda x: len(set(x).intersection(collabs)))
    
    # drop irrelevant columns
    X = df.drop(["label", "target authors", "coauthors"], axis=1)
    y = df["label"]
    
    # split training and validation - we have fixed random state for reproducability
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
    
    # upsample to deal with class imbalance
    X_train, y_train = upsample_training(X_train, y_train)
    
    # we convert to numpy arrays for fitting to sklearn models
    # the reason for this is that sklearn throws annoying warnings otherwise
    return np.array(X_train), np.array(X_val), np.array(y_train), np.array(y_val)

In [16]:
def validate_to_csv(df: pd.DataFrame):
    """
    As we are building 100 classifiers, printing f1 scores within a notebook is impractical. 
    following function writes results to csv. 
    """
    
    with open("validation99.csv", mode='w') as f:    
        writer = csv.writer(f)
        
        header = ['Author Id','F1 score']
        writer.writerow(header)
        
        # loop over each author, build classifier and write to output
        authors = np.arange(100)
        
        avg = 0
        for author in tqdm(authors):
            X_train, X_val, y_train, y_val = get_train_val_data(author, df)
            
            clf = LogisticRegression()
            clf.fit(X_train, y_train)
            
            y_pred = clf.predict(X_val)
            
            f1 = f1_score(y_pred, y_val)
            avg += f1
            writer.writerow([author, f1])
        writer.writerow(["average", avg])
    return

In [17]:
validate_to_csv(df_full)

100%|██████████| 100/100 [08:32<00:00,  5.12s/it]


## Build Binary Classifiers

We now build the 100 binary classifers, one for each author

In [18]:
models = []
authors = np.arange(100)

for author in tqdm(authors):
    X_train, X_val, y_train, y_val = get_train_val_data(author, df_full)#, split=0.1)
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    models.append(clf)

100%|██████████| 100/100 [09:10<00:00,  5.50s/it]


## Build Predictions

We now load in our `test.json` set to generate predictions. We first put the training data through the usual preprocessing pipeline and exclude vocabular that was not part of our `tf-idf` training lexicon

In [19]:
path = "test.json"
df_test = load_data_set(path)
df_test = preprocess(df_test, train=False)
df_test.head()

loaded 800 instances
The 95 percent frequency quantile is 558


Unnamed: 0,identifier,coauthors,venue,text,str text
0,0,"[16336, 1762, 4357, 12564]",0.47957,"[3207, 24, 1798, 1738, 2375, 1568, 53, 1584, 1...",3207 24 1798 1738 2375 1568 53 1584 1903 4522 ...
1,1,"[21189, 14088]",0.47957,"[40, 1560, 1536, 1544, 1609, 1705, 52, 1580, 4...",40 1560 1536 1544 1609 1705 52 1580 44 1805 21...
2,2,"[3625, 1198, 19889, 794, 2749, 7801]",0.015054,"[1574, 1729, 1641, 2533, 2015, 1930, 1595, 153...",1574 1729 1641 2533 2015 1930 1595 1536 1532 1...
3,3,"[19810, 15173, 5876, 111]",0.045161,"[1770, 53, 2054, 1723, 2796, 1547, 1730, 1575,...",1770 53 2054 1723 2796 1547 1730 1575 1785 153...
4,4,"[10932, 7668, 11907, 19601, 15307, 10492, 1049...",1.0,"[18, 1924, 23, 1544, 3927, 2686, 1660, 1548, 3...",18 1924 23 1544 3927 2686 1660 1548 3691 3037 ...


In [20]:
# this complicated looking lambda simply removes any words that were not part of our preprocessing. 
# failing to do so, would pass an unseen word to our tf-idf vectoriser and would crash our program
tfidf_features  = tfidf.get_feature_names_out()
df_test['text'] = df_test['text'].apply(lambda xs: list(filter((lambda x: str(x) in tfidf_features), xs)))
df_test["text"] = df_test["text"].apply(lambda xs: ''.join(str(x)+' ' for x in xs))

In [21]:
# now apply the tf-idf transformation to the text component
X_test = tfidf.transform(df_test['text'])
X_test = pd.DataFrame((X_test.todense().tolist()), columns=tfidf_features)

In [22]:
# now put everything back together
test = pd.concat([X_test, df_test["venue"], df_test["coauthors"]], axis=1)

In [23]:
def make_predictions(test_df: pd.DataFrame):
    """
    function for writing predictions to output file. 
    WARNING: Deletes predictions.csv if present in working directory
    """
    if os.path.exists("predictions99.csv"):
        os.remove("predictions99.csv")
        print("removed previous predictions")
    
    
    with open("predictions99.csv", mode='w') as f:    
        writer = csv.writer(f)
        
        header = ['Id','Predict']
        writer.writerow(header)
        n      = X_test.shape[0]
        
        # loop over each training sample and write to necessary format
        for Id in tqdm(range(n)):
            
            # we need to keep x as a dataframe for this model so we can apply the collobartor mapping easily
            x   = test_df.iloc[Id].to_frame().T
            row = [Id]
            authors = ""
            
            for author, model in enumerate(models):
                # map number of collaborators for this given instance 
                X = x.copy(deep = True)
                collabs = collaborator_db[author]
                X["num collaborators"] = X["coauthors"].apply(lambda x: len(set(x).intersection(collabs)))
                X = X.drop(["coauthors"], axis=1)
                X = np.array(X).reshape(1, -1)
               
                if np.array(model.predict(X)).item() == 1:
                    authors += str(author) + " "

            # to match the output requirement 
            if len(authors) == 0: row = [Id, -1]
            else: row = [Id, authors]
            
            writer.writerow(row)
    return

In [24]:
make_predictions(test)

100%|██████████| 800/800 [02:25<00:00,  5.50it/s]
