In [1]:
import json
import os
import csv
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC, LinearSVC
from typing import List
import matplotlib.pyplot as plt

NUM_WORDS = 5000
NUM_AUTHORS = 21246
MAX_LEN = 250
RANDOM_STATE = 42069

In [2]:
def load_data_set(path: str):
    """
    loads data set located at path and returns as pandas data frame
    """
    with open(path) as file:
        data = json.load(file)
    
    print(f"loaded {len(data)} instances")
    data = pd.json_normalize(data)
    return data

In [3]:
path = "train.json"
train = load_data_set(path)
train.head()

loaded 25793 instances


Unnamed: 0,authors,year,abstract,venue,title
0,"[42, 13720, 36]",9,"[2455, 1858, 2335, 1543, 1800, 1860, 2000, 286...",20.0,"[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1..."
1,"[1359, 15881, 45]",15,"[40, 1542, 1691, 2449, 1535, 3616, 2206, 1904,...",2.0,"[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3..."
2,"[19166, 17763]",17,"[40, 1542, 1691, 2449, 1535, 2610, 1543, 1535,...",,"[2085, 1719, 1846, 1745, 2243, 1553, 1606, 159..."
3,[97],10,"[46, 1624, 1547, 56, 1687, 1644, 6, 7, 3386, 1...",4.0,"[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5..."
4,"[19617, 2]",10,"[37, 3709, 3836, 1586, 2151, 1727, 3021, 1860,...",9.0,"[38, 1592, 2088, 1543, 1574, 1727, 1597, 1813,..."


## Preprocessing

In [4]:
def preprocess(df: pd.DataFrame, train=True, drop_samples=False):
    
    df = df.copy(deep=True)
   
    if train:
        df["target authors"] = df["authors"].apply(lambda x: filter_authors(x))
        df["coauthors"]      = df["authors"].apply(lambda x: filter_authors(x, prolifics=False))
        df = df.drop(["authors"], axis=1)
    
    # drops samples containing no prolific authors, Reduces training set by ~60% to 7000 samples
    if drop_samples:
        df["has target"] = df["target authors"].apply(lambda x: len(x)>0)
        df = df[df["has target"] == True]
        df = df.drop(["has target"], axis=1)
        
    # text transormation
    # we stringify the list of int's to be used as inputs to the TF-IDF vectoriser
    df["text"] = df["title"] + df["abstract"]
    df["str text"] = df["text"].apply(lambda xs: ''.join(str(x)+' ' for x in xs))
    
    # preprocessing for venue. We use minmax scaling as a matter of best-practice. 
    # as we require all rows to have integer values, we give blank venues a dummy value of 465
    scalar = MinMaxScaler()
    df.loc[df.venue == "", "venue"] = 465
    df["venue"] = scalar.fit_transform(df["venue"].to_numpy().reshape(-1, 1))

    # drop
    df = df.drop(["abstract", "title"], axis=1)
    return df

In [5]:
def filter_authors(authors: List[int], prolifics=True):
    """
    filters authors between prolific and coauthors
    """
    if prolifics:
        prolifics = filter(lambda x: x < 100, authors)
        return list(prolifics)
    else:
        coauthors = filter(lambda x: x>=100, authors)
        return list(coauthors)

In [6]:
df = preprocess(train, train=True, drop_samples=True)

In [7]:
df["year"].unique()

array([ 9, 15, 10, 18, 11,  8,  2, 13,  1, 16,  6,  5, 17, 14,  3, 12,  4,
        7,  0])

## TF-IDF Preprocessing

TF-IDF is a nlp preprocessing method to map text input to a vector of reals. TF-IDF is an improvement upon previous feature engineering that we have performed as it adjusts the value of each word, relative to how freuently it occurs. Stop words such as *the, it, how* have relatively low weightings, so the resulting vector only captures the most *important* words within the input. This (typically) leads to TF-IDF representations outperforming word-count representations for most tasks

We use `sklearn`'s feature extraction to automate this process. 

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf   = TfidfVectorizer()
vectors = tfidf.fit_transform(df["str text"])

In [9]:
feature_names = tfidf.get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()
df_text = pd.DataFrame(denselist, columns=feature_names)
df_text.head()

Unnamed: 0,10,100,1005,1006,1007,1009,101,1014,1016,1022,...,962,965,968,970,973,977,98,980,987,998
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.090374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.018773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.130861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
df_auth_venue = df[["target authors", "venue", "coauthors", "year"]]
df_full       = pd.concat([df_text.reset_index(drop=True), df_auth_venue.reset_index(drop=True)], axis=1)
df_full.head()

Unnamed: 0,10,100,1005,1006,1007,1009,101,1014,1016,1022,...,973,977,98,980,987,998,target authors,venue,coauthors,year
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,"[42, 36]",0.043011,[13720],9
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,[45],0.004301,"[1359, 15881]",15
2,0.090374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,[97],0.008602,[],10
3,0.018773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,[2],0.019355,[19617],10
4,0.130861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,"[44, 2]",0.0,"[9641, 5623]",18


## Author Preprocessing

We previously used a binning strategy of dealing with coauthors. This time, we instead use a simple count of how many coauthors (with `id>100`) they have previously colaborated with appear in the paper. To do this, we build a dictionary of lists of coauthors for each prolific author.

In [11]:
def construct_collaborator_dictionary(df: pd.DataFrame):
    """
    Constructs a database of collobaroter for given author id key. 
    """
    
    collaboraters = {}
    authors       = np.arange(100)
    
    for author in authors:
        df_auth = df.copy(deep=True)
        df_auth["label"] = df_auth["target authors"].apply(lambda x: 1 if author in x else 0)
        df_auth = df_auth[df_auth["label"] == 1]
        coauths = list(set(df_auth["coauthors"].sum()))
        collaboraters[author] = coauths
    
    return collaboraters

In [12]:
collaborator_db = construct_collaborator_dictionary(df_full)

## Model Validation

We now build and validate a model. We use a `RANDOM_STATE` seed to ensure we generate the same training/evaluation split. We write our results to a csv file to avoid an unecessarily long notebook

In [13]:
def upsample_training(X_train, y_train):
    """
    upsamples the minority class until class balance is achieved
    """
    X = pd.concat([X_train, y_train], axis=1)
    
    
    pos = X[X["label"] == 1]
    neg = X[X["label"] == 0]
    
    pos_upsample = resample(pos, replace=True, n_samples=len(neg), random_state=RANDOM_STATE)
    
    resampled = pd.concat([neg, pos_upsample])

    y_train = resampled["label"]
    X_train = resampled.drop(["label"], axis=1)
    return X_train, y_train


def resample_training(X_train, y_train):
    """
    resamples class imbalance using SMOTE: 
    https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html
    """
    sm = SMOTE(random_state=RANDOM_STATE)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    return X_train, y_train

In [14]:
def get_train_val_data(author: int, df:pd.DataFrame):
    # take copy and prepare label
    df = df.copy(deep=True)
    df["label"] = df["target authors"].apply(lambda x: 1 if author in x else 0)
    
    # map number of collaborators for this given instance 
    collabs = collaborator_db[author]
    df["num collaborators"] = df["coauthors"].apply(lambda x: len(set(x).intersection(collabs)))
    
    # drop irrelevant columns
    X = df.drop(["label", "target authors", "coauthors", "year"], axis=1)
    y = df["label"]
    
    # split training and validation - we have fixed random state for reproducability
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
    
    # upsample to deal with class imbalance
    X_train, y_train = upsample_training(X_train, y_train)
    
    # we convert to numpy arrays for fitting to sklearn models
    # the reason for this is that sklearn throws annoying warnings otherwise
    return np.array(X_train), np.array(X_val), np.array(y_train), np.array(y_val)

In [15]:
def validate_to_csv(df: pd.DataFrame):#, restr_lst):
    """
    As we are building 100 classifiers, printing f1 scores within a notebook is impractical. 
    following function writes results to csv. 
    """
    author_lst = []
    with open("Results/validation_yearLogistic.csv", mode='w') as f:    
        writer = csv.writer(f)
        
        header = ['Author Id','F1 score']
        writer.writerow(header)
        
        # loop over each author, build classifier and write to output
        authors = np.arange(100)
        
        avg = 0
        for author in tqdm(authors):

            X_train, X_val, y_train, y_val = get_train_val_data(author, df)
            
            clf = LogisticRegression()
            clf.fit(X_train, y_train)
            
            y_pred = clf.predict(X_val)
            
            f1 = f1_score(y_pred, y_val)
            avg += f1
            writer.writerow([author, f1])

            #To see the changes to the low-F1 authors after restricting on year
            if f1 <= 0.5:
                author_lst.append(author)

        writer.writerow(["average", avg])
    return author_lst

In [16]:
df_full.head()

Unnamed: 0,10,100,1005,1006,1007,1009,101,1014,1016,1022,...,973,977,98,980,987,998,target authors,venue,coauthors,year
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,"[42, 36]",0.043011,[13720],9
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,[45],0.004301,"[1359, 15881]",15
2,0.090374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,[97],0.008602,[],10
3,0.018773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,[2],0.019355,[19617],10
4,0.130861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,"[44, 2]",0.0,"[9641, 5623]",18


In [17]:
def print_function(author, df):
    #for item in author_lst:
    # take copy and prepare label
    df = df.copy(deep=True)
    df["label"] = df["target authors"].apply(lambda x: 1 if author in x else 0)
    df_val = df[(df["label"] == 1)] 
    df_plot = df[(df["label"] == 1) & (df["year"] >= 9)] 
    print("length", len(df_plot))
    print("length val", len(df_val))

    #Then plot based on those the author is in
    plt.hist(df_val["year"], bins=np.arange(df_val["year"].min(), df_val["year"].max()))
    return

def restrict_function(restr_lst, df):
    #for item in author_lst:
    # take copy and prepare label
    for author in restr_lst:
        df["label"] = df["target authors"].apply(lambda x: 1 if author in x else 0)
        df = df.drop(df[(df["year"] < 13) & (df["label"] == 1)].index)
        df = df.drop(["label"], axis=1)
    return df

#From using ModelOriginal.ipynb with logistic regression the following authors are seen to have an average F1 of at most 0.5:
restr_lst = [71, 18, 28, 86, 62, 3, 81, 51, 34, 6, 20, 99] 

In [18]:
#print_function(restr_lst[0], df_full) 

In [19]:
df_restr = restrict_function(restr_lst, df_full)

result = validate_to_csv(df_restr)
print(result)
#[(0.3333333333333333, 71), (0.34782608695652173, 18), (0.3529411764705882, 28), 
# (0.3529411764705882, 86), (0.3555555555555555, 62), (0.3846153846153846, 3), 
# (0.4, 81), (0.4761904761904762, 51), (0.48484848484848486, 34), (0.5, 6), (0.5, 20), (0.5, 99)]

100%|██████████| 100/100 [07:30<00:00,  4.50s/it]

[6, 13, 17, 18, 38, 48, 71, 80, 86, 89, 91, 99]





## Build Binary Classifiers

We now build the 100 binary classifers, one for each author

In [20]:
models = []
authors = np.arange(100)

for author in tqdm(authors):
            
    X_train, X_val, y_train, y_val = get_train_val_data(author, df_restr)
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    models.append(clf)

100%|██████████| 100/100 [07:37<00:00,  4.57s/it]


## Build Predictions

We now load in our `test.json` set to generate predictions. We first put the training data through the usual preprocessing pipeline and exclude vocabular that was not part of our `tf-idf` training lexicon

In [21]:
path = "test.json"
df_test = load_data_set(path)
df_test = preprocess(df_test, train=False)
df_test.head()

loaded 800 instances


Unnamed: 0,identifier,coauthors,year,venue,text,str text
0,0,"[16336, 1762, 4357, 12564]",19,0.47957,"[3207, 24, 1798, 1738, 37, 2375, 1568, 11, 53,...",3207 24 1798 1738 37 2375 1568 11 53 1584 1903...
1,1,"[21189, 14088]",19,0.47957,"[40, 1560, 1536, 1544, 1609, 1705, 1658, 1543,...",40 1560 1536 1544 1609 1705 1658 1543 52 11 33...
2,2,"[3625, 1198, 19889, 794, 2749, 7801]",19,0.015054,"[47, 1574, 1729, 1641, 11, 37, 2533, 2015, 47,...",47 1574 1729 1641 11 37 2533 2015 47 1930 1549...
3,3,"[19810, 15173, 5876, 111]",19,0.045161,"[1770, 53, 2054, 1549, 1529, 1723, 2796, 1547,...",1770 53 2054 1549 1529 1723 2796 1547 1543 47 ...
4,4,"[10932, 7668, 11907, 19601, 15307, 10492, 1049...",19,1.0,"[18, 1924, 23, 1544, 3927, 2686, 1543, 1535, 1...",18 1924 23 1544 3927 2686 1543 1535 1660 1548 ...


In [22]:
# this complicated looking lambda simply removes any words that were not part of our preprocessing. 
# failing to do so, would pass an unseen word to our tf-idf vectoriser and would crash our program
tfidf_features  = tfidf.get_feature_names_out()
df_test['text'] = df_test['text'].apply(lambda xs: list(filter((lambda x: str(x) in tfidf_features), xs)))
df_test["text"] = df_test["text"].apply(lambda xs: ''.join(str(x)+' ' for x in xs))

In [23]:
# now apply the tf-idf transformation to the text component
X_test = tfidf.transform(df_test['text'])
X_test = pd.DataFrame((X_test.todense().tolist()), columns=tfidf_features)

In [24]:
# now put everything back together
test = pd.concat([X_test, df_test["venue"], df_test["coauthors"], df_test["year"]], axis=1)
#plt.hist(test["year"], bins=np.arange(test["year"].min(), test["year"].max()))
#test = test.drop(["year"], axis=1)

In [25]:
def make_predictions(test_df: pd.DataFrame):
    """
    function for writing predictions to output file. 
    WARNING: Deletes predictions.csv if present in working directory
    """
    if os.path.exists("Results/predictionsyearLogistic.csv"):
        os.remove("Results/predictionsyearLogistic.csv")
        print("removed previous predictions")
    
    
    with open("Results/predictionsyearLogistic.csv", mode='w') as f:    
        writer = csv.writer(f)
        
        header = ['Id','Predict']
        writer.writerow(header)
        n      = X_test.shape[0]
        
        # loop over each training sample and write to necessary format
        for Id in tqdm(range(n)):
            
            # we need to keep x as a dataframe for this model so we can apply the collobartor mapping easily
            x   = test_df.iloc[Id].to_frame().T
            row = [Id]
            authors = ""
            
            for author, model in enumerate(models):
                # map number of collaborators for this given instance 
                X = x.copy(deep = True)
                collabs = collaborator_db[author]
                X["num collaborators"] = X["coauthors"].apply(lambda x: len(set(x).intersection(collabs)))
                X = X.drop(["coauthors"], axis=1)
                X = np.array(X).reshape(1, -1)
               
                if np.array(model.predict(X)).item() == 1:
                    authors += str(author) + " "

            # to match the output requirement 
            if len(authors) == 0: row = [Id, -1]
            else: row = [Id, authors]
            
            writer.writerow(row)
    return

In [26]:

test = test.drop(["year"], axis=1)
make_predictions(test)

100%|██████████| 800/800 [02:02<00:00,  6.55it/s]


In [27]:
len(df_restr)

7102

In [28]:
len(df_full)

7460

In [29]:
len(df_text)

7460