In [1]:
import json
import os
import csv
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from typing import List

# Notebook-wide configuration constants.
NUM_WORDS = 5000  # presumably the size of the integer word vocabulary — TODO confirm; not referenced in the visible cells
NUM_AUTHORS = 21246  # same value that build_bins divides into coauthor bins
MAX_LEN = 250  # NOTE(review): not referenced in the visible cells — confirm intended use
RANDOM_STATE = 42069  # seed handed to train_test_split, resample and SMOTE for reproducibility

In [2]:
def load_data_set(path: str) -> pd.DataFrame:
    """
    Loads the JSON data set located at path and returns it as a pandas data frame.

    The file is parsed with json.load and flattened with pd.json_normalize,
    so each record becomes one row. The number of loaded records is printed.

    Parameters
    ----------
    path : location of the JSON file to read.

    Returns
    -------
    pd.DataFrame with one row per record in the file.
    """
    # Read explicitly as UTF-8: without an encoding argument open() falls back
    # to the platform default, which is not UTF-8 on every system.
    with open(path, encoding="utf-8") as file:
        data = json.load(file)

    print(f"loaded {len(data)} instances")
    return pd.json_normalize(data)

In [3]:
# Load the training set from its JSON file and preview the first rows.
path = "../data/train.json"
train = load_data_set(path)
train.head()

loaded 25793 instances


Unnamed: 0,authors,year,abstract,venue,title
0,"[42, 13720, 36]",9,"[2455, 1858, 2335, 1543, 1800, 1860, 2000, 286...",20.0,"[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1..."
1,"[1359, 15881, 45]",15,"[40, 1542, 1691, 2449, 1535, 3616, 2206, 1904,...",2.0,"[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3..."
2,"[19166, 17763]",17,"[40, 1542, 1691, 2449, 1535, 2610, 1543, 1535,...",,"[2085, 1719, 1846, 1745, 2243, 1553, 1606, 159..."
3,[97],10,"[46, 1624, 1547, 56, 1687, 1644, 6, 7, 3386, 1...",4.0,"[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5..."
4,"[19617, 2]",10,"[37, 3709, 3836, 1586, 2151, 1727, 3021, 1860,...",9.0,"[38, 1592, 2088, 1543, 1574, 1727, 1597, 1813,..."


In [7]:
def preprocess(df: pd.DataFrame, train=True, drop_samples=False, n_bins=10, blank_venue=465):
    """
    Builds the modelling frame from a raw data set frame.

    All work is done on a deep copy, so the caller's frame is not modified.

    Parameters
    ----------
    df : raw frame holding "title", "abstract", "venue" and "year" columns,
        plus "authors" when train=True. When train=False a "coauthors"
        column must already be present.
    train : when True, "authors" is split with filter_authors into
        "target authors" and "coauthors", and "authors" is dropped.
    drop_samples : when True, rows whose "target authors" list is empty are
        removed. Needs the "target authors" column, i.e. train=True.
    n_bins : number of coauthor bins, forwarded to build_bins.
    blank_venue : dummy value written into rows whose venue is missing.

    Returns
    -------
    pd.DataFrame with "abstract", "title" and "year" removed, the venue
    min-max scaled, "coauthors" replaced by per-bin counts, and two added
    columns: "text" (title + abstract token list) and "str text" (the same
    tokens as a space-separated string).
    """
    df = df.copy(deep=True)

    if train:
        df["target authors"] = df["authors"].apply(filter_authors)
        df["coauthors"] = df["authors"].apply(lambda ids: filter_authors(ids, prolifics=False))
        df = df.drop(columns=["authors"])

    # drops samples containing no prolific authors, reduces training set by ~60% to 7000 samples
    if drop_samples:
        has_target = df["target authors"].apply(len) > 0
        # explicit copy so the column assignments below act on an independent frame
        df = df[has_target].copy()

    # text transformation
    # we stringify the list of ints to be used as input to the TF-IDF vectoriser
    df["text"] = df["title"] + df["abstract"]
    df["str text"] = df["text"].apply(lambda xs: ''.join(str(x)+' ' for x in xs))

    # preprocessing for venue. We use minmax scaling as a matter of best practice.
    # Every row needs a numeric venue, so missing venues (empty string or
    # NaN/None) receive the dummy value.
    missing_venue = df["venue"].isna() | (df["venue"] == "")
    df.loc[missing_venue, "venue"] = blank_venue
    # NOTE(review): the scaler is fitted on whatever frame is passed in, so a
    # separately preprocessed frame is scaled by its own min/max.
    scaler = MinMaxScaler()
    venue_values = df["venue"].to_numpy(dtype=float).reshape(-1, 1)
    df["venue"] = scaler.fit_transform(venue_values).ravel()

    # preprocessing for coauthors
    # we use a discretised binning strategy: each list becomes per-bin counts
    df["coauthors"] = df["coauthors"].apply(lambda ids: build_bins(ids, n_bins=n_bins))

    # drop the raw columns that are no longer needed
    return df.drop(columns=["abstract", "title", "year"])

In [8]:
# feature transformation

def filter_authors(authors: List[int], prolifics=True):
    """
    Splits a list of author ids at the id threshold of 100.

    With prolifics truthy the ids below 100 (the prolific authors) are
    returned; otherwise the ids of 100 and above (the coauthors) are
    returned. Input order is kept and a new list is always produced.
    """
    if prolifics:
        return [author for author in authors if author < 100]
    return [author for author in authors if author >= 100]
    
def build_bins(coauthors: List[int], n_bins=10, n_authors=21246):
    """
    Counts how many coauthor ids fall into each of n_bins equal-width id ranges.

    The id space is cut into bins of width ceil(n_authors / n_bins). Bin k
    holds ids in (k*width, (k+1)*width]; id 0 is counted in the first bin.

    Parameters
    ----------
    coauthors : author ids to count.
    n_bins : number of bins.
    n_authors : size of the author id space; the default mirrors NUM_AUTHORS.

    Returns
    -------
    numpy array of length n_bins holding the per-bin counts as floats.

    Raises
    ------
    ValueError : if an id is negative.
    IndexError : if an id lies beyond the last bin.
    """
    width = -(-n_authors // n_bins)  # integer ceiling division
    bins = np.zeros(n_bins)
    for author in coauthors:
        if author < 0:
            raise ValueError(f"author id must be non-negative, got {author}")
        # ceil(author / width) - 1 places an exact multiple of width in the
        # lower bin; max() keeps id 0 in the first bin rather than letting
        # index -1 wrap around to the last one.
        bins[max(0, int(np.ceil(author / width)) - 1)] += 1
    return bins

In [9]:
# Build the training frame, keeping only the papers that have at least one
# target author (drop_samples=True), and preview the result.
df = preprocess(train, drop_samples=True)
df.head()

Unnamed: 0,venue,target authors,coauthors,text,str text
0,0.043011,"[42, 36]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1...",41 1550 1563 1594 1544 1919 1644 37 1539 1715 ...
1,0.004301,[45],"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3...",1731 47 11 57 4624 1525 1535 47 11 3522 2223 1...
3,0.008602,[97],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5...",40 1733 1735 1540 1655 46 1624 1547 56 1687 16...
4,0.019355,[2],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[38, 1592, 2088, 1543, 1574, 1727, 1597, 1813,...",38 1592 2088 1543 1574 1727 1597 1813 1926 152...
9,0.0,"[44, 2]","[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[1560, 1694, 11, 1546, 11, 3066, 1728, 47, 160...",1560 1694 11 1546 11 3066 1728 47 1603 1553 11...


In [None]:
# Row/column count of the preprocessed training frame.
df.shape

### TF-IDF

TF-IDF is an NLP preprocessing method that maps text input to a vector of reals. It improves on the feature engineering we performed previously because it adjusts the value of each word relative to how frequently it occurs. Stop words such as *the, it, how* therefore receive relatively low weightings, so the resulting vector emphasises the most *important* words within the input. This (typically) leads to TF-IDF representations outperforming word-count representations for most tasks.

We use `sklearn`'s feature extraction to automate this process. 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The documents are space-separated integer word ids. The vectoriser's default
# token_pattern (r"(?u)\b\w\w+\b") only keeps tokens of two or more characters,
# which would silently discard every single-digit word id, so one-character
# tokens are accepted explicitly.
tfidf   = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
vectors = tfidf.fit_transform(df["str text"])

In [None]:
# Materialise the sparse TF-IDF matrix as a dense frame with one column per
# vocabulary term. toarray() yields a plain ndarray that pandas can wrap
# directly, avoiding the np.matrix -> nested Python list round trip.
feature_names = tfidf.get_feature_names_out()
dense = vectors.toarray()
df_text = pd.DataFrame(dense, columns=feature_names)

In [None]:
# Preview the dense TF-IDF feature table.
df_text.head()

In [None]:
# Attach the target-author lists to the TF-IDF features.
# Both indexes are reset so the frames line up by position when concatenated
# column-wise: df still carries its pre-filter row labels, whereas df_text was
# built with a fresh 0..n-1 index.
df_auth = df["target authors"]
df_full = pd.concat([df_text.reset_index(drop=True), df_auth.reset_index(drop=True)], axis=1)
df_full.head()

In [None]:
# Row/column count of the combined feature + target table.
df_full.shape

### Model Fitting

Let's first assess results for a single author.

In [None]:
def upsample_training(X_train, y_train):
    """
    Balances a training split by oversampling the rows labelled 1.

    Features and labels are joined column-wise; the rows whose "label" is 1
    are then drawn with replacement (seeded with RANDOM_STATE) until there
    are as many of them as rows whose "label" is 0. The result holds every
    0-row followed by the resampled 1-rows.

    y_train must arrive as a column named "label" once joined onto X_train.

    Returns the rebalanced (X_train, y_train) pair.
    """
    combined = pd.concat([X_train, y_train], axis=1)

    negatives = combined[combined["label"] == 0]
    positives = combined[combined["label"] == 1]

    positives_resampled = resample(
        positives,
        replace=True,
        n_samples=len(negatives),
        random_state=RANDOM_STATE,
    )

    balanced = pd.concat([negatives, positives_resampled])
    return balanced.drop(columns=["label"]), balanced["label"]


def resample_training(X_train, y_train):
    """
    Rebalances the classes of a training set with SMOTE, seeded with RANDOM_STATE.

    Reference:
    https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html

    Returns the resampled (X_train, y_train) pair.
    """
    sampler = SMOTE(random_state=RANDOM_STATE)
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    return X_resampled, y_resampled

In [None]:
def get_train_val_data(author: int, df:pd.DataFrame):
    """
    Prepares a one-vs-rest train/validation split for a single author.

    A binary "label" is derived on a deep copy of df (1 when author appears
    in the row's "target authors" list, else 0). The features are every
    column except "label" and "target authors". 10% of the rows are held out
    for validation with a fixed seed, and only the training part is then
    rebalanced with upsample_training.

    NOTE(review): the split is not stratified, so the validation part may
    contain few or no positive rows for a rare author.

    Returns X_train, X_val, y_train, y_val.
    """
    frame = df.copy(deep=True)
    frame["label"] = frame["target authors"].apply(
        lambda paper_authors: 1 if author in paper_authors else 0
    )
    labels = frame["label"]
    features = frame.drop(["label", "target authors"], axis=1)

    # fixed random state keeps the split reproducible
    X_train, X_val, y_train, y_val = train_test_split(
        features, labels, test_size=0.1, random_state=RANDOM_STATE
    )

    # rebalance the training part only; validation keeps its natural class ratio
    X_train, y_train = upsample_training(X_train, y_train)

    return X_train, X_val, y_train, y_val

In [None]:
# Build a balanced training split and a held-out validation split for a
# single target author; the cells below fit candidate models on it.
test_author = 25
data        = df_full
X_train, X_val, y_train, y_val = get_train_val_data(test_author, data)

In [None]:
# Class counts in the training split after upsampling.
y_train.value_counts()

In [None]:
# Class counts in the validation split, which is not resampled.
y_val.value_counts()

In [None]:
# L1-regularised logistic regression for the single test author.
# The liblinear solver uses randomness internally, so it is seeded to keep
# the fitted coefficients reproducible across runs.
clf = LogisticRegression(penalty='l1', solver='liblinear', random_state=RANDOM_STATE)
clf.fit(X_train, y_train)

In [None]:
# Predict validation labels with the fitted logistic regression.
y_preds = clf.predict(X_val)

In [None]:
# Distinct labels the model actually predicted on the validation split.
np.unique(y_preds)

In [None]:
# sklearn metrics expect (y_true, y_pred). In this order the confusion matrix
# has the true labels as rows and the predicted labels as columns.
print(f"accuracy: {accuracy_score(y_val, y_preds)}")
print(f"f1      : {f1_score(y_val, y_preds)}")
print("confusion matrix:")
print(pd.DataFrame(confusion_matrix(y_val, y_preds)))

In [None]:
# Random forest baseline; seeded so the bootstrap samples and feature
# subsets, and therefore the scores below, are reproducible.
clf3 = RandomForestClassifier(n_estimators=1000, max_depth=6, random_state=RANDOM_STATE)
clf3.fit(X_train, y_train)

In [None]:
# Score the random forest on the validation split.
# sklearn metrics expect (y_true, y_pred). In this order the confusion matrix
# has the true labels as rows and the predicted labels as columns.
y_preds = clf3.predict(X_val)
print(f"accuracy: {accuracy_score(y_val, y_preds)}")
print(f"f1      : {f1_score(y_val, y_preds)}")
print("confusion matrix:")
print(pd.DataFrame(confusion_matrix(y_val, y_preds)))

In [None]:
# Gradient boosting baseline; seeded so repeated runs give the same model.
clf4 = GradientBoostingClassifier(random_state=RANDOM_STATE)
clf4.fit(X_train, y_train)

In [None]:
# Score the gradient boosting model on the validation split.
# sklearn metrics expect (y_true, y_pred). In this order the confusion matrix
# has the true labels as rows and the predicted labels as columns.
y_preds = clf4.predict(X_val)
print(f"accuracy: {accuracy_score(y_val, y_preds)}")
print(f"f1      : {f1_score(y_val, y_preds)}")
print("confusion matrix:")
print(pd.DataFrame(confusion_matrix(y_val, y_preds)))

## Building the binary classifiers

In [None]:
def build_classifier(author: int, df: pd.DataFrame):
    """
    Fits a one-vs-rest logistic regression for a single author.

    Parameters
    ----------
    author : id of the author the classifier should recognise.
    df : frame of feature columns plus a "target authors" column holding
        the list of target author ids of every row. It is not modified.

    Returns
    -------
    The fitted LogisticRegression (L1 penalty, liblinear solver), trained
    on the full frame after SMOTE rebalancing via resample_training.
    """
    # Derive labels and features directly. drop() already returns a new
    # frame, so no deep copy of the whole feature table is needed per author.
    y = df["target authors"].apply(lambda x: 1 if author in x else 0).rename("label")
    X = df.drop(columns=["target authors", "label"], errors="ignore")

    X, y = resample_training(X, y)

    # seeded: the liblinear solver uses randomness internally
    clf = LogisticRegression(penalty='l1', solver='liblinear', random_state=RANDOM_STATE)
    clf.fit(X, y)
    return clf

In [None]:
# Train one binary classifier per target author id (0..99).
# models[i] is the classifier for author id i.
authors = np.arange(100)
models = [build_classifier(author, df_full) for author in tqdm(authors)]

## Building the predictions

In [None]:
# Load the test set and concatenate the title and abstract token lists into
# a single "text" column.
# NOTE: this rebinds `path`, which previously pointed at the training file.
path = "../data/test.json"
df_test = load_data_set(path)
df_test["text"] = df_test["title"] + df_test["abstract"]
df_test.head()

In [None]:
# we now need to filter out the text column based on our TF-IDF features
tfidf_features = tfidf.get_feature_names_out()
tfidf_features

In [None]:
# this complicated looking lambda simply removes any words that were not part of our preprocessing. 
# failing to do so, would pass an unseen word to our tf-idf vectoriser and would crash our program
df_test['text'] = df_test['text'].apply(lambda xs: list(filter((lambda x: str(x) in tfidf_features), xs)))
df_test["text"] = df_test["text"].apply(lambda xs: ''.join(str(x)+' ' for x in xs))

In [None]:
# Vectorise the test text with the vocabulary fitted on the training data and
# expose it as a dense frame with one column per vocabulary term.
# toarray() avoids the np.matrix -> nested Python list round trip.
X_test = tfidf.transform(df_test['text'])
X_test = pd.DataFrame(X_test.toarray(), columns=tfidf_features)

In [None]:
def make_predictions(test_df: pd.DataFrame):
    """
    Writes the predicted author ids for every test row to predictions.csv.
    WARNING: Deletes predictions.csv if present in working directory

    Each classifier in the global `models` list is applied to test_df; the
    position of a model in that list is the author id it stands for.

    Parameters
    ----------
    test_df : feature matrix with one row per test sample, laid out like the
        features the models were fitted on.

    Output format: a header row "Id,Predict", then one row per sample holding
    its row number and the predicted author ids, each followed by a space,
    or -1 when no model predicts the positive class.
    """
    if os.path.exists("predictions.csv"):
        os.remove("predictions.csv")
        print("removed previous predictions")

    n = test_df.shape[0]

    # One batched predict per model instead of one call per row per model.
    # votes[i, a] is True when the model for author a flags test row i.
    votes = np.zeros((n, len(models)), dtype=bool)
    if n:
        for author, model in enumerate(tqdm(models)):
            votes[:, author] = np.asarray(model.predict(test_df)).ravel() == 1

    # newline='' lets the csv module control line endings, as its documentation
    # requires; otherwise extra blank rows can appear on some platforms.
    with open("predictions.csv", mode='w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Id', 'Predict'])

        # loop over each test sample and write it in the required format
        for Id in range(n):
            authors = "".join(f"{author} " for author in np.flatnonzero(votes[Id]))
            # to match the output requirement: -1 marks "no author predicted"
            writer.writerow([Id, authors if authors else -1])

In [None]:
# Write the predictions for the test feature matrix to predictions.csv.
make_predictions(X_test)

In [None]:
# Display the test feature matrix.
X_test