Instead of the 5000 - length text representation of text, we try a bins approach. Wh

In [84]:
import json
import csv
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from typing import List

RANDOM_STATE = 42069
NUM_WORDS = 5000
NUM_AUTHORS = 21246

In [85]:
def load_data_set(path: str):
    """
    loads data set located at path and returns as pandas data frame
    """
    with open(path) as file:
        data = json.load(file)
    
    print(f"loaded {len(data)} instances")
    data = pd.json_normalize(data)
    return data

In [86]:
path = "../data/train.json"
df = load_data_set(path)
df.head()

loaded 25793 instances


Unnamed: 0,authors,year,abstract,venue,title
0,"[42, 13720, 36]",9,"[2455, 1858, 2335, 1543, 1800, 1860, 2000, 286...",20.0,"[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1..."
1,"[1359, 15881, 45]",15,"[40, 1542, 1691, 2449, 1535, 3616, 2206, 1904,...",2.0,"[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3..."
2,"[19166, 17763]",17,"[40, 1542, 1691, 2449, 1535, 2610, 1543, 1535,...",,"[2085, 1719, 1846, 1745, 2243, 1553, 1606, 159..."
3,[97],10,"[46, 1624, 1547, 56, 1687, 1644, 6, 7, 3386, 1...",4.0,"[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5..."
4,"[19617, 2]",10,"[37, 3709, 3836, 1586, 2151, 1727, 3021, 1860,...",9.0,"[38, 1592, 2088, 1543, 1574, 1727, 1597, 1813,..."


**preprocessing**

In [90]:
def preprocess(df: pd.DataFrame, train=True, drop_samples=False, num_text_bins=10, num_auth_bins=10):
    """
    
    """
    # preprocessing for authors
    if train:
        df["target authors"] = df["authors"].apply(lambda x: filter_authors(x))
        df["coauthors"]      = df["authors"].apply(lambda x: filter_authors(x, prolifics=False))
        df = df.drop(["authors"], axis=1)
    
    if train and drop_samples:
        df["has target"] = df["target authors"].apply(lambda x: len(x)>0)
        df = df[df["has target"] == True]
        df = df.drop(["has target"], axis=1)
    
    # text transormation to bins
    df["text"] = df["title"] + df["abstract"]
    df["text"] = df["text"].apply(lambda x: build_bins(x, NUM_WORDS, num_text_bins))
    col_names = ["text bin "+str(i+1) for i in range(num_text_bins)]
    text_df = pd.DataFrame(df.text.tolist(), index=df.index, columns=col_names)
    
    
    # coauthor transformation to bins
    df["coauthors"] = df["coauthors"].apply(lambda x: build_bins(x, NUM_AUTHORS, num_auth_bins))
    col_names = ["auth bin "+ str(i+1) for i in range(num_auth_bins)]
    coauth_df = pd.DataFrame(df.coauthors.tolist(), index=df.index, columns=col_names)
    
    # venue transformation
    df.loc[df.venue == "", "venue"] = 465
    scalar = MinMaxScaler()
    df["venue"] = scalar.fit_transform(df["venue"].to_numpy().reshape(-1, 1))
    
    # recombine
    df = df.drop(["abstract", "title", "text", "year", "coauthors"], axis=1)
    df = pd.concat([df, text_df, coauth_df], axis=1)

    # drop id if test set:
    if not train:
        df = df.drop(["identifier"], axis=1)
    
    return df

In [91]:
# FEATURE TRANSFORMATIONS

def build_bins(text: List[int], interval_len, n_bins=10):
    """
    takes a list of text and returns n_bin-column data frame

    This might be some of the uggliest code I have ever written, though
    sklearn's discrete bins didn't really give what I wanted
    """
    width = np.ceil(interval_len / n_bins)
    bins = np.zeros(n_bins)
    for word in text:
        i = 0
        while not (max(0, (i - 1)) * width <= word < i * width):
            i += 1
        bins[i - 1] += 1
    return bins


def filter_authors(authors: List[int], prolifics=True):
    """
    filters authors between prolific and coauthors
    """
    if prolifics:
        prolifics = filter(lambda x: x < 100, authors)
        return list(prolifics)
    else:
        coauthors = filter(lambda x: x>=100, authors)
        return list(coauthors)

In [92]:
df = preprocess(df, num_text_bins=100)
df.head()

Unnamed: 0,venue,target authors,text bin 1,text bin 2,text bin 3,text bin 4,text bin 5,text bin 6,text bin 7,text bin 8,...,auth bin 1,auth bin 2,auth bin 3,auth bin 4,auth bin 5,auth bin 6,auth bin 7,auth bin 8,auth bin 9,auth bin 10
0,0.043011,"[42, 36]",9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.004301,[45],30.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,[],21.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,0.008602,[97],24.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.019355,[2],19.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


**Model Validation**

In [93]:
# RESAMPLING TECHNIQUES TO DEAL WITH CLASS IMBALANCE

def upsample_training(X_train, y_train):
    """
    upsamples the minority class until class balance is achieved
    """
    X = pd.concat([X_train, y_train], axis=1)
    
    
    pos = X[X["label"] == 1]
    neg = X[X["label"] == 0]
    
    pos_upsample = resample(pos, replace=True, n_samples=len(neg), random_state=RANDOM_STATE)
    
    resampled = pd.concat([neg, pos_upsample])

    y_train = resampled["label"]
    X_train = resampled.drop(["label"], axis=1)
    return X_train, y_train


def downsample_training(X_train, y_train):
    """
    downasamples majority class until class balance is achieved 
    """
    X = pd.concat([X_train, y_train], axis=1)
    
    
    pos = X[X["label"] == 1]
    neg = X[X["label"] == 0]
    
    neg_downsample = resample(neg, replace=True, n_samples=len(pos), random_state=RANDOM_STATE)
    
    resampled = pd.concat([pos, neg_downsample])

    y_train = resampled["label"]
    X_train = resampled.drop(["label"], axis=1)
    return X_train, y_train


def resample_training(X_train, y_train):
    """
    resamples class imbalance using SMOTE: 
    https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html
    """
    sm = SMOTE(random_state=RANDOM_STATE)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    return X_train, y_train

In [94]:
def build_evaluate_classifier(author: int, df:pd.DataFrame):
    # take copy and prepare label
    df = df.copy(deep=True)
    df["label"] = df["target authors"].apply(lambda x: 1 if author in x else 0)
    X = df.drop(["label", "target authors"], axis=1)
    y = df["label"]
    # split training and validation - we have fixed random state for reproducability
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)
    
    # upsample to deal with class imbalance
    X_train, y_train = upsample_training(X_train, y_train)
    
    # fit to model
    clf = LogisticRegression(max_iter=2000)
    clf.fit(X_train, y_train)
    
    # validatite model
    y_pred = clf.predict(X_val)
    f1 = f1_score(y_pred, y_val)
    precision = precision_score(y_pred, y_val)
    recall = recall_score(y_pred, y_val)
    return f1, precision, recall

In [95]:
def validate_to_csv(df: pd.DataFrame):
    """
    As we are building 100 classifiers, printing f1 scores within a notebook is impractical. 
    following function writes results to csv. 
    """
    
    with open("../Results/model 4 - upsample.csv", mode='w') as f:    
        writer = csv.writer(f)
        
        header = ['Author Id','F1 score']
        writer.writerow(header)
        
        # loop over each author, build classifier and write to output
        authors = np.arange(100)
        avg_f1, avg_recall, avg_precision = 0, 0, 0
        
        for author in tqdm(authors):
            f1, precision, recall = build_evaluate_classifier(author, df)
            writer.writerow([author, f1, precision, recall])
            avg_f1 += f1
            avg_precision += precision
            avg_recall += recall 
            
    print(f"average f1:        {avg_f1/100}")
    print(f"average recall:    {avg_recall/100}")
    print(f"average precision: {avg_precision/100}")
    return

In [96]:
validate_to_csv(df)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

average f1:        0.023249309808605626
average recall:    0.011982009374860379
average precision: 0.46725144401846774



