In [1]:
import json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy, os, csv
from tqdm import tqdm
import statistics
from collections import defaultdict as dd
from sklearn.linear_model import LogisticRegression  
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score 
from sklearn.preprocessing import MinMaxScaler
from typing import List

In [2]:
# PyTorch and standard-library imports used by the RNN classifier defined later in the notebook.
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import time
import math

In [3]:
#Note: Work in progress: do not run code yet!

In [4]:
import warnings
# Silence every FutureWarning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation notices raised by
# pandas / scikit-learn are hidden as well -- confirm that this is intended.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
#Read into a dataframe.

def load_data_set(path: str):
    """
    Read the JSON file at `path` and return its records as a flat pandas data frame.

    Prints how many top-level entries the file contains before flattening
    them with ``pd.json_normalize``.
    """
    with open(path) as handle:
        records = json.load(handle)

    n_records = len(records)
    print(f"loaded {n_records} instances")

    # json_normalize flattens the semi-structured records into one table.
    return pd.json_normalize(records)

In [6]:
# Feature transformations

def filter_authors(authors: List[int], prolifics=True):
    """
    Split a list of author ids at the id threshold of 100.

    authors: list of integer author ids
    prolifics: when truthy, keep the ids below 100; otherwise keep the ids
               that are 100 or above

    Returns a new list; the relative order of the ids is preserved.
    """
    if not prolifics:
        return [author_id for author_id in authors if author_id >= 100]
    return [author_id for author_id in authors if author_id < 100]
    
    
def text_to_vector(text: List[int], vocab_size: int = 5000):
    """
    Converts a list of word ids into a dense bag-of-words count vector.

    text: list of integer word ids, each expected to lie in [0, vocab_size)
    vocab_size: length of the returned vector; defaults to 5000, the value
                that used to be hard-coded here

    Returns a 1-D integer numpy array in which position ``w`` holds the number
    of times word id ``w`` occurs in ``text``.

    Raises IndexError when an id is >= vocab_size. Negative ids are not
    rejected: numpy indexing counts them from the end of the vector.
    """
    word_vec = np.zeros(vocab_size, dtype=int)

    # Plain loop (rather than fancy indexing) so repeated ids are each counted.
    for word in text:
        word_vec[word] += 1

    return word_vec

def statistics_feature(lst: List[int], length=True):
    """
    Returns a simple summary statistic of a list of numbers.

    lst: list of numbers (word ids in this notebook)
    length: when truthy return ``len(lst)``; otherwise return the sample
            standard deviation of ``lst``

    The standard deviation of a list with fewer than two elements is reported
    as 0.0: ``statistics.stdev`` raises StatisticsError for such input, which
    would abort a whole DataFrame ``apply`` because of one very short text.
    """
    if length:
        return len(lst)

    if len(lst) < 2:
        return 0.0

    return statistics.stdev(lst)

def pre_processing(df, train=True):
    """
    Turns a raw data frame (as returned by load_data_set) into model features.

    df: raw frame with at least the columns "title", "abstract", "venue" and
        "year". In train mode it must also have "authors"; in test mode it
        must already have "coauthors" and "identifier".
    train: when True, "authors" is split into "target authors" (ids < 100)
           and "coauthors" (ids >= 100); when False the "identifier" column
           is dropped from the result instead.

    Returns a NEW frame -- the caller's frame is left untouched -- containing:
      * "venue", "word_mean", "word_count", "word_spread": min-max scaled
      * "title", "abstract", "text": count vectors from text_to_vector
        ("text" is the element-wise sum of the other two)
      * "word_<i>": one column per entry of the "text" count vector
      * "coauthor_bin_<k>": the 10 co-author bins from build_bins
      * "target authors" (train mode only)

    The expanded columns carry string labels on purpose: unlabelled frames
    would both get integer labels starting at 0, which collide with each other
    and mix integer with string column names in one frame.

    NOTE(review): the scalers are fitted inside this call, so a train frame
    and a test frame are each scaled by their own min/max -- confirm this is
    acceptable for the models trained downstream.
    """
    # Work on a copy so that adding/overwriting columns never leaks into the
    # frame owned by the caller.
    df = df.copy()

    if train:
        df["target authors"] = df["authors"].apply(lambda ids: filter_authors(ids))
        df["coauthors"] = df["authors"].apply(lambda ids: filter_authors(ids, prolifics=False))
        df = df.drop(columns=["authors"])

    # Summary statistics over the raw word ids of title + abstract
    # (list concatenation, as both columns still hold Python lists here).
    combined_text = df["title"] + df["abstract"]
    df["word_mean"] = combined_text.apply(lambda ids: np.mean(ids))
    df["word_count"] = combined_text.apply(lambda ids: statistics_feature(ids))
    df["word_spread"] = combined_text.apply(lambda ids: statistics_feature(ids, length=False))

    # Bag-of-words vectors; adding two count vectors sums them element-wise.
    df["abstract"] = df["abstract"].apply(lambda ids: text_to_vector(ids))
    df["title"] = df["title"].apply(lambda ids: text_to_vector(ids))
    df["text"] = df["title"] + df["abstract"]
    text_df = pd.DataFrame(df["text"].tolist(), index=df.index).add_prefix("word_")

    # Blank venues get the dummy value 465 so that every row is numeric,
    # then each scalar feature is min-max scaled independently.
    df.loc[df["venue"] == "", "venue"] = 465
    for column in ("venue", "word_mean", "word_spread", "word_count"):
        values = df[column].to_numpy().reshape(-1, 1)
        df[column] = MinMaxScaler().fit_transform(values).ravel()

    # Co-authors are discretised into 10 bins of author ids.
    coauthor_bins = df["coauthors"].apply(lambda ids: build_bins(ids, n_bins=10))
    coauth_df = pd.DataFrame(coauthor_bins.tolist(), index=df.index).add_prefix("coauthor_bin_")

    # "abstract", "title" and "text" are kept: the RNN code reads one of them.
    df = df.drop(columns=["year", "coauthors"])

    df = pd.concat([df, text_df, coauth_df], axis=1)

    # The row identifier is not a feature; drop it for the test set.
    if not train:
        df = df.drop(columns=["identifier"])

    return df
    

def build_bins(coauthors: List[int], n_bins=10, max_id=21246):
    """
    Counts how many co-author ids fall into each of `n_bins` equal-width bins.

    coauthors: list of non-negative author ids
    n_bins: number of bins (must be >= 1)
    max_id: upper bound of the id range the bins are sized for; defaults to
            21246, the constant that used to be hard-coded here
            (presumably the size of the author-id space -- confirm)

    Returns a float numpy array of length `n_bins`. With
    ``width = ceil(max_id / n_bins)``, bin ``k`` counts the ids in
    ``(k * width, (k + 1) * width]``; id 0 is counted in bin 0 and ids beyond
    the last edge are counted in the last bin.

    Raises ValueError for n_bins < 1 or for a negative id.
    """
    if n_bins < 1:
        raise ValueError(f"n_bins must be at least 1, got {n_bins}")

    width = np.ceil(max_id / n_bins)
    bins = np.zeros(n_bins)

    for author in coauthors:
        if author < 0:
            raise ValueError(f"author ids must be non-negative, got {author}")

        # Upper edges are inclusive, hence ceil(...) - 1. The clamp keeps id 0
        # in the first bin (instead of wrapping to bins[-1]) and out-of-range
        # ids in the last bin (instead of running past the end of the array).
        index = int(np.ceil(author / width)) - 1
        index = min(max(index, 0), n_bins - 1)
        bins[index] += 1

    return bins
    
    

In [20]:
# Class labels of the binary per-author task (approach based on the Week 9
# Workshop code). constructSample turns a row's label into its position in
# this list, and run_training_function reads predicted labels back from it.
all_categories = [0, 1]

class RNNClassifier(nn.Module):
    """
    Simple recurrent classifier that consumes one input vector per step.

    At every step the input is concatenated with the previous hidden state,
    passed through a linear layer and tanh to give the new hidden state, and
    the new hidden state is mapped to log-probabilities over the classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        # [input, previous hidden] -> new hidden state
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        # new hidden state -> class scores
        self.h2o = nn.Linear(hidden_size, output_size)
        self.activation = nn.Tanh()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one step; returns (log-probabilities, new hidden state)."""
        joined = torch.cat((input, hidden), 1)
        next_hidden = self.activation(self.i2h(joined))
        log_probs = self.softmax(self.h2o(next_hidden))
        return log_probs, next_hidden

    def initHidden(self):
        """Return the all-zero hidden state of shape (1, hidden_size)."""
        return torch.zeros(1, self.hidden_size)


def constructSample(df, index_lst: int):
    """
    Build the RNN training tensors for the row of `df` labelled `index_lst`.

    Reads the row's 'abstract' (a sequence of integers) and 'label' columns.

    Returns (label, sequence, label_tensor, one_hot) where
      * label_tensor is a long tensor holding the position of the label in
        all_categories
      * one_hot has shape (len(sequence), 1, 5000) with a single 1 per step,
        at the position given by that step's integer

    NOTE(review): pre_processing replaces 'abstract' with a 5000-long count
    vector, so on its output it is each *count* (not each word id) that gets
    one-hot encoded here -- confirm that this is what the RNN should see.
    """
    n_val = 5000
    row_values = df.loc[index_lst, 'abstract']
    row_label = df.loc[index_lst, 'label']

    # Target class as an index into all_categories.
    label_tensor = torch.tensor([all_categories.index(row_label)], dtype=torch.long)

    # One (1 x n_val) one-hot slice per element of the sequence.
    one_hot = torch.zeros(len(row_values), 1, n_val)
    for position, value in enumerate(row_values):
        one_hot[position][0][value] = 1

    return row_label, row_values, label_tensor, one_hot

In [21]:
#Now train the data.
# training

def train_RNN_classifier(category_tensor, lst_tensor, rnn=None, learning_rate=0.005):
    """
    Runs one gradient-descent step of the RNN classifier on a single sample.
    (Adapted from the Week 9 Tutorial.)

    category_tensor: long tensor with the target class index, as built by
                     constructSample
    lst_tensor: tensor of shape (seq_len, 1, n_features); step ``i`` of the
                sequence is ``lst_tensor[i]``
    rnn: the model to update. When omitted, one shared model is created on
         the first call (sized from ``lst_tensor``, 32 hidden units, 2
         classes) and reused by every later call, so that successive samples
         train the same network instead of a freshly initialised one each
         time. Re-defining this function discards the shared model.
    learning_rate: step size of the plain SGD update

    Returns (output, loss): the log-probabilities after the last step of the
    sequence and the NLL loss as a Python float.

    Raises ValueError when the sequence is empty (there is no output to score).
    """
    n_hidden = 32
    no_categories = 2

    seq_len = lst_tensor.size()[0]
    if seq_len == 0:
        raise ValueError("cannot train on an empty sequence")

    if rnn is None:
        rnn = getattr(train_RNN_classifier, "_shared_rnn", None)
        if rnn is None:
            rnn = RNNClassifier(lst_tensor.size()[-1], n_hidden, no_categories)
            train_RNN_classifier._shared_rnn = rnn

    criterion = nn.NLLLoss()
    hidden = rnn.initHidden()
    rnn.zero_grad()

    # Feed the sequence step by step; only the final output is scored.
    for i in range(seq_len):
        output, hidden = rnn(lst_tensor[i], hidden)
    loss = criterion(output, category_tensor)
    loss.backward()

    # Plain SGD: add each gradient, scaled by -learning_rate, to its parameter.
    with torch.no_grad():
        for p in rnn.parameters():
            if p.grad is not None:
                p.add_(p.grad, alpha=-learning_rate)

    return output, loss.item()


def run_training_function(df, n_instances=5000, author=0):
    """
    Trains the RNN classifier for one author, one row at a time.
    (Adapted from the Week 9 Tutorial.)

    df: pre-processed training frame; needs the "target authors" column and
        whatever columns constructSample reads
    n_instances: maximum number of rows to train on; the first rows of the
                 frame are used and the count is capped at the frame length
    author: id of the author whose papers are labelled 1 (all others 0)

    Prints a progress line every 1000 rows (row number, guess, true label),
    then the average loss and the accuracy of the guesses made during
    training. Returns that accuracy (0.0 when no row was processed).
    """
    # Label a copy so the caller's frame is not modified.
    df = df.copy(deep=True)
    df["label"] = df["target authors"].apply(lambda ids: 1 if author in ids else 0)
    df = df.drop(['target authors'], axis=1)

    print_every = 1000
    correct = 0
    current_loss = 0.0

    # Iterate over actual index labels so a short frame cannot cause a KeyError.
    row_labels = df.index[:max(n_instances, 0)]

    for step, row_label in enumerate(row_labels):
        category, lst, category_tensor, lst_tensor = constructSample(df, row_label)

        output, loss = train_RNN_classifier(category_tensor, lst_tensor)
        current_loss += loss

        # Most likely class ("categoryFromOutput" in the Week 9 lab).
        top_1, top_ind = output.topk(1)
        category_index = top_ind[0].item()
        guess = all_categories[category_index]

        if guess == category:
            correct += 1

        # Progress is reported for right and wrong guesses alike.
        if step % print_every == 0:
            print('%d %s %s' % (step, guess, category))

    n_seen = len(row_labels)
    accuracy = correct / n_seen if n_seen else 0.0
    average_loss = current_loss / n_seen if n_seen else 0.0
    print("Average loss is: %.4f" % average_loss)
    print("Accuracy is: %.4f" % accuracy)
    return accuracy
            

def train_classifier(author: int, df: pd.DataFrame, debug=False, neg_sample_factor=10):
    """
    Trains a logistic-regression classifier for one author.
    Assumes pre_processing has already been applied to `df`.

    author: id of the author to recognise; rows whose "target authors" list
            contains it are the positive class
    df: pre-processed training frame (not modified)
    debug: when True, print the class sizes and the training accuracy / F1
    neg_sample_factor: roughly how many negative rows to keep per positive
                       row. Training on every negative gives a very
                       imbalanced set, while too few negatives makes the model
                       over-predict the author; tune with this factor.

    The raw text columns ("text", "abstract", "title") are dropped if present;
    every remaining column is used as a feature.

    Returns the fitted LogisticRegression.
    Raises ValueError when the author has no positive rows in `df`.
    """
    labels = df["target authors"].apply(lambda ids: 1 if author in ids else 0)

    # drop() returns a new frame, so adding "label" below cannot touch the
    # caller's data (and no separate deep copy is needed).
    df = df.drop(columns=['target authors', 'text', 'abstract', 'title'], errors="ignore")
    df["label"] = labels

    pos = df[df['label'] == 1]
    neg = df[df['label'] == 0]

    n_pos_samples = pos.shape[0]
    n_tot_samples = df.shape[0]

    if n_pos_samples == 0:
        raise ValueError(f"author {author} has no positive instances to train on")

    # Fraction of the negatives to keep. Capped at 1: DataFrame.sample rejects
    # frac > 1 unless sampling with replacement, which would otherwise happen
    # as soon as the positives exceed 1/neg_sample_factor of the rows.
    neg_frac = min(1.0, neg_sample_factor * (n_pos_samples / n_tot_samples))
    neg = neg.sample(frac=neg_frac, random_state=51)

    if debug:
        print(f"training on {pos.shape[0]} postitive instances")
        print(f"training on {neg.shape[0]} negative  instances")

    df = pd.concat([pos, neg])
    X_train = df.loc[:, df.columns != "label"]
    y_train = df["label"]

    if debug:
        print(f"training on {X_train.shape[0]} instances")

    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)

    if debug:
        # These are training-set scores, i.e. an optimistic sanity check only.
        y_train_pred = clf.predict(X_train)
        acc = accuracy_score(y_train, y_train_pred)
        f1 = f1_score(y_train, y_train_pred)
        print(f"Accuracy: {acc}")
        print(f"f1 score: {f1}")

    return clf

In [22]:
#Model validation output to file

In [23]:
#Model validation
def validate_model(author: int, df: pd.DataFrame, classifier):
    """
    Scores one author's classifier on a sample of `df`.

    author: id of the author the classifier was trained for
    df: pre-processed frame with a "target authors" column (not modified)
    classifier: fitted model exposing ``predict``

    Half of the positive rows and a tenth of the negative rows are sampled
    (with a fixed seed, so repeated runs agree). The raw text columns
    ("text", "abstract", "title") are dropped if present so the feature
    columns match the ones train_classifier fits on.

    Returns [accuracy, f1].

    NOTE(review): when `df` is the frame the classifier was trained on, these
    rows overlap the training data, so the scores are optimistic.
    """
    labels = df["target authors"].apply(lambda ids: 1 if author in ids else 0)

    # drop() returns a new frame, so the caller's data is never modified.
    df = df.drop(columns=['target authors', 'text', 'abstract', 'title'], errors="ignore")
    df["label"] = labels

    # Sample the two classes separately; negatives vastly outnumber positives.
    pos = df[df['label'] == 1].sample(frac=(1/2), random_state=51)
    neg = df[df['label'] == 0].sample(frac=(1/10), random_state=51)

    df = pd.concat([pos, neg])
    X_test = df.loc[:, df.columns != "label"]
    y_test = df["label"]

    y_pred = classifier.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    acc = accuracy_score(y_test, y_pred)
    # average="samples" (used by the leaderboard, discussion question #184)
    # is only available for multilabel targets, so binary F1 is reported here.
    f1 = f1_score(y_test, y_pred)
    return [acc, f1]

In [11]:
def write_validation_performance(file_name, df=None, n_authors=100):
    """
    Trains and validates one classifier per author and writes the scores to a CSV file.
    WARNING: Overwrites `file_name` if it is already present.

    file_name: path of the CSV file to write (header: Id, Accuracy, F1; one
               row per author and a final "aver" row with the means)
    df: pre-processed training frame. When omitted, the notebook-level
        variable ``df`` is used, which is what this function always relied on.
    n_authors: classifiers are trained for author ids 0 .. n_authors - 1

    Returns the list of trained models, indexed by author id.
    Raises NameError when `df` is omitted and no global ``df`` exists.
    """
    if df is None:
        if "df" not in globals():
            raise NameError("no data frame given and no global `df` is defined")
        df = globals()["df"]

    if os.path.exists(file_name):
        os.remove(file_name)
        print("removed previous validations")

    authors = np.arange(0, n_authors)
    models = []
    sum_accuracy = 0
    sum_F1 = 0

    # newline='' is what the csv module asks for; without it extra blank
    # lines can appear between rows on platforms that translate "\n".
    with open(file_name, mode='w', newline='') as f0:
        write_valid = csv.writer(f0)
        write_valid.writerow(['Id', 'Accuracy', 'F1'])

        for i in tqdm(authors):
            model = train_classifier(i, df, debug=False)
            models.append(model)

            accuracy, f1 = validate_model(i, df, model)
            sum_accuracy += accuracy
            sum_F1 += f1
            write_valid.writerow([i, accuracy, f1])

        n_models = len(authors)
        mean_accuracy = sum_accuracy / n_models if n_models else 0.0
        mean_F1 = sum_F1 / n_models if n_models else 0.0

        write_valid.writerow(["aver", mean_accuracy, mean_F1])
        print(f"Accuracy: {mean_accuracy}")
        print(f"f1 score: {mean_F1}")

    return models


In [12]:
def make_predictions(test_df: pd.DataFrame, models):
    """
    Writes the predicted authors of every test row to predictions.csv.
    WARNING: Overwrites predictions.csv if present in the working directory.

    test_df: pre-processed test frame. The raw text columns ("text",
             "abstract", "title") are dropped if present, so the features
             match the ones train_classifier fits on.
    models: list of fitted classifiers; the position of a model in the list
            is the author id it predicts

    Each output row is the row's position in `test_df` followed by the ids of
    all authors predicted for it, or by -1 when no author was predicted.
    """
    if os.path.exists("predictions.csv"):
        os.remove("predictions.csv")
        print("removed previous predictions")

    features = test_df.drop(columns=['text', 'abstract', 'title'], errors="ignore").to_numpy()
    n = features.shape[0]

    # One batched predict per model instead of one call per (row, model) pair.
    # Done before the file is opened so a failure leaves no half-written file.
    per_model = [np.asarray(model.predict(features)).ravel() for model in models] if n else []

    # newline='' is what the csv module asks for; without it extra blank
    # lines can appear between rows on platforms that translate "\n".
    with open("predictions.csv", mode='w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Id', 'Predicted'])

        # loop over each test sample and write it in the required format
        for Id in tqdm(range(n)):
            authors = [author for author, flags in enumerate(per_model) if flags[Id] == 1]
            # -1 marks "no author predicted", to match the output requirement
            writer.writerow([Id] + (authors if authors else [-1]))
    return

In [18]:
# Raw read of the training file; data_f is not referenced again in this cell.
data_f = pd.read_json("train.json")

# Load the training data and turn it into model features.
path = "train.json"
df = load_data_set(path)
df = pre_processing(df)

# Logistic-regression pipeline (currently disabled): one model per author,
# with validation scores written to a CSV file.
#file_name = "validation.csv"
#models = write_validation_performance(file_name)

# RNN experiment: run the training loop on the pre-processed training frame.
run_training_function(df)

# Test-set predictions (currently disabled); needs `models` from the
# validation step above.
#path = "test.json"
#df_test = load_data_set(path)
#df_test = pre_processing(df_test, train=False)

#make_predictions(df_test, models)