In [1]:
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from typing import List

NUM_WORDS = 5001
PADDING   = 5000    # special char set as padding
NUM_AUTHORS = 21246
MAX_LEN = 250
RANDOM_STATE = 42069

In [2]:
def load_data_set(path: str):
    """
    loads data set located at path and returns as pandas data frame
    """
    with open(path) as file:
        data = json.load(file)
    
    print(f"loaded {len(data)} instances")
    data = pd.json_normalize(data)
    return data

In [3]:
path = "../data/train.json"
train = load_data_set(path)
train.head()

loaded 25793 instances


Unnamed: 0,authors,year,abstract,venue,title
0,"[42, 13720, 36]",9,"[2455, 1858, 2335, 1543, 1800, 1860, 2000, 286...",20.0,"[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1..."
1,"[1359, 15881, 45]",15,"[40, 1542, 1691, 2449, 1535, 3616, 2206, 1904,...",2.0,"[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3..."
2,"[19166, 17763]",17,"[40, 1542, 1691, 2449, 1535, 2610, 1543, 1535,...",,"[2085, 1719, 1846, 1745, 2243, 1553, 1606, 159..."
3,[97],10,"[46, 1624, 1547, 56, 1687, 1644, 6, 7, 3386, 1...",4.0,"[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5..."
4,"[19617, 2]",10,"[37, 3709, 3836, 1586, 2151, 1727, 3021, 1860,...",9.0,"[38, 1592, 2088, 1543, 1574, 1727, 1597, 1813,..."


## Preprocess

In [4]:
def preprocess(df: pd.DataFrame, train=True, drop_samples=False):
    
    df = df.copy(deep=True)
   
    if train:
        df["target authors"] = df["authors"].apply(lambda x: filter_authors(x))
        df["coauthors"]      = df["authors"].apply(lambda x: filter_authors(x, prolifics=False))
        df = df.drop(["authors"], axis=1)
    
    # drops samples containing no prolific authors, Reduces training set by ~60% to 7000 samples
    if drop_samples:
        df["has target"] = df["target authors"].apply(lambda x: len(x)>0)
        df = df[df["has target"] == True]
        df = df.drop(["has target"], axis=1)
        
    # text transormation
    df["text"] = df["title"] + df["abstract"]
    df["text"] = df["text"].apply(lambda x: pad_sequence(x, MAX_LEN))
    
    # drop unnecessarily long examples
    if train:
        df["len"] = df["text"].apply(lambda x: len(x))
        df = df[df["len"]<=MAX_LEN]

    # drop
    df = df.drop(["abstract", "title", "year", "coauthors", "venue", "len"], axis=1)
    return df

In [5]:
# feature transformation

def filter_authors(authors: List[int], prolifics=True):
    """
    filters authors between prolific and coauthors
    """
    if prolifics:
        prolifics = filter(lambda x: x < 100, authors)
        return list(prolifics)
    else:
        coauthors = filter(lambda x: x>=100, authors)
        return list(coauthors)
    
    
def pad_sequence(sequence: List, max_len: int):
    return sequence + [PADDING] * (max_len - len(sequence))

In [6]:
df = preprocess(train, drop_samples=True)
df.head()

Unnamed: 0,target authors,text
0,"[42, 36]","[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1..."
1,[45],"[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3..."
3,[97],"[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5..."
4,[2],"[38, 1592, 2088, 1543, 1574, 1727, 1597, 1813,..."
9,"[44, 2]","[1560, 1694, 11, 1546, 11, 3066, 1728, 47, 160..."


In [7]:
df.shape

(6690, 2)

## Training - Validation Split

In [8]:
def training_eval_split(author: int, df: pd.DataFrame, resampling=None):
    df = df.copy(deep=True)
    df["label"] = df["target authors"].apply(lambda x: 1 if author in x else 0)
    
    X, y = df["text"], df["label"]
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)
    
    if resampling:
        X_train, y_train = resampling(X_train, y_train)
        X_train, y_train = X_train.to_numpy(), y_train.to_numpy()
    
    
    return X_train, X_val, y_train, y_val

In [9]:
# creating training - test split for a single author for prototyping
test_auth = 69
X_train, X_val, y_train, y_val = training_eval_split(test_auth, df)

In [10]:
X_val

1739     [37, 1554, 2461, 1556, 4129, 1544, 2625, 1787,...
1101     [11, 3717, 1623, 1575, 1542, 2125, 1547, 2174,...
9351     [46, 1670, 1547, 1549, 4905, 1569, 1820, 1543,...
681      [4284, 3231, 2445, 1553, 2258, 11, 1669, 2771,...
24205    [1560, 40, 2148, 11, 1994, 24, 3228, 1549, 425...
                               ...                        
2130     [47, 1603, 1538, 3591, 1533, 1780, 1708, 1539,...
6782     [2600, 3057, 1546, 2590, 1608, 11, 3580, 3929,...
369      [1779, 1660, 1747, 34, 24, 47, 1751, 4326, 155...
17249    [1542, 2667, 1854, 3591, 1747, 1671, 1575, 160...
22618    [40, 2211, 37, 1596, 1567, 1541, 1708, 1704, 1...
Name: text, Length: 2007, dtype: object

In [11]:
X_train

3128     [46, 1574, 1660, 1661, 2796, 1547, 1543, 1830,...
11800    [3390, 11, 1564, 1661, 2193, 41, 3949, 1745, 5...
14967    [11, 37, 1740, 1623, 1621, 50, 1620, 1632, 57,...
14553    [48, 1559, 2379, 1826, 33, 37, 2085, 11, 57, 4...
5338     [11, 37, 3516, 24, 57, 46, 1899, 1567, 46, 167...
                               ...                        
1342     [38, 2105, 1551, 37, 1766, 1553, 46, 2941, 50,...
9814     [31, 11, 24, 37, 39, 3015, 1553, 2082, 1545, 1...
18813    [1539, 1551, 3485, 1573, 1715, 1553, 47, 2324,...
14540    [47, 1806, 46, 1991, 1553, 54, 1551, 41, 1570,...
5369     [1784, 1560, 2446, 1527, 11, 1560, 2197, 1538,...
Name: text, Length: 4683, dtype: object

## Word Embedding

This model is based off of the post at: https://medium.com/analytics-vidhya/multiclass-text-classification-using-deep-learning-f25b4b1010e5
within, the author uses `GloVE` word-embedding which we replicate here. 

This matrix is used as an embedding matrix for our neural networks to act as a sort of feature normalisation

In [12]:
def load_glove(word_index):
    EMBEDDING_FILE = '../GloVE/glove.840B.300d.txt'

    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')[:300]

    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))
    all_embs = np.stack(embeddings_index.values())
    emb_mean, emb_std = -0.005838499, 0.48782197
    embed_size = all_embs.shape[1]
    nb_words = min(MAX_FEATURES, len(word_index)+1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in tqdm(word_index.items()):
        if i >= MAX_FEATURES: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix


word_index = {}
for i in range(5001):
    word_index[i] = i

MAX_FEATURES = NUM_WORDS
embedding_matrix = load_glove(word_index)

  exec(code_obj, self.user_global_ns, self.user_ns)
100%|██████████| 5001/5001 [00:00<00:00, 120404.07it/s]


In [13]:
embedding_matrix.shape

(5001, 300)

## Deep Learning Models

In [14]:
class CNN_Text(nn.Module):    
    def __init__(self):
        super(CNN_Text, self).__init__()
        filter_sizes = [1,2,3,5]
        num_filters = 36
        n_classes = len(le.classes_)
        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.convs1 = nn.ModuleList([nn.Conv2d(1, num_filters, (K, embed_size)) for K in filter_sizes])
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(len(filter_sizes)*num_filters, n_classes)
def forward(self, x):
        x = self.embedding(x)  
        x = x.unsqueeze(1)  
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  
        x = torch.cat(x, 1)
        x = self.dropout(x)  
        logit = self.fc1(x)
        return logit

In [15]:
class BiLSTM(nn.Module):
    def __init__(self):
        super(BiLSTM, self).__init__()
        self.hidden_size = 64
        drp = 0.1
        n_classes = 2
        self.embedding = nn.Embedding(NUM_WORDS, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(embed_size, self.hidden_size, bidirectional=True, batch_first=True)
        self.linear = nn.Linear(self.hidden_size*4 , 64)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(drp)
        self.out = nn.Linear(64, n_classes)

    def forward(self, x):
        h_embedding = self.embedding(x)
        h_lstm, _ = self.lstm(h_embedding)
        avg_pool = torch.mean(h_lstm, 1)
        max_pool, _ = torch.max(h_lstm, 1)
        conc = torch.cat(( avg_pool, max_pool), 1)
        conc = self.relu(self.linear(conc))
        conc = self.dropout(conc)
        out = self.out(conc)
        return out

## Training & Fitting

In [26]:
import time
n_epochs = 6
batch_size = 50

embed_size = 300
model = BiLSTM()
loss_fn = nn.CrossEntropyLoss(reduction='sum')
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001)


# convert series to lists
X_val_list = X_val.to_list()
y_val_list = y_val.to_list()

# Load train and test in CUDA Memory
x_train = torch.tensor(X_train, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.long)
x_cv = torch.tensor(X_val_list, dtype=torch.long)
y_cv = torch.tensor(y_val_list, dtype=torch.long)

# Create Torch datasets
train = torch.utils.data.TensorDataset(x_train, y_train)
valid = torch.utils.data.TensorDataset(x_cv, y_cv)

# Create Data Loaders
train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)
train_loss = []
valid_loss = []
for epoch in range(n_epochs):
    start_time = time.time()
    # Set model to train configuration
    model.train()
    avg_loss = 0.  
    for i, (x_batch, y_batch) in tqdm(enumerate(train_loader)):
        # Predict/Forward Pass
        y_pred = model(x_batch)
        # Compute loss
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item() / len(train_loader)
    
    # Set model to validation configuration - Doesn't get trained here
    model.eval()        
    avg_val_loss = 0.
    val_preds = np.zeros((len(x_cv), 2))
    for i, (x_batch, y_batch) in enumerate(valid_loader):
        y_pred = model(x_batch).detach()
        avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
        # keep/store predictions
        val_preds[i * batch_size:(i+1) * batch_size] =F.softmax(y_pred, dim=1).cpu().numpy()
    
    # Check Accuracy
    preds = val_preds.argmax(axis=1)
    val_accuracy = sum(preds==y_val)/len(y_val)
    f1 = f1_score(y_val, preds)
    train_loss.append(avg_loss)
    valid_loss.append(avg_val_loss)
    elapsed_time = time.time() - start_time
    print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f}  \t val_acc={:.4f} \t f1 = {:.4f}  \t time={:.2f}s'.format(
                epoch + 1, n_epochs, avg_loss, avg_val_loss, val_accuracy, f1, elapsed_time))

  y_train = torch.tensor(y_train, dtype=torch.long)
94it [00:22,  4.10it/s]
0it [00:00, ?it/s]

Epoch 1/6 	 loss=4.9593 	 val_loss=3.7576  	 val_acc=0.9856 	 f1 = 0.0000  	 time=28.40s


94it [00:22,  4.09it/s]
0it [00:00, ?it/s]

Epoch 2/6 	 loss=2.9254 	 val_loss=3.7078  	 val_acc=0.9856 	 f1 = 0.0000  	 time=28.39s


94it [00:22,  4.09it/s]
0it [00:00, ?it/s]

Epoch 3/6 	 loss=2.8459 	 val_loss=3.7578  	 val_acc=0.9856 	 f1 = 0.0000  	 time=28.39s


94it [00:23,  4.07it/s]
0it [00:00, ?it/s]

Epoch 4/6 	 loss=2.7472 	 val_loss=3.4324  	 val_acc=0.9856 	 f1 = 0.0000  	 time=28.48s


94it [00:23,  4.06it/s]
0it [00:00, ?it/s]

Epoch 5/6 	 loss=2.3743 	 val_loss=3.6464  	 val_acc=0.9856 	 f1 = 0.0000  	 time=28.56s


94it [00:23,  4.06it/s]


Epoch 6/6 	 loss=1.8547 	 val_loss=2.7985  	 val_acc=0.9856 	 f1 = 0.0000  	 time=28.57s
