In [2]:
import pandas as pd
import numpy as np
import re
import string
import gensim
import time
import json


import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns

import multiprocessing as mp
from spellchecker import SpellChecker

import torch
import torch.nn as nn
import torch.optim as optim
import math
from argparse import Namespace

from torch.utils.data import Dataset, DataLoader
from collections import Counter
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm

# Init

In [3]:
if not nltk.find('corpora/wordnet'):
     nltk.download('wordnet')
porter_stemmer  = PorterStemmer()
lemmatizer      = WordNetLemmatizer()
regex_tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
spell  = SpellChecker()
one_hot_vectorizer = CountVectorizer(binary=True)

args = Namespace(
    # Data and Path hyper parameters
    degree_injury_file="./data/degreeinjury.csv",
    injury_bodyparts_file="./data/injurybodyparts.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.h5",
    save_dir="model_storage/document_classification",
    # Model hyper parameters
    glove_filepath='./Glove/glove.6B.200d.txt', 
    use_glove=True,
    embedding_size=150, 
    # Training hyper parameter
    window_size=5,
    val_proportion=0.1,
    test_proportion=0.2,
    learning_rate = 0.001,
    seed=666,
    dropout_p=0.1, 
    batch_size=256, 
    num_epochs=100, 
    early_stopping_criteria=5, 
    # Runtime option
    cuda=True, 
    catch_keyboard_interrupt=True, 
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
) 

## Functions

In [4]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"): 
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):
    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        # If loss worsened
        if loss_t >= train_state['early_stopping_best_val']:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_target):
    y_target = y_target.cpu()
    y_pred_indices = (torch.sigmoid(y_pred)>0.5).cpu().long()#.max(dim=1)[1]
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

def make_embedding_matrix(glove_filepath, words):
    word_to_idx, glove_embeddings = load_glove_from_file(glove_filepath)
    embedding_size = glove_embeddings.shape[1]
    
    final_embeddings = np.zeros((len(words), embedding_size))

    for i, word in enumerate(words):
        if word in word_to_idx:
            final_embeddings[i, :] = glove_embeddings[word_to_idx[word]]
        else:
            embedding_i = torch.ones(1, embedding_size)
            torch.nn.init.xavier_uniform_(embedding_i)
            final_embeddings[i, :] = embedding_i

    return final_embeddings, embedding_size

def load_glove_from_file(glove_filepath):
    word_to_index = {}
    embeddings = []
    with open(glove_filepath, encoding="utf8") as fp:
        for index, line in enumerate(fp):
            line = line.split(" ") # each line: word num1 num2 ...
            word_to_index[line[0]] = index # word = line[0] 
            embedding_i = np.array([float(val) for val in line[1:]])
            embeddings.append(embedding_i)
    return word_to_index, np.stack(embeddings)

def clean_text(text):
    txt = ''
    txt = txt.strip()
    txt = re.sub(r"([.,!?])", r" \1 ", txt)
    for word in tokenize(text):
        # The EE is short hand on employee
        if word=='ee' or word == 'EE':
            txt += ' employee '
        elif word in string.punctuation:
            txt = txt + ' ' +word
        elif len(word)>2:
            word = spell.correction(word)
            txt = txt + ' ' + word.lower().strip()
    txt = re.sub(r"[^a-zA-Z.,!?]+", r" ", txt)
    return txt.strip()

def tokenize(text):
    return nltk.word_tokenize(text)

def stem(words, df=False):
    stemmed_words=[porter_stemmer.stem(word) for word in words]
    if df:
        return pd.DataFrame({'original': words,'stemmed': stemmed_words})
    return stemmed_words

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return nltk.corpus.wordnet.ADJ
    elif tag.startswith('V'):
        return nltk.corpus.wordnet.VERB
    elif tag.startswith('N'):
        return nltk.corpus.wordnet.NOUN
    elif tag.startswith('R'):
        return nltk.corpus.wordnet.ADV
    else:
        return None

def lemmatize(words, df=False):
    lemmatized_words=[]
    tagged_sent = nltk.pos_tag(words)
    
    for tag in tagged_sent:
        wordnet_pos = get_wordnet_pos(tag[1]) or nltk.corpus.wordnet.NOUN
        if wordnet_pos is None:
            lemmatized_words.append(lemmatizer.lemmatize(tag[0]))
        else:
            lemmatized_words.append(lemmatizer.lemmatize(tag[0], pos=wordnet_pos))
    
    if df:
        return pd.DataFrame({'original': words,'lemmatized': lemmatized_words})
    return lemmatized_words

def SK_TFIDF_stopwords(corpus, vectorizer):
    vectorizer.fit(corpus)
    X = vectorizer.transform(corpus)
    return vectorizer.stop_words_, X

def get_words(texts):
    words = []
    for text in texts:
        words+=regex_tokenizer.tokenize(text)
    return np.asarray(words).reshape(-1, 1)

def OHE_2D(sent, sparse=False):
    enc = OneHotEncoder(handle_unknown='ignore', sparse=sparse)
    words=[]
    for w in sent.values:
        words += w
    X = enc.fit(np.array(words).reshape(-1,1))
    
    # Round up to without unit digit
    ohe = sent.apply(lambda x: enc.transform(np.array(x).reshape(-1,1)))
    maxwords = max([len(x) for x in ohe])
    maxwords = math.ceil(maxwords/10)*10
    dim = len(ohe[0][0])
    print('Vector length:          ', dim, '\nMaximum number of words:', maxwords)

    return [np.concatenate((x, np.zeros((maxwords - len(x), dim))), axis=0) for x in ohe]

def OHE_1D(sent):
    corpus = [' '.join(x) for x in sent.values]
    print(corpus[0])
    ohe = CountVectorizer(binary=True)
    return pd.DataFrame(ohe.fit_transform(corpus).todense(), columns=ohe.get_feature_names())

def spell_check(words):
    return [spell.correction(word) for word in words]

def remove_stopwords(words, stopwords):
    return [w for w in words if w not in stopwords]

def split_save(data, path, test=0.2, valid= 0.1):
    train, test  = train_test_split(data,  test_size=test, random_state=666, shuffle=True, stratify=data['target'])
    train, valid = train_test_split(train, test_size=valid, random_state=666, shuffle=True, stratify=train['target'])
    train['split'] = 'train'
    test['split']  = 'test'
    valid['split'] = 'val'
    pd.concat([train, valid, test]).to_csv(path, index=False)

# Clean Narrative
#### Clean Narrative is computionally expensive run if only necessary

In [5]:
df = pd.read_csv('us_data_2000.csv')
df = df[['DEGREE_INJURY', 'DEGREE_INJURY_CD', 'INJ_BODY_PART_CD', 'INJ_BODY_PART', 'NARRATIVE']]
df.head(1)

Unnamed: 0,DEGREE_INJURY,DEGREE_INJURY_CD,INJ_BODY_PART_CD,INJ_BODY_PART,NARRATIVE
0,DAYS RESTRICTED ACTIVITY ONLY,5,700,MULTIPLE PARTS (MORE THAN ONE MAJOR),Employee was cleaning up at the Primary Crushe...


In [6]:
df.isnull().any()

DEGREE_INJURY       False
DEGREE_INJURY_CD    False
INJ_BODY_PART_CD    False
INJ_BODY_PART       False
NARRATIVE           False
dtype: bool

In [7]:
print('NARRATIVE NaN count:   ', df[df['NARRATIVE'].isna()].shape[0])
print('NARRATIVE len>10 count:', df[df['NARRATIVE'].str.len() < 10].shape[0])
print('\nDEGREE_INJURY_CD "?" count: ', df[df['DEGREE_INJURY_CD'] == '?'].shape[0])
print('\nINJ_BODY_PART_CD "?" count: ', df[df['INJ_BODY_PART_CD'] == '?'].shape[0])

NARRATIVE NaN count:    0
NARRATIVE len>10 count: 1

DEGREE_INJURY_CD "?" count:  11

INJ_BODY_PART_CD "?" count:  242


#### Remove Narrative are too short and NA

In [8]:
print("Origin data size:      ", df.shape[0])
df = df[df['NARRATIVE'].notna()]
df = df[df['NARRATIVE'].str.len() > 10]
print("Clean data size:", df.shape[0])

Origin data size:       2000
Clean data size: 1999


### To Lower case and correct words

In [9]:
with mp.Pool(mp.cpu_count()) as pool:
    df['NARRATIVE'] = pool.map(clean_text, df['NARRATIVE'])
df['NARRATIVE'].values[0]

'employee was cleaning the primary crusher with the dingo skid steer . the employee slipped and fell while operating the skid steer and the machine pinned him against the cement retaining wall .'

### Dataset for task 1

In [10]:
df_degree=df[['NARRATIVE', 'DEGREE_INJURY_CD', 'DEGREE_INJURY']].copy()
df_degree = df_degree[df_degree['DEGREE_INJURY_CD'] != '?']
df_degree['DEGREE_INJURY_CD'] = pd.to_numeric(df_degree['DEGREE_INJURY_CD'])

def get_degree_injury(dataframe):
    CD = []
    INJURY = []
    for i in range(0, len(dataframe.DEGREE_INJURY_CD.unique())):
        CD.append(i)
        INJURY.append(dataframe[dataframe['DEGREE_INJURY_CD'] == i]['DEGREE_INJURY'].values[0])
    return pd.DataFrame({'DEGREE_INJURY_CD': CD,'DEGREE_INJURY': INJURY})

get_degree_injury(df_degree)

Unnamed: 0,DEGREE_INJURY_CD,DEGREE_INJURY
0,0,ACCIDENT ONLY
1,1,FATALITY
2,2,PERM TOT OR PERM PRTL DISABLTY
3,3,DAYS AWAY FROM WORK ONLY
4,4,DYS AWY FRM WRK & RESTRCTD ACT
5,5,DAYS RESTRICTED ACTIVITY ONLY
6,6,"NO DYS AWY FRM WRK,NO RSTR ACT"
7,7,OCCUPATNAL ILLNESS NOT DEG 1-6
8,8,INJURIES DUE TO NATURAL CAUSES
9,9,INJURIES INVOLVNG NONEMPLOYEES


In [11]:
100*df_degree['DEGREE_INJURY'].value_counts(normalize=True)

DAYS AWAY FROM WORK ONLY          29.879276
NO DYS AWY FRM WRK,NO RSTR ACT    27.766600
DAYS RESTRICTED ACTIVITY ONLY     18.058350
ACCIDENT ONLY                     11.016097
DYS AWY FRM WRK & RESTRCTD ACT     7.293763
OCCUPATNAL ILLNESS NOT DEG 1-6     2.867203
ALL OTHER CASES (INCL 1ST AID)     1.006036
PERM TOT OR PERM PRTL DISABLTY     0.905433
FATALITY                           0.553320
INJURIES DUE TO NATURAL CAUSES     0.503018
INJURIES INVOLVNG NONEMPLOYEES     0.150905
Name: DEGREE_INJURY, dtype: float64

In [12]:
group1 = [
    'DAYS AWAY FROM WORK ONLY',
    'DAYS RESTRICTED ACTIVITY ONLY',
    'DYS AWY FRM WRK & RESTRCTD ACT'
]
df_degree['target'] = df_degree['DEGREE_INJURY'].isin(group1)
df_degree["target"] = df_degree["target"].astype(str)
del df_degree['DEGREE_INJURY']
del df_degree['DEGREE_INJURY_CD']
split_save(df_degree, path=args.degree_injury_file)

### Dataset for task 2

In [18]:
HEAD     = ['SKULL', 'EAR(S) INTERNAL & HEARING', 'EAR(S) EXTERNAL', 'EAR(S) INTERNAL & EXTERNAL' ,'EYE(S) OPTIC NERVE/VISON', 'NOSE/NASAL PASSAGES/SINUS/SMELL', 'EAR(S) INTERNAL & HEARING', 'BRAIN', 'FACE,NEC', 'FACE, MULTIPLE PARTS', 'HEAD, MULTIPLE PARTS', 'HEAD,NEC', 'MULTIPLE PARTS', 'MOUTH/LIP/TEETH/TONGUE/THROAT/TASTE', 'JAW INCLUDE CHIN', 'SCALP', 'NECK']
LEG      = ['LEG, MULTIPLE PARTS', 'LOWER LEG/TIBIA/FIBULA', 'FOOT(NOT ANKLE/TOE)/TARSUS/METATARSUS', 'LEG, NEC', 'KNEE/PATELLA', 'TOE(S)/PHALANGES', 'ANKLE', 'THIGH/FEMUR', 'LOWER EXTREMITIES,NEC', 'LOWER EXTREMITIES, MULTIPLE PARTS']
ARM      = ['ARM, MULTIPLE PARTS', 'ARM,NEC', 'HAND (NOT WRIST OR FINGERS)', 'WRIST', 'FINGER(S)/THUMB', 'UPPER ARM/HUMERUS', 'SHOULDERS (COLLARBONE/CLAVICLE/SCAPULA)', 'ELBOW', 'FOREARM/ULNAR/RADIUS', 'UPPER EXTREMITIES, MULTIPLE', 'UPPER EXTREMITIES, NEC']
BODY     = ['CHEST (RIBS/BREAST BONE/CHEST ORGNS)', 'BODY SYSTEMS', 'ABDOMEN/INTERNAL ORGANS', 'BODY PARTS, NEC', 'TRUNK, MULTIPLE PARTS', 'TRUNK,NEC', 'HIPS (PELVIS/ORGANS/KIDNEYS/BUTTOCKS)', 'BACK (MUSCLES/SPINE/S-CORD/TAILBONE)']
MULTIPLE = ['MULTIPLE PARTS (MORE THAN ONE MAJOR)']

df_parts=df[['NARRATIVE', 'INJ_BODY_PART_CD', 'INJ_BODY_PART']].copy()
df_parts = df_parts[df_parts['INJ_BODY_PART_CD'] != '?']
del df_parts['INJ_BODY_PART_CD']

df_parts['INJ_BODY_PART'][df_parts['INJ_BODY_PART'].isin(HEAD)]     = 'Head'
df_parts['INJ_BODY_PART'][df_parts['INJ_BODY_PART'].isin(LEG)]      = 'Leg'
df_parts['INJ_BODY_PART'][df_parts['INJ_BODY_PART'].isin(ARM)]      = 'Arm'
df_parts['INJ_BODY_PART'][df_parts['INJ_BODY_PART'].isin(BODY)]     = 'Body'
df_parts['INJ_BODY_PART'][df_parts['INJ_BODY_PART'].isin(MULTIPLE)] = 'Multiple Parts'
df_parts['target'] = df_parts['INJ_BODY_PART']
df_parts = df_parts[df_parts['target']!='UNCLASSIFIED']
del df_parts['INJ_BODY_PART']
split_save(df_parts, path=args.injury_bodyparts_file)
print("classes", set(df_parts.target))
print("Instances", df_parts.shape[0])

classes {'Multiple Parts', 'Leg', 'Head', 'Arm', 'Body'}
Instances 1755


### Dataset for task 3

In [19]:
# TODO

## Data Prepreocessing and Preparation 

#### Test lem and stem

In [20]:
df_stem      = stem(nltk.tokenize.word_tokenize((df['NARRATIVE'].values[0])), df=True)
df_lemmatize = lemmatize(nltk.tokenize.word_tokenize((df['NARRATIVE'].values[0])), df=True)
df_stem['lemmatized'] = df_lemmatize['lemmatized']

In [21]:
df_stem[df_stem['stemmed'] != df_stem['lemmatized']]

Unnamed: 0,original,stemmed,lemmatized
0,employee,employe,employee
1,was,wa,be
4,primary,primari,primary
13,employee,employe,employee
18,operating,oper,operate
24,machine,machin,machine


### Test Stopwords

In [22]:
vectorizer = TfidfVectorizer(max_df=0.1, min_df=1)
_df = df[["NARRATIVE"]].copy()
_df['NARRATIVE'] = _df['NARRATIVE'].apply(lambda x: tokenize(x))
_df['NARRATIVE'] = _df['NARRATIVE'].apply(lambda x: lemmatize(x))
# _df['NARRATIVE'] = _df['NARRATIVE'].apply(lambda x: stem(x))

stopwrods, X = SK_TFIDF_stopwords(_df['NARRATIVE'].apply(lambda x: ' '.join(x)), vectorizer)

terms = vectorizer.get_feature_names()

# sum tfidf frequency of each term through documents
sums = X.sum(axis=0)

# connecting term to its sums frequency
data = []
for col, term in enumerate(terms):
    data.append( (term, sums[0,col] ))
print('Number of stopwords:', len(stopwrods))
print('Number of terms    :', len(terms))

ranking = pd.DataFrame(data, columns=['term','rank'])
print('Ascending')
print(ranking.sort_values('rank', ascending=True)[0:15])

print('\nDescending')
print(ranking.sort_values('rank', ascending=False)[0:15])

Number of stopwords: 27
Number of terms    : 3511
Ascending
               term      rank
2150  perpendicular  0.142870
943       elevation  0.142870
225            baby  0.148709
3035       swelling  0.148709
39      acupuncture  0.148709
3403             wd  0.152215
3295    unknowingly  0.152215
3183            tow  0.158576
144      applicator  0.158644
2737        silicon  0.158644
2115    participate  0.158733
1017       exercise  0.158733
3157             to  0.162485
1469            ind  0.162485
2453     relocating  0.167219

Descending
       term       rank
2550   rock  43.301234
343    bolt  42.370124
2917   step  36.435148
2173  piece  34.772444
1152   foot  34.292901
2096   pain  33.907957
3231  truck  32.252625
1080   felt  31.555692
1376    hit  30.986592
289    belt  30.728900
1614   knee  29.765978
859    down  29.416297
1235    get  29.267398
1369    him  29.053929
1535   into  28.936520




In [23]:
# Add words that could be meaningless
for w in ['abb','abc','dia', 'yes','yee', 'that', 'ind', 'wd', 'to', 'him']:
        stopwrods.add(w)

In [24]:
print(stopwrods)

{'back', 'out', 'when', 'left', 'abb', 'that', 'roof', 'fell', 'his', 'off', 'ind', 'cause', 'with', 'work', 'not', 'right', 'have', 'yee', 'be', 'finger', 'him', 'yes', 'from', 'cut', 'hand', 'abc', 'the', 'slip', 'fall', 'strike', 'dia', 'wd', 'to', 'while', 'and', 'employee'}


# Function repeative Prepreocessing and Preparation

In [25]:
def tok_lem_sem(text):
    words = tokenize(text)
    words = lemmatize(words)
#     words = stem(words)
    return words

def removestop_join(words, stopwords):
    words = remove_stopwords(words, stopwords)
    return ' '.join(words)

def prepreocessing(data, stop, target="NARRATIVE"):
    _df=data.copy()
    with mp.Pool(mp.cpu_count()) as pool:
        _df[target] = pool.map(tok_lem_sem, _df[target])
    _df[target] = _df.apply(lambda row: removestop_join(row[target], stop), axis=1)
    return _df

In [26]:
prepreocessing(pd.read_csv(args.degree_injury_file), stopwrods).to_csv(args.degree_injury_file, index=False)

In [27]:
prepreocessing(pd.read_csv(args.injury_bodyparts_file), stopwrods).to_csv(args.injury_bodyparts_file, index=False)

# Binary Document Classification

## One Hoe Encoding

In [5]:
class OHEVocabulary(object):
    """Class to process text and extract Vocabulary for mapping"""
    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token 
                                for token, idx in self._token_to_idx.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token
        self.unk_index = 1
        if add_unk:
            self.unk_index = self.add_token(unk_token)
    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx, 'add_unk': self._add_unk, 'unk_token': self._unk_token}
    @classmethod
    
    def from_serializable(cls, contents):
        return cls(**contents)
    
    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
    
    def lookup_token(self, token):
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]
        
    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]
    
    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)
    
    def __len__(self):
        return len(self._token_to_idx)

class OHESequenceVocabulary(OHEVocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):

        super(OHESequenceVocabulary, self).__init__(token_to_idx)

        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    def to_serializable(self):
        contents = super(OHESequenceVocabulary, self).to_serializable()
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         'begin_seq_token': self._begin_seq_token,
                         'end_seq_token': self._end_seq_token})
        return contents

    def lookup_token(self, token):
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

class OHEVectorizer(object):
    def __init__(self, NARRATIVE_vocab, target_vocab):
        self.NARRATIVE_vocab = NARRATIVE_vocab
        self.target_vocab = target_vocab

    def vectorize(self, NARRATIVE):
        one_hot = np.zeros(len(self.NARRATIVE_vocab), dtype=np.float32)
        for token in NARRATIVE.split(" "):
            if token not in string.punctuation:
                one_hot[self.NARRATIVE_vocab.lookup_token(token)] = 1
        return one_hot

    @classmethod
    def from_dataframe(cls, df, cutoff=15):
        target_vocab = OHEVocabulary()        
        for target in sorted(set(df.target)):
            target_vocab.add_token(target)

        word_counts = Counter()
        for NARRATIVE in df.NARRATIVE:
            for token in NARRATIVE.split(" "):
                if token not in string.punctuation:
                    word_counts[token] += 1
        
        NARRATIVE_vocab = OHESequenceVocabulary()
        for word, word_count in word_counts.items():
            if word_count >= cutoff:
                NARRATIVE_vocab.add_token(word)
        
        return cls(NARRATIVE_vocab, target_vocab)

    @classmethod
    def from_serializable(cls, contents):
        NARRATIVE_vocab =   OHESequenceVocabulary.from_serializable(contents['NARRATIVE_vocab'])
        target_vocab =  OHEVocabulary.from_serializable(contents['target_vocab'])

        return cls(NARRATIVE_vocab=NARRATIVE_vocab, target_vocab=target_vocab)

    def to_serializable(self):
        return {'NARRATIVE_vocab': self.NARRATIVE_vocab.to_serializable(),
                'target_vocab': self.target_vocab.to_serializable()}

class OHEDataset(Dataset):
    def __init__(self, df, vectorizer):
        self.df = df
        self._vectorizer = vectorizer
        
        # +1 if only using begin_seq, +2 if using both begin and end seq tokens
        measure_len = lambda context: len(context.split(" "))
        self._max_seq_length = max(map(measure_len, df.NARRATIVE)) + 2
        
        self.train_df = self.df[self.df.split=='train']
        self.train_size = len(self.train_df)
        self.val_df = self.df[self.df.split=='val']
        self.validation_size = len(self.val_df)
        self.test_df = self.df[self.df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),'val': (self.val_df, self.validation_size),'test': (self.test_df, self.test_size)}
        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, csv):
        df = pd.read_csv(csv)
        return cls(df, OHEVectorizer.from_dataframe(df))
    
    @classmethod
    def load_dataset_and_load_vectorizer(cls, news_csv, vectorizer_filepath):
        df = pd.read_csv(news_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(news_csv, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return NameVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        NARRATIVE_vector = self._vectorizer.vectorize(row.NARRATIVE)
        target_index = self._vectorizer.target_vocab.lookup_token(row.target)-1
        return {'x_data': NARRATIVE_vector,'y_target': target_index}

    def get_num_batches(self, batch_size):
        return len(self) // batch_size

class OHEClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dims, output_dim=1):
        super(OHEClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dims[0])
        self.relu1 = nn.ReLU()
        
        self.fc2 = nn.Linear(hidden_dims[0], hidden_dims[1])
        self.relu2 = nn.ReLU()
        
        self.fc3 = nn.Linear(hidden_dims[1], hidden_dims[2])
        self.relu3 = nn.ReLU()
        
        self.fc4 = nn.Linear(hidden_dims[2], output_dim) 
    
    def forward(self, x, apply_sigmoid=False):
        out = self.fc1(x)
        out = self.relu1(out)
        
        out = self.fc2(out)
        out = self.relu2(out)
        
        out = self.fc3(out)
        out = self.relu3(out)
        
        out = self.fc4(out)
        
        if apply_sigmoid:
            out = torch.sigmoid(out)
        return out.squeeze()

In [6]:
train_state = make_train_state(args)
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")

# dataset and vectorizer
dataset = OHEDataset.load_dataset_and_make_vectorizer(args.degree_injury_file)
vectorizer = dataset.get_vectorizer()
# model
classifier = OHEClassifier(input_dim=len(vectorizer.NARRATIVE_vocab), hidden_dims=[256, 256, 128])
classifier = classifier.to(args.device)
# loss and optimizer
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=0.0001)
train_state = make_train_state(args)
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")

In [7]:
len(vectorizer.NARRATIVE_vocab)

500

In [8]:
best_val  = {
    'epoch': 0,
    'model_state_dict': 0,
    'optimizer_state_dict': 0,
    'loss': 0,
    'acc':0
}
for epoch_index in range(150):
    train_state['epoch_index'] = epoch_index
    # Iterate over training dataset
    # setup: batch generator, set loss and acc to 0, set train mode on
    dataset.set_split('train')
    batch_generator = generate_batches(dataset, batch_size=len(dataset), device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()
    for batch_index, batch_dict in enumerate(batch_generator):
        # the training routine is 5 steps:
        # step 1. zero the gradients
        optimizer.zero_grad()
        # step 2. compute the output
        y_pred = classifier(batch_dict['x_data'].float())
        # step 3. compute the loss
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch-running_loss) / (batch_index + 1)
        # step 4. use loss to produce gradients
        loss.backward()
        # step 5. use optimizer to take gradient step
        optimizer.step()
        # compute the accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)

    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)

    dataset.set_split('val')
    batch_generator = generate_batches(dataset, batch_size=len(dataset), device=args.device)
    running_loss = 0.
    running_acc = 0.
    classifier.eval()

    for batch_index, batch_dict in enumerate(batch_generator):
        # step 1. compute the output
        y_pred = classifier(batch_dict['x_data'].float())
        # step 2. compute the loss
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)
        # step 3. compute the accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)
    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)
    if(best_val['acc'] < running_acc):
        best_val  = {
            'epoch': epoch_index,
            'model_state_dict': classifier.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': running_loss,
            'acc' : running_acc
        }
    if epoch_index%5 == 0:
        print('Train loss', round(train_state['train_loss'][-1], 4), 'Train acc', round(train_state['train_acc'][-1], 4), 'Val loss', round(train_state['val_loss'][-1], 4), 'Val acc', round(train_state['val_acc'][-1], 4))
torch.save(best_val, "OHE.h5")

Train loss 0.6902 Train acc 55.2061 Val loss 0.6897 Val acc 55.3459
Train loss 0.6888 Train acc 55.2061 Val loss 0.6885 Val acc 55.3459
Train loss 0.6874 Train acc 55.2061 Val loss 0.6873 Val acc 55.3459
Train loss 0.6858 Train acc 55.2061 Val loss 0.6861 Val acc 55.3459
Train loss 0.6837 Train acc 55.2061 Val loss 0.6845 Val acc 55.3459
Train loss 0.6811 Train acc 55.276 Val loss 0.6825 Val acc 55.3459
Train loss 0.6775 Train acc 55.7652 Val loss 0.68 Val acc 55.3459
Train loss 0.6727 Train acc 57.2327 Val loss 0.6767 Val acc 57.8616
Train loss 0.6663 Train acc 59.399 Val loss 0.6723 Val acc 57.8616
Train loss 0.6579 Train acc 60.9364 Val loss 0.6667 Val acc 59.1195
Train loss 0.6471 Train acc 63.8714 Val loss 0.6595 Val acc 62.2642
Train loss 0.6334 Train acc 68.4137 Val loss 0.6507 Val acc 61.0063
Train loss 0.6164 Train acc 71.6981 Val loss 0.64 Val acc 61.0063
Train loss 0.596 Train acc 76.1705 Val loss 0.6274 Val acc 62.8931
Train loss 0.572 Train acc 78.5465 Val loss 0.6129 Val 

In [9]:
dataset.set_split('test')
batch_generator = generate_batches(dataset, batch_size=len(dataset),device=args.device)
running_loss = 0.
running_acc = 0.
checkpoint = torch.load("OHE.h5")
classifier.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
classifier.eval()
for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred = classifier(x=batch_dict['x_data'].float())
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'].float())
    loss_batch = loss.item()
    running_loss += (loss_batch - running_loss) / (batch_index + 1)
    # compute the accuracy
    acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_batch - running_acc) / (batch_index + 1)
train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [10]:
print("Test loss: {:.3f}".format(train_state['test_loss']))
print("Test Accuracy: {:.2f}".format(train_state['test_acc']))

Test loss: 0.662
Test Accuracy: 73.12


## Word embedding 

In [11]:
class Vocabulary(object):
    def __init__(self, token_to_idx=None):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token for token, idx in self._token_to_idx.items()}
        
    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx}

    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)

    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
            
    def add_many(self, tokens):
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        return self._token_to_idx[token]

    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)
    
class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):

        super(SequenceVocabulary, self).__init__(token_to_idx)
        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    def to_serializable(self):
        contents = super(SequenceVocabulary, self).to_serializable()
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         'begin_seq_token': self._begin_seq_token,
                         'end_seq_token': self._end_seq_token})
        return contents

    def lookup_token(self, token):
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

class GloveVectorizer(object):  
    def __init__(self, NARRATIVE_vocab, target_vocab):
        self.NARRATIVE_vocab = NARRATIVE_vocab
        self.target_vocab = target_vocab

    def vectorize(self, NARRATIVE, vector_length=-1):
        indices = [self.NARRATIVE_vocab.begin_seq_index]
        indices.extend(self.NARRATIVE_vocab.lookup_token(token) for token in NARRATIVE.split(" ") if token not in string.punctuation)
        indices.append(self.NARRATIVE_vocab.end_seq_index)

        if vector_length < 0:
            vector_length = len(indices)

        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[:len(indices)] = indices
        out_vector[len(indices):] = self.NARRATIVE_vocab.mask_index

        return out_vector

    @classmethod
    def from_dataframe(cls, df, cutoff=5):
        target_vocab = Vocabulary()        
        for target in sorted(set(df.target)):
            target_vocab.add_token(target)
        
        word_counts = Counter()
        for NARRATIVE in df.NARRATIVE:
            for token in NARRATIVE.split(" "):
                if token not in string.punctuation:
                    word_counts[token] += 1
        
        NARRATIVE_vocab = SequenceVocabulary()
        for word, word_count in word_counts.items():
            if word_count >= cutoff:
                NARRATIVE_vocab.add_token(word)
#             else: 
#                 print(word)
        
        return cls(NARRATIVE_vocab, target_vocab)

    @classmethod
    def from_serializable(cls, contents):
        title_vocab = SequenceVocabulary.from_serializable(contents['NARRATIVE_vocab'])
        category_vocab = Vocabulary.from_serializable(contents['target_vocab'])
        return cls(NARRATIVE_vocab=NARRATIVE_vocab, target_vocab=target_vocab)

    def to_serializable(self):
        return {'NARRATIVE_vocab': self.NARRATIVE_vocab.to_serializable(), 'target_vocab': self.target_vocab.to_serializable()}

class GloveDataset(Dataset):
    def __init__(self, df, vectorizer):
        self.df = df
        self._vectorizer = vectorizer

        # +1 if only using begin_seq, +2 if using both begin and end seq tokens
        measure_len = lambda context: len(context.split(" "))
        self._max_seq_length = max(map(measure_len, df.NARRATIVE)) + 2
        
        self.train_df = self.df[self.df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.df[self.df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.df[self.df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size), 'val': (self.val_df, self.validation_size), 'test': (self.test_df, self.test_size)}
        self.set_split('train')

        # Class weights
        class_counts = df.target.value_counts().to_dict()
        def sort_key(item):
            return self._vectorizer.target_vocab.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)
        
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls, news_csv):
        df = pd.read_csv(news_csv)
        train_df = df[df.split=='train']
        return cls(df, GloveVectorizer.from_dataframe(train_df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, news_csv, vectorizer_filepath):
        df = pd.read_csv(news_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(news_csv, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return NameVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        NARRATIVE_vector = self._vectorizer.vectorize(row.NARRATIVE, self._max_seq_length)
        target_index = self._vectorizer.target_vocab.lookup_token(row.target) #-1
        return {'x_data': NARRATIVE_vector,'y_target': target_index}

    def get_num_batches(self, batch_size):
        return len(self) // batch_size

class GloveClassifier(nn.Module):
    def __init__(self, embedding_size, num_embeddings, num_channels, hidden_dim, num_classes, dropout_p, pretrained_embeddings=None, padding_idx=0):
        super(GloveClassifier, self).__init__()

        if pretrained_embeddings is None:
            self.emb = nn.Embedding(embedding_dim=embedding_size, num_embeddings=num_embeddings, padding_idx=padding_idx)        
        else:
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
            self.emb = nn.Embedding(embedding_dim=embedding_size, num_embeddings=num_embeddings, padding_idx=padding_idx,      _weight=pretrained_embeddings)
            
        self.convnet = nn.Sequential(
            nn.Conv1d(in_channels=embedding_size, out_channels=num_channels[0], kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=num_channels[0], out_channels=num_channels[1], kernel_size=2, stride=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=num_channels[1], out_channels=num_channels[2], kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=num_channels[2], out_channels=num_channels[3], kernel_size=2),
            nn.ReLU()
        )
        #MLP         
        self._dropout_p = dropout_p
        self.fc1 = nn.Linear(num_channels[3], hidden_dim[0])
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.1)
        
        self.fc2 = nn.Linear(hidden_dim[0], hidden_dim[1])
        self.relu2 = nn.ReLU()
        
        self.fc3 = nn.Linear(hidden_dim[1], num_classes)

    def forward(self, x_in, apply_softmax=False):
        # embed and permute so features are channels
        x_embedded = self.emb(x_in).permute(0, 2, 1)
        features = self.convnet(x_embedded)
        # average and remove the extra dimension
        remaining_size = features.size(dim=2)
        features = nn.functional.max_pool1d(features, remaining_size).squeeze(dim=2)
        features = nn.functional.dropout(features, p=self._dropout_p)
        
        out = self.fc1(features)
#         out = nn.functional.dropout(out, p=self._dropout_p)
        out = self.relu1(out)
        out = self.dropout1(out)
        
        out = self.fc2(out)
        out = self.relu2(out)
        
        out = self.fc3(out)

        if apply_softmax:
            out = torch.nn.functional.softmax(out, dim=1)
        return out

def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [12]:
if args.reload_from_files:
    # training from a checkpoint
    dataset = GloveDataset.load_dataset_and_load_vectorizer(args.degree_injury_file, args.vectorizer_file)
else:
    # create dataset and vectorizer
    dataset = GloveDataset.load_dataset_and_make_vectorizer(args.degree_injury_file)
    dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

# Use GloVe or randomly initialized embeddings
if args.use_glove:
    words = vectorizer.NARRATIVE_vocab._token_to_idx.keys()
    embeddings, embedding_size = make_embedding_matrix(glove_filepath=args.glove_filepath, words=words)
    print("Using pre-trained embeddings")
else:
    print("Not using pre-trained embeddings")
    embeddings = None

Using pre-trained embeddings


In [13]:
classifier = GloveClassifier(
    embedding_size=embedding_size, 
    num_embeddings=len(vectorizer.NARRATIVE_vocab),
    num_channels=[128, 64, 32, 32],
    hidden_dim=[32, 16], 
    num_classes=len(vectorizer.target_vocab), 
    dropout_p=0.3,
    pretrained_embeddings=embeddings,
    padding_idx=0
)
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")

In [14]:
best_val  = {
    'epoch': 0,
    'model_state_dict': 0,
    'optimizer_state_dict': 0,
    'loss': 0,
    'acc':0
}
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
    
loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=0.0001)
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.5, patience=1)

train_state = make_train_state(args)

for epoch_index in range(200):
    train_state['epoch_index'] = epoch_index
    # Iterate over training dataset
    # setup: batch generator, set loss and acc to 0, set train mode on
    
    dataset.set_split('train')
    batch_generator = generate_batches(dataset, batch_size=len(dataset), device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()

    for batch_index, batch_dict in enumerate(batch_generator):
        # the training routine is these 5 steps:

        # --------------------------------------
        # step 1. zero the gradients
        optimizer.zero_grad()

        # step 2. compute the output
        y_pred = classifier(batch_dict['x_data'])

        # step 3. compute the loss
        loss = loss_func(y_pred, batch_dict['y_target'])
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # step 4. use loss to produce gradients
        loss.backward()

        # step 5. use optimizer to take gradient step
        optimizer.step()
        # -----------------------------------------
        # compute the accuracy
        acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_t - running_acc) / (batch_index + 1)

    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)

    # Iterate over val dataset
    # setup: batch generator, set loss and acc to 0; set eval mode on
    dataset.set_split('val')
    batch_generator = generate_batches(dataset, batch_size=len(dataset), device=args.device)
    running_loss = 0.
    running_acc = 0.
    
    classifier.eval()
    for batch_index, batch_dict in enumerate(batch_generator):
        # compute the output
        y_pred =  classifier(batch_dict['x_data'])

        # step 3. compute the loss
        loss = loss_func(y_pred, batch_dict['y_target'])
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # compute the accuracy
        acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_t - running_acc) / (batch_index + 1)
    
    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)
    if epoch_index%10 == 0:
        print('Train loss', round(train_state['train_loss'][-1], 4), 'Train acc', round(train_state['train_acc'][-1], 4), 'Val loss', round(train_state['val_loss'][-1], 4), 'Val acc', round(train_state['val_acc'][-1], 4))
    if(best_val['acc']<train_state['val_acc'][-1]):
        best_val  = {
            'epoch': epoch_index,
            'model_state_dict': classifier.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': running_loss,
            'acc': running_acc,
        }
    train_state = update_train_state(args=args, model=classifier, train_state=train_state)
torch.save(best_val, "Glove.h5")

  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


Train loss 0.6942 Train acc 44.7939 Val loss 0.6945 Val acc 44.6541
Train loss 0.6939 Train acc 44.7939 Val loss 0.6943 Val acc 44.6541
Train loss 0.6938 Train acc 44.7939 Val loss 0.694 Val acc 44.6541
Train loss 0.6937 Train acc 44.7939 Val loss 0.6939 Val acc 44.6541
Train loss 0.6934 Train acc 44.7939 Val loss 0.6937 Val acc 44.6541
Train loss 0.6928 Train acc 44.7939 Val loss 0.6931 Val acc 44.6541
Train loss 0.6917 Train acc 44.7939 Val loss 0.6927 Val acc 44.6541
Train loss 0.6894 Train acc 44.7939 Val loss 0.6906 Val acc 44.6541
Train loss 0.6857 Train acc 44.7939 Val loss 0.6888 Val acc 44.6541
Train loss 0.6791 Train acc 47.0999 Val loss 0.6836 Val acc 45.283
Train loss 0.6658 Train acc 58.84 Val loss 0.6694 Val acc 60.3774
Train loss 0.6391 Train acc 68.0643 Val loss 0.6569 Val acc 62.2642
Train loss 0.5962 Train acc 72.5367 Val loss 0.6291 Val acc 62.8931
Train loss 0.5537 Train acc 74.0042 Val loss 0.6077 Val acc 66.6667
Train loss 0.5014 Train acc 77.2886 Val loss 0.6091 

In [15]:
# compute the loss & accuracy on the test set using the best available model
checkpoint = torch.load("Glove.h5")
classifier.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
classifier.eval()

# classifier.load_state_dict(torch.load(train_state['model_filename']))
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)

dataset.set_split('val')

batch_generator = generate_batches(dataset, batch_size=len(dataset), device=args.device)
running_loss = 0.
running_acc = 0.

classifier.eval()
for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred =  classifier(batch_dict['x_data'])
    
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [16]:
print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

Test loss: 0.6257244944572449;
Test Accuracy: 71.69811320754717


# Multi-class Document Classification

In [124]:
args = Namespace(
    injury_bodyparts_file="./data/injurybodyparts.csv",
    vectorizer_file="MCWBvectorizer.json",
    glove_filepath = "./Glove/glove.6B.200d.txt",
    model_state_file = 'RNN.h5',
    # Training hyperparameter
    val_proportion=0.1,
    test_proportion=0.2,
    learning_rate = 0.001,
    seed=666,
    dropout_p=0.1,
    early_stopping_criteria=5, 
    # Runtime option
    cuda=True, 
    use_glove=True,
    catch_keyboard_interrupt=True, 
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
) 

### Dataset classes

After few Experements we found out padding at the font is much better

In [176]:
class Vocabulary(object):
    def __init__(self, token_to_idx=None):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token for token, idx in self._token_to_idx.items()}
        
    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx}

    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)

    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
            
    def add_many(self, tokens):
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        return self._token_to_idx[token]

    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)
    
class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):

        super(SequenceVocabulary, self).__init__(token_to_idx)
        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    def to_serializable(self):
        contents = super(SequenceVocabulary, self).to_serializable()
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         'begin_seq_token': self._begin_seq_token,
                         'end_seq_token': self._end_seq_token})
        return contents

    def lookup_token(self, token):
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

class IBPVectorizer(object):
    def __init__(self, NARRATIVE_vocab, target_vocab):
        self.NARRATIVE_vocab = NARRATIVE_vocab
        self.target_vocab = target_vocab

    def vectorize(self, NARRATIVE, vector_length=-1):
        indices = [self.NARRATIVE_vocab.begin_seq_index]
        indices.extend(self.NARRATIVE_vocab.lookup_token(token) for token in NARRATIVE.split(" "))
        indices.append(self.NARRATIVE_vocab.end_seq_index)

        if vector_length < 0:
            vector_length = len(indices)
        
        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[vector_length - len(indices):] = indices
        out_vector[:vector_length - len(indices)] = self.NARRATIVE_vocab.mask_index
        return out_vector, vector_length
    
    @classmethod
    def from_dataframe(cls, df, cutoff=5):
        target_vocab = Vocabulary()        
        for target in sorted(set(df.target)):
            target_vocab.add_token(target)
        
        word_counts = Counter()
        for NARRATIVE in df.NARRATIVE:
            for token in NARRATIVE.split(" "):
                if token not in string.punctuation:
                    word_counts[token] += 1
        
        NARRATIVE_vocab = SequenceVocabulary()
        for word, word_count in word_counts.items():
            if word_count >= cutoff:
                NARRATIVE_vocab.add_token(word)
        
        return cls(NARRATIVE_vocab, target_vocab)
    
    @classmethod
    def from_serializable(cls, contents):
        title_vocab = SequenceVocabulary.from_serializable(contents['NARRATIVE_vocab'])
        category_vocab = Vocabulary.from_serializable(contents['target_vocab'])
        return cls(NARRATIVE_vocab=NARRATIVE_vocab, target_vocab=target_vocab)

    def to_serializable(self):
        return {'NARRATIVE_vocab': self.NARRATIVE_vocab.to_serializable(), 'target_vocab': self.target_vocab.to_serializable()}

class IBPDataset(Dataset):
    def __init__(self, df, vectorizer):
        self.df = df
        self._vectorizer = vectorizer

        # +1 if only using begin_seq, +2 if using both begin and end seq tokens
        measure_len = lambda context: len(context.split(" "))
        self._max_seq_length = max(map(measure_len, df.NARRATIVE)) + 2
        
        self.train_df = self.df[self.df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.df[self.df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.df[self.df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size), 'val': (self.val_df, self.validation_size), 'test': (self.test_df, self.test_size)}
        self.set_split('train')

        # Class weights
        class_counts = df.target.value_counts().to_dict()
        def sort_key(item):
            return self._vectorizer.target_vocab.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls, news_csv):
        df = pd.read_csv(news_csv)
        train_df = df[df.split=='train']
        return cls(df, IBPVectorizer.from_dataframe(train_df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, news_csv, vectorizer_filepath):
        df = pd.read_csv(news_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(news_csv, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return NameVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        NARRATIVE_vector, vector_length = self._vectorizer.vectorize(row.NARRATIVE, self._max_seq_length)
        target_index = self._vectorizer.target_vocab.lookup_token(row.target) #-1
        return {'x_data': NARRATIVE_vector,'y_target': target_index, 'x_length': vector_length}

    def get_num_batches(self, batch_size):
        return len(self) // batch_size

### Classfiers

In [177]:
def column_gather(y_out, x_lengths):
    x_lengths = x_lengths.long().detach().cpu().numpy() - 1

    out = []
    for batch_index, column_index in enumerate(x_lengths):
        out.append(y_out[batch_index, column_index])

    return torch.stack(out)

class IBPClassifier(nn.Module):
    def __init__(self, embedding_size, num_embeddings, num_classes, rnn_hidden_size, hidden_dim, RNN=True, pretrained_embeddings=None, padding_idx=0, batch_first=True):
        super(IBPClassifier, self).__init__()

        if pretrained_embeddings is None:
            self.emb = nn.Embedding(embedding_dim=embedding_size, num_embeddings=num_embeddings, padding_idx=padding_idx)        
        else:
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
            self.emb = nn.Embedding(embedding_dim=embedding_size, num_embeddings=num_embeddings, padding_idx=padding_idx,  _weight=pretrained_embeddings)

        if RNN:
            self.rnn = VanillaRNN(input_size=embedding_size, hidden_size=rnn_hidden_size, batch_first=batch_first)
        else:
            self.rnn = LstmRNN(input_size=embedding_size, hidden_size=rnn_hidden_size, batch_first=batch_first)
#             self.rnn = nn.LSTM(embedding_size, rnn_hidden_size, batch_first=batch_first) 
        
        self.RNN = RNN
        self.num_classes = num_classes
        self.rnn_hidden_size = rnn_hidden_size
        #MLP         
        self.fc1 = nn.Linear(rnn_hidden_size, hidden_dim[0])
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.1)

        self.fc2 = nn.Linear(hidden_dim[0], hidden_dim[1])
        self.relu2 = nn.ReLU()

        self.fc3 = nn.Linear(hidden_dim[1], num_classes)

    def forward(self, x, x_lengths=None, apply_softmax=False):
        # embed and permute so features are channels
        out = self.emb(x)
        out = self.rnn(out)

        
        if self.RNN:
            if x_lengths is not None:
                out = column_gather(out, x_lengths)
            else:
                out = out[:, -1, :]

        out = self.fc1(out)
        out = self.relu1(out)
        out = self.dropout1(out)
        
        out = self.fc2(out)
        out = self.relu2(out)
        
        out = self.fc3(out)

        if apply_softmax:
            out = torch.nn.functional.softmax(out, dim=1)

        return out

## Vanilla RNN

### Model

In [178]:
class VanillaRNN(nn.Module):
    def __init__(self, input_size, hidden_size, batch_first=False):
        super(VanillaRNN, self).__init__()
        self.rnn_cell = nn.RNNCell(input_size, hidden_size)
        self.batch_first = batch_first
        self.hidden_size = hidden_size

    def _initial_hidden(self, batch_size):
        return torch.zeros((batch_size, self.hidden_size))

    def forward(self, x_in, initial_hidden=None):
        if self.batch_first:
            batch_size, seq_size, feat_size = x_in.size()
            x_in = x_in.permute(1, 0, 2)
        else:
            seq_size, batch_size, feat_size = x_in.size()
    
        hiddens = []

        if initial_hidden is None:
            initial_hidden = self._initial_hidden(batch_size)
            initial_hidden = initial_hidden.to(x_in.device)

        hidden_t = initial_hidden
                    
        for t in range(seq_size):
            hidden_t = self.rnn_cell(x_in[t], hidden_t)
            hiddens.append(hidden_t)
            
        hiddens = torch.stack(hiddens)

        if self.batch_first:
            hiddens = hiddens.permute(1, 0, 2)

        return hiddens
def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

### Train

In [179]:
if args.reload_from_files:
    # training from a checkpoint
    dataset = IBPDataset.load_dataset_and_load_vectorizer(args.injury_bodyparts_file, args.vectorizer_file)
else:
    # create dataset and vectorizer
    dataset = IBPDataset.load_dataset_and_make_vectorizer(args.injury_bodyparts_file)
    dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

# Use GloVe or randomly initialized embeddings
if args.use_glove:
    words = vectorizer.NARRATIVE_vocab._token_to_idx.keys()
    embeddings, embedding_size = make_embedding_matrix(glove_filepath=args.glove_filepath, words=words)
    print("Using pre-trained embeddings")
else:
    print("Not using pre-trained embeddings")
    embeddings = None

Using pre-trained embeddings


In [180]:
# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")
    
print("Using CUDA: {}".format(args.cuda))
# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

Using CUDA: True


In [189]:
classifier = IBPClassifier(
    embedding_size=embedding_size, 
    num_embeddings=len(vectorizer.NARRATIVE_vocab),
    num_classes=len(vectorizer.target_vocab),
    rnn_hidden_size=512,
    hidden_dim = [128, 64],
    padding_idx=vectorizer.NARRATIVE_vocab.mask_index,
    pretrained_embeddings=embeddings
)

In [190]:
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
    
loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=0.0001)

train_state = make_train_state(args)

best_val  = {
    'epoch': 0,
    'model_state_dict': 0,
    'optimizer_state_dict': 0,
    'loss': 0,
    'acc':0
}

try:
    for epoch_index in range(200):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset
        # setup: batch generator, set loss and acc to 0, set train mode on
        dataset.set_split('train')
        batch_generator = generate_batches(dataset, batch_size=len(dataset), device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is these 5 steps:

            # --------------------------------------    
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = classifier(x=batch_dict['x_data'], x_lengths=batch_dict['x_length'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
    
            running_loss += (loss.item() - running_loss) / (batch_index + 1)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset\
        # setup: batch generator, set loss and acc to 0; set eval mode on

        dataset.set_split('val')
        batch_generator = generate_batches(dataset, batch_size=len(dataset), device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):
            # compute the output
            y_pred = classifier(x=batch_dict['x_data'], x_lengths=batch_dict['x_length'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            running_loss += (loss.item() - running_loss) / (batch_index + 1)

            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)
        if(best_val['acc']<train_state['val_acc'][-1]):
            best_val  = {
                'epoch': epoch_index,
                'model_state_dict': classifier.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': running_loss,
                'acc': running_acc,
            }

        if epoch_index%10 == 0:
            print('Train loss', round(train_state['train_loss'][-1], 4), 'Train acc', round(train_state['train_acc'][-1], 4), 'Val loss', round(train_state['val_loss'][-1], 4), 'Val acc', round(train_state['val_acc'][-1], 4))
            
        train_state = update_train_state(args=args, model=classifier, train_state=train_state)

except KeyboardInterrupt:
    print("Exiting loop")
torch.save(best_val, "RNNGlove.h5")

Train loss 1.6119 Train acc 6.4925 Val loss 1.6131 Val acc 6.383
Train loss 1.5998 Train acc 16.3895 Val loss 1.6041 Val acc 14.8936
Train loss 1.5783 Train acc 30.2454 Val loss 1.5865 Val acc 24.1135
Train loss 1.5176 Train acc 40.5384 Val loss 1.5151 Val acc 38.2979
Train loss 1.4725 Train acc 41.9636 Val loss 1.4954 Val acc 39.7163
Train loss 1.4131 Train acc 48.9311 Val loss 1.434 Val acc 45.3901
Train loss 1.3488 Train acc 41.4885 Val loss 1.2987 Val acc 54.6099
Train loss 1.2416 Train acc 53.6817 Val loss 1.2454 Val acc 48.227
Train loss 1.1366 Train acc 54.9485 Val loss 1.1298 Val acc 56.7376
Train loss 1.0602 Train acc 59.4616 Val loss 1.0812 Val acc 56.0284
Train loss 0.9745 Train acc 59.7783 Val loss 1.0191 Val acc 60.2837
Train loss 0.8832 Train acc 65.3207 Val loss 0.9567 Val acc 65.2482
Train loss 0.8488 Train acc 68.4877 Val loss 0.9567 Val acc 63.1206
Train loss 0.793 Train acc 67.5376 Val loss 0.9572 Val acc 59.5745
Train loss 0.7472 Train acc 71.2589 Val loss 0.9934 Va

In [191]:
checkpoint = torch.load("RNNGlove.h5")
classifier.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
classifier.eval()

classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)

dataset.set_split('test')

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred =  classifier(batch_dict['x_data'], x_lengths=batch_dict['x_length'])
    
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc
print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

Test loss: 1.0190712213516235;
Test Accuracy: 74.46808510638297


## LSTM

In [192]:
class LstmRNN(nn.Module):
    def __init__(self, input_size, hidden_size, batch_first=True):
        super(LstmRNN, self).__init__()
        
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=batch_first) 
        self.hidden_size = hidden_size
        self.batch_first = batch_first
        
    def forward(self, x):
        if self.batch_first:
            batch_first_output, (self.hidden, self.cell) = self.lstm(x)
        else:
            batch_first_output, (self.hidden, self.cell) = self.lstm(x)
        return batch_first_output[:, -1]

In [193]:
classifier = IBPClassifier(
    embedding_size=embedding_size, 
    num_embeddings=len(vectorizer.NARRATIVE_vocab),
    num_classes=len(vectorizer.target_vocab),
    rnn_hidden_size=512,
    hidden_dim = [256, 64],
    RNN=False,
    padding_idx=vectorizer.NARRATIVE_vocab.mask_index,
    pretrained_embeddings=embeddings
)

In [194]:
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
    
loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=0.0001)

train_state = make_train_state(args)

best_val  = {
    'epoch': 0,
    'model_state_dict': 0,
    'optimizer_state_dict': 0,
    'loss': 0,
    'acc':0
}

try:
    for epoch_index in range(200):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset
        # setup: batch generator, set loss and acc to 0, set train mode on
        dataset.set_split('train')
        batch_generator = generate_batches(dataset, batch_size=len(dataset), device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is these 5 steps:

            # --------------------------------------    
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = classifier(x=batch_dict['x_data'], x_lengths=batch_dict['x_length'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
    
            running_loss += (loss.item() - running_loss) / (batch_index + 1)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset\
        # setup: batch generator, set loss and acc to 0; set eval mode on

        dataset.set_split('val')
        batch_generator = generate_batches(dataset, batch_size=len(dataset), device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):
            # compute the output
            y_pred = classifier(x=batch_dict['x_data'], x_lengths=batch_dict['x_length'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            running_loss += (loss.item() - running_loss) / (batch_index + 1)

            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)
        if(best_val['acc']<train_state['val_acc'][-1]):
            best_val  = {
                'epoch': epoch_index,
                'model_state_dict': classifier.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': running_loss,
                'acc': running_acc,
            }

        if epoch_index%10 == 0:
            print('Train loss', round(train_state['train_loss'][-1], 4), 'Train acc', round(train_state['train_acc'][-1], 4), 'Val loss', round(train_state['val_loss'][-1], 4), 'Val acc', round(train_state['val_acc'][-1], 4))
            
        train_state = update_train_state(args=args, model=classifier, train_state=train_state)

except KeyboardInterrupt:
    print("Exiting loop")
torch.save(best_val, "LSTMGlove.h5")

Train loss 1.6112 Train acc 21.7736 Val loss 1.6105 Val acc 21.9858
Train loss 1.6062 Train acc 21.7736 Val loss 1.606 Val acc 21.9858
Train loss 1.5979 Train acc 32.2249 Val loss 1.5974 Val acc 33.3333
Train loss 1.5762 Train acc 37.3713 Val loss 1.5742 Val acc 34.7518
Train loss 1.5025 Train acc 34.2835 Val loss 1.4583 Val acc 35.461
Train loss 1.4033 Train acc 40.9343 Val loss 1.3786 Val acc 48.227
Train loss 1.306 Train acc 46.0808 Val loss 1.2405 Val acc 56.0284
Train loss 1.1694 Train acc 58.8282 Val loss 1.183 Val acc 56.7376
Train loss 1.0378 Train acc 65.7165 Val loss 1.1411 Val acc 62.4113
Train loss 0.9124 Train acc 66.0333 Val loss 1.0685 Val acc 64.539
Train loss 0.7956 Train acc 71.6548 Val loss 1.0085 Val acc 67.3759
Train loss 0.7246 Train acc 74.901 Val loss 0.9703 Val acc 67.3759
Train loss 0.6182 Train acc 77.1971 Val loss 0.9625 Val acc 70.2128
Train loss 0.5634 Train acc 77.5139 Val loss 0.9201 Val acc 74.4681
Train loss 0.48 Train acc 82.977 Val loss 0.9213 Val ac

In [195]:
checkpoint = torch.load("LSTMGlove.h5")
classifier.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
classifier.eval()

classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)

dataset.set_split('test')

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred =  classifier(batch_dict['x_data'], x_lengths=batch_dict['x_length'])
    
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc
print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

Test loss: 1.0109914541244507;
Test Accuracy: 74.46808510638297


# NER

# Bonus

In [48]:
mask = (df['NARRATIVE'].str.len() < 30)
df.loc[mask]['NARRATIVE'].values