In [6]:
import json
import math
import multiprocessing as mp
import os
import re
import string
from argparse import Namespace
from collections import Counter

import gensim
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.corpora import Dictionary
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from spellchecker import SpellChecker
from torch.utils.data import Dataset, DataLoader

# Init

In [7]:
# Make sure the WordNet corpus is installed before the lemmatizer is used.
# The NLTK resource lookup raises LookupError when a resource is missing (it
# never returns a falsy value), so the download is triggered from `except`.
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

# Shared, module-level NLP helpers used by the preprocessing functions below.
porter_stemmer  = PorterStemmer()
lemmatizer      = WordNetLemmatizer()
regex_tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
spell  = SpellChecker()
one_hot_vectorizer = CountVectorizer(binary=True)

In [8]:
# Central configuration for the notebook: file locations, model size,
# training schedule and runtime switches.
args = Namespace(
    # Data and Path hyper parameters
    degree_injury_file="./data/degreeinjury.csv",
    injury_bodyparts_file="./data/injurybodyparts.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.h5",
    # Raw string: "\d" is not a valid escape sequence, so a plain literal
    # triggers an invalid-escape warning. The resulting value is unchanged.
    save_dir=r"model_storage\document_classification",
    # Model hyper parameters
    use_glove=False,
    embedding_size=100,
    # Training hyper parameter
    window_size=5,
    val_proportion=0.1,
    test_proportion=0.2,
    learning_rate = 0.001,
    seed=666,
    dropout_p=0.1,
    batch_size=256,
    num_epochs=100,
    early_stopping_criteria=5,
    # Runtime option
    cuda=True,
    catch_keyboard_interrupt=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
)

## Functions

In [9]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """Yield mini-batches from ``dataset`` with every tensor moved to ``device``.

    A ``DataLoader`` is built over ``dataset`` using ``batch_size``,
    ``shuffle`` and ``drop_last``. Each batch it produces is treated as a
    mapping of name -> tensor and re-emitted as a new dict whose values have
    been sent to ``device`` with ``.to(device)``.
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        yield {key: value.to(device) for key, value in batch.items()}

def make_train_state(args):
    """Create the dictionary that tracks training progress.

    The learning rate and checkpoint filename are copied from ``args``; the
    metric histories start empty and the test metrics start at -1.
    """
    state = dict(
        stop_early=False,
        early_stopping_step=0,
        early_stopping_best_val=1e8,
        learning_rate=args.learning_rate,
        epoch_index=0,
    )
    # Per-epoch histories, appended to by the training loop.
    for history in ('train_loss', 'train_acc', 'val_loss', 'val_acc'):
        state[history] = []
    state['test_loss'] = -1
    state['test_acc'] = -1
    state['model_filename'] = args.model_state_file
    return state

def update_train_state(args, model, train_state):
    """Checkpoint the model and update the early-stopping bookkeeping.

    Args:
        args: configuration object; only ``early_stopping_criteria`` is read.
        model: module whose ``state_dict()`` is written to
            ``train_state['model_filename']`` whenever a checkpoint is taken.
        train_state: dict created by ``make_train_state``; updated in place.

    Returns:
        The same ``train_state`` dict.

    Behaviour:
        * Epoch 0 always saves, so at least one checkpoint exists.
        * Later epochs save only when the latest validation loss beats the
          best value seen so far; that best value is then recorded.
          Otherwise the patience counter is incremented.
        * ``stop_early`` becomes True once the patience counter reaches
          ``args.early_stopping_criteria``.
    """
    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        # Remember the first validation loss (when available) so that a worse
        # epoch 1 does not overwrite this checkpoint.
        if train_state['val_loss']:
            train_state['early_stopping_best_val'] = train_state['val_loss'][-1]
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_t = train_state['val_loss'][-1]

        if loss_t >= train_state['early_stopping_best_val']:
            # No improvement: one more step towards early stopping.
            train_state['early_stopping_step'] += 1
        else:
            # New best: checkpoint it and record the value. Without recording
            # it the comparison above could never succeed.
            torch.save(model.state_dict(), train_state['model_filename'])
            train_state['early_stopping_best_val'] = loss_t
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_target):
    """Return the percentage of predictions that match ``y_target``.

    ``y_pred`` holds raw logits; a prediction counts as 1 when its sigmoid
    exceeds 0.5 and as 0 otherwise. The denominator is the length of the
    prediction tensor's first dimension.
    """
    targets = y_target.cpu()
    predicted = (torch.sigmoid(y_pred) > 0.5).cpu().long()
    hits = torch.eq(predicted, targets).sum().item()
    return hits / len(predicted) * 100

def set_seed_everywhere(seed, cuda):
    """Seed NumPy and PyTorch (and all CUDA devices when ``cuda`` is truthy)."""
    torch.manual_seed(seed)
    np.random.seed(seed)
    if not cuda:
        return
    torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    """Create ``dirpath`` (including missing parents) if it does not exist.

    ``exist_ok=True`` makes the call idempotent and removes the window
    between a separate existence check and the creation.
    """
    os.makedirs(dirpath, exist_ok=True)
        
def load_glove_from_file(glove_filepath):
    """Read a GloVe-style text file of ``word v1 v2 ...`` lines.

    Returns:
        A ``(word_to_index, embeddings)`` tuple: a dict mapping each word to
        its line number, and a 2-D array whose row ``i`` holds the floats
        parsed from line ``i``.
    """
    positions = {}
    vectors = []
    with open(glove_filepath, encoding="utf8") as handle:
        for row_number, raw_line in enumerate(handle):
            # First field is the word, the remaining fields are its vector.
            token, *values = raw_line.split(" ")
            positions[token] = row_number
            vectors.append(np.array([float(value) for value in values]))
    return positions, np.stack(vectors)

def make_embedding_matrix(glove_filepath, words):
    """Build an embedding matrix with one row per entry of ``words``.

    Words present in the GloVe file get their pre-trained vector. Every other
    word gets a row drawn with Xavier-uniform initialisation.
    """
    glove_index, glove_vectors = load_glove_from_file(glove_filepath)
    dim = glove_vectors.shape[1]

    matrix = np.zeros((len(words), dim))
    for row, token in enumerate(words):
        position = glove_index.get(token)
        if position is not None:
            matrix[row, :] = glove_vectors[position]
            continue
        # Out-of-vocabulary word: random row instead of zeros.
        random_row = torch.ones(1, dim)
        torch.nn.init.xavier_uniform_(random_row)
        matrix[row, :] = random_row

    return matrix

def clean_text(text):
    """Normalise one narrative: pad punctuation, expand "EE", spell-correct.

    Steps:
      1. Strip the input and put spaces around ``. , ! ?`` so they become
         separate tokens. (Previously this was applied to the empty output
         accumulator and therefore had no effect.)
      2. Tokenise. "ee" in any casing becomes "employee", punctuation tokens
         are kept, and words of three or more characters are spell-corrected
         and lower-cased. Shorter words are dropped.
      3. Replace every run of characters other than letters and ``. , ! ?``
         with a single space.

    Returns:
        The cleaned, space-separated string.
    """
    text = text.strip()
    text = re.sub(r"([.,!?])", r" \1 ", text)

    pieces = []
    for word in tokenize(text):
        # "EE" is shorthand for employee
        if word.lower() == 'ee':
            pieces.append('employee')
        elif word in string.punctuation:
            pieces.append(word)
        elif len(word) > 2:
            # Fall back to the original word when no correction is returned.
            corrected = spell.correction(word) or word
            pieces.append(corrected.lower().strip())

    txt = ' '.join(pieces)
    txt = re.sub(r"[^a-zA-Z.,!?]+", r" ", txt)
    return txt.strip()

def tokenize(text):
    """Split ``text`` into tokens using NLTK's ``word_tokenize``."""
    tokens = nltk.word_tokenize(text)
    return tokens

def stem(words, df=False):
    """Stem every word with the shared Porter stemmer.

    Returns the list of stems, or, when ``df`` is truthy, a DataFrame with
    the columns ``original`` and ``stemmed`` for side-by-side inspection.
    """
    stems = list(map(porter_stemmer.stem, words))
    if not df:
        return stems
    return pd.DataFrame({'original': words, 'stemmed': stems})

def get_wordnet_pos(tag):
    """Map a POS tag to the matching WordNet POS constant.

    The decision is made on the first character of ``tag``: J -> ADJ,
    V -> VERB, N -> NOUN, R -> ADV. Any other tag yields ``None``.
    """
    attr_by_prefix = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attr_name = attr_by_prefix.get(tag[:1])
    if attr_name is None:
        return None
    # Looked up lazily so WordNet is only touched for a recognised tag.
    return getattr(nltk.corpus.wordnet, attr_name)

def lemmatize(words, df=False):
    """Lemmatise ``words`` using their POS tags.

    Each word is tagged with ``nltk.pos_tag`` and the tag is mapped to a
    WordNet POS via ``get_wordnet_pos``. Tags with no mapping fall back to
    NOUN. (The former ``is None`` branch could never run because of that
    fallback, so it has been removed.)

    Returns the list of lemmas, or, when ``df`` is truthy, a DataFrame with
    the columns ``original`` and ``lemmatized``.
    """
    lemmatized_words = [
        lemmatizer.lemmatize(
            word, pos=get_wordnet_pos(tag) or nltk.corpus.wordnet.NOUN)
        for word, tag in nltk.pos_tag(words)
    ]

    if df:
        return pd.DataFrame({'original': words, 'lemmatized': lemmatized_words})
    return lemmatized_words

def SK_TFIDF_stopwords(corpus, vectorizer):
    """Fit ``vectorizer`` on ``corpus`` and return ``(stop_words_, matrix)``.

    ``stop_words_`` is read from the fitted vectorizer; ``matrix`` is the
    result of transforming the same corpus.
    """
    vectorizer.fit(corpus)
    matrix = vectorizer.transform(corpus)
    excluded_terms = vectorizer.stop_words_
    return excluded_terms, matrix

def get_words(texts):
    """Tokenise every text with ``regex_tokenizer`` and return all tokens as
    a single column array of shape ``(n_tokens, 1)``."""
    flat = [token for text in texts for token in regex_tokenizer.tokenize(text)]
    return np.asarray(flat).reshape(-1, 1)

def OHE_2D(sent, sparse=False):
    """One-hot encode every word of every sentence and pad to equal length.

    Args:
        sent: pandas Series whose values are lists of words.
        sparse: forwarded to ``OneHotEncoder``. The padded result is always
            dense because the padding itself is a dense zero block.

    Returns:
        A list with one ``(maxwords, vocabulary_size)`` array per sentence,
        where ``maxwords`` is the longest sentence length rounded up to the
        next multiple of 10.
    """
    # scikit-learn renamed `sparse` to `sparse_output` and later removed the
    # old keyword; try the new name first and fall back for older releases.
    try:
        enc = OneHotEncoder(handle_unknown='ignore', sparse_output=sparse)
    except TypeError:
        enc = OneHotEncoder(handle_unknown='ignore', sparse=sparse)

    vocabulary = [word for words in sent.values for word in words]
    enc.fit(np.array(vocabulary).reshape(-1, 1))

    def _encode(words):
        encoded = enc.transform(np.array(words).reshape(-1, 1))
        # Densify sparse output so it can be measured and padded below.
        return encoded.toarray() if hasattr(encoded, 'toarray') else encoded

    ohe = sent.apply(_encode)

    # Round up to without unit digit
    maxwords = max([len(x) for x in ohe])
    maxwords = math.ceil(maxwords/10)*10
    # Taken from the encoder rather than from `ohe[0]`, which is a label
    # lookup and fails when the Series index does not contain 0.
    dim = len(enc.categories_[0])
    print('Vector length:          ', dim, '\nMaximum number of words:', maxwords)

    return [np.concatenate((x, np.zeros((maxwords - len(x), dim))), axis=0) for x in ohe]

def OHE_1D(sent):
    """Encode each sentence as one binary bag-of-words row.

    Args:
        sent: pandas Series whose values are lists of words.

    Returns:
        DataFrame with one row per sentence and one 0/1 column per
        vocabulary term found by ``CountVectorizer(binary=True)``.
    """
    corpus = [' '.join(x) for x in sent.values]
    print(corpus[0])
    ohe = CountVectorizer(binary=True)
    matrix = ohe.fit_transform(corpus)
    # `get_feature_names` was removed from scikit-learn; prefer its
    # replacement and keep the old call for older installations.
    if hasattr(ohe, 'get_feature_names_out'):
        columns = ohe.get_feature_names_out()
    else:
        columns = ohe.get_feature_names()
    # `.toarray()` gives a plain ndarray (`.todense()` gives np.matrix).
    return pd.DataFrame(matrix.toarray(), columns=columns)

def spell_check(words):
    """Return ``words`` with each entry replaced by its spelling correction.

    When the spell checker returns nothing for a word (``None`` or an empty
    string), the original word is kept so the output has no ``None`` entries.
    """
    return [spell.correction(word) or word for word in words]

def remove_stopwords(words, stopwords):
    """Return the entries of ``words`` that are not in ``stopwords``, in order."""
    return list(filter(lambda word: word not in stopwords, words))

def split_save(data, path, test=0.2, valid=0.1, random_state=666):
    """Split ``data`` into stratified train/val/test parts and save one CSV.

    Args:
        data: DataFrame with a ``target`` column used for stratification.
        path: destination CSV path.
        test: fraction of ``data`` held out as the test split.
        valid: fraction of the remaining (non-test) rows used for validation.
            Note this is relative to the training remainder, not to the
            whole data set.
        random_state: seed for both splits (defaults to the value that was
            previously hard-coded).

    The written file holds all rows plus a ``split`` column with the values
    ``train``, ``val`` or ``test``.
    """
    train_part, test_part = train_test_split(
        data, test_size=test, random_state=random_state,
        shuffle=True, stratify=data['target'])
    train_part, valid_part = train_test_split(
        train_part, test_size=valid, random_state=random_state,
        shuffle=True, stratify=train_part['target'])

    # `.assign` returns new frames, so no column is written onto a slice of
    # the caller's DataFrame.
    labelled = [
        train_part.assign(split='train'),
        valid_part.assign(split='val'),
        test_part.assign(split='test'),
    ]
    pd.concat(labelled).to_csv(path, index=False)

In [10]:
# Cleaning the narratives is computationally expensive; run this section only when necessary.

# Clean Narrative

In [78]:
# Load the raw records and keep only the label and narrative columns.
df = pd.read_csv('us_data_2000.csv').loc[
    :, ['DEGREE_INJURY', 'DEGREE_INJURY_CD', 'INJ_BODY_PART_CD', 'INJ_BODY_PART', 'NARRATIVE']
]
df.head(1)

Unnamed: 0,DEGREE_INJURY,DEGREE_INJURY_CD,INJ_BODY_PART_CD,INJ_BODY_PART,NARRATIVE
0,DAYS RESTRICTED ACTIVITY ONLY,5,700,MULTIPLE PARTS (MORE THAN ONE MAJOR),Employee was cleaning up at the Primary Crushe...


In [79]:
# Per-column flag: does the column contain any missing value?
df.isna().any()

DEGREE_INJURY       False
DEGREE_INJURY_CD    False
INJ_BODY_PART_CD    False
INJ_BODY_PART       False
NARRATIVE           False
dtype: bool

In [80]:
# Data-quality counts: missing or very short narratives, and "?" placeholders
# in the two label-code columns.
print('NARRATIVE NaN count:   ', df['NARRATIVE'].isna().sum())
# The label now matches the condition: narratives shorter than 10 characters.
print('NARRATIVE len<10 count:', (df['NARRATIVE'].str.len() < 10).sum())
print('\nDEGREE_INJURY_CD "?" count: ', df[df['DEGREE_INJURY_CD'] == '?'].shape[0])
print('\nINJ_BODY_PART_CD "?" count: ', df[df['INJ_BODY_PART_CD'] == '?'].shape[0])

NARRATIVE NaN count:    0
NARRATIVE len>10 count: 1

DEGREE_INJURY_CD "?" count:  11

INJ_BODY_PART_CD "?" count:  242


#### Remove narratives that are missing or too short

In [81]:
print("Origin data size:      ", df.shape[0])
# Keep narratives that are present and at least 10 characters long. This is
# the exact complement of the `len < 10` diagnostic, so a narrative of
# exactly 10 characters is no longer dropped.
df = df[df['NARRATIVE'].notna() & (df['NARRATIVE'].str.len() >= 10)]
print("Clean data size:", df.shape[0])

Origin data size:       2000
Clean data size: 1999


### Convert to lowercase and correct spelling

In [82]:
# Clean every narrative in parallel, one worker process per CPU core.
with mp.Pool(mp.cpu_count()) as pool:
    cleaned_narratives = pool.map(clean_text, df['NARRATIVE'].tolist())
df['NARRATIVE'] = cleaned_narratives
df['NARRATIVE'].iloc[0]

'employee was cleaning the primary crusher with the dingo skid steer . the employee slipped and fell while operating the skid steer and the machine pinned him against the cement retaining wall .'

## Dataset for task 1

In [83]:
# Task 1 frame: narrative plus degree-of-injury code and label, without the
# rows whose code is the "?" placeholder; the code column is then made numeric.
df_degree = df.loc[
    df['DEGREE_INJURY_CD'] != '?',
    ['NARRATIVE', 'DEGREE_INJURY_CD', 'DEGREE_INJURY'],
].copy()
df_degree['DEGREE_INJURY_CD'] = pd.to_numeric(df_degree['DEGREE_INJURY_CD'])

def get_degree_injury(dataframe):
    """Return the lookup table of injury-degree codes and their labels.

    The result has one row per distinct ``DEGREE_INJURY_CD``, sorted by
    code, with the label taken from the first row carrying that code. The
    codes are read from the data, so gaps in the numbering (or codes that
    do not start at 0) no longer raise ``IndexError``.
    """
    return (
        dataframe[['DEGREE_INJURY_CD', 'DEGREE_INJURY']]
        .drop_duplicates(subset='DEGREE_INJURY_CD')
        .sort_values('DEGREE_INJURY_CD')
        .reset_index(drop=True)
    )

# Display the code -> label lookup table built from the task 1 frame.
get_degree_injury(df_degree)

Unnamed: 0,DEGREE_INJURY_CD,DEGREE_INJURY
0,0,ACCIDENT ONLY
1,1,FATALITY
2,2,PERM TOT OR PERM PRTL DISABLTY
3,3,DAYS AWAY FROM WORK ONLY
4,4,DYS AWY FRM WRK & RESTRCTD ACT
5,5,DAYS RESTRICTED ACTIVITY ONLY
6,6,"NO DYS AWY FRM WRK,NO RSTR ACT"
7,7,OCCUPATNAL ILLNESS NOT DEG 1-6
8,8,INJURIES DUE TO NATURAL CAUSES
9,9,INJURIES INVOLVNG NONEMPLOYEES


In [84]:
# Class balance of the degree-of-injury labels, in percent.
df_degree['DEGREE_INJURY'].value_counts(normalize=True) * 100

DAYS AWAY FROM WORK ONLY          29.879276
NO DYS AWY FRM WRK,NO RSTR ACT    27.766600
DAYS RESTRICTED ACTIVITY ONLY     18.058350
ACCIDENT ONLY                     11.016097
DYS AWY FRM WRK & RESTRCTD ACT     7.293763
OCCUPATNAL ILLNESS NOT DEG 1-6     2.867203
ALL OTHER CASES (INCL 1ST AID)     1.006036
PERM TOT OR PERM PRTL DISABLTY     0.905433
FATALITY                           0.553320
INJURIES DUE TO NATURAL CAUSES     0.503018
INJURIES INVOLVNG NONEMPLOYEES     0.150905
Name: DEGREE_INJURY, dtype: float64

In [85]:
# Labels that form the positive class of the binary task.
group1 = [
    'DAYS AWAY FROM WORK ONLY',
    'DAYS RESTRICTED ACTIVITY ONLY',
    'DYS AWY FRM WRK & RESTRCTD ACT'
]
# Add the "True"/"False" string target, drop the raw label columns, then
# write the stratified train/val/test file.
df_degree = (
    df_degree
    .assign(target=df_degree['DEGREE_INJURY'].isin(group1).astype(str))
    .drop(columns=['DEGREE_INJURY', 'DEGREE_INJURY_CD'])
)
split_save(df_degree, path=args.degree_injury_file)

### Dataset for task 2

In [86]:
# Raw INJ_BODY_PART labels grouped into coarse body regions.
HEAD     = ['SKULL', 'EAR(S) INTERNAL & HEARING', 'EAR(S) EXTERNAL', 'EAR(S) INTERNAL & EXTERNAL' ,'EYE(S) OPTIC NERVE/VISON', 'NOSE/NASAL PASSAGES/SINUS/SMELL', 'EAR(S) INTERNAL & HEARING', 'BRAIN', 'FACE,NEC', 'FACE, MULTIPLE PARTS', 'HEAD, MULTIPLE PARTS', 'HEAD,NEC', 'MULTIPLE PARTS', 'MOUTH/LIP/TEETH/TONGUE/THROAT/TASTE', 'JAW INCLUDE CHIN', 'SCALP']
LEG      = ['LEG, MULTIPLE PARTS', 'LOWER LEG/TIBIA/FIBULA', 'FOOT(NOT ANKLE/TOE)/TARSUS/METATARSUS', 'LEG, NEC', 'KNEE/PATELLA', 'TOE(S)/PHALANGES', 'ANKLE', 'THIGH/FEMUR', 'LOWER EXTREMITIES,NEC', 'LOWER EXTREMITIES, MULTIPLE PARTS']
ARM      = ['ARM, MULTIPLE PARTS', 'ARM,NEC', 'HAND (NOT WRIST OR FINGERS)', 'WRIST', 'FINGER(S)/THUMB', 'UPPER ARM/HUMERUS', 'SHOULDERS (COLLARBONE/CLAVICLE/SCAPULA)', 'ELBOW', 'FOREARM/ULNAR/RADIUS', 'UPPER EXTREMITIES, MULTIPLE', 'UPPER EXTREMITIES, NEC']
BODY     = ['CHEST (RIBS/BREAST BONE/CHEST ORGNS)', 'BODY SYSTEMS', 'ABDOMEN/INTERNAL ORGANS', 'BODY PARTS, NEC', 'TRUNK, MULTIPLE PARTS', 'TRUNK,NEC', 'HIPS (PELVIS/ORGANS/KIDNEYS/BUTTOCKS)']
BACK     = ['BACK (MUSCLES/SPINE/S-CORD/TAILBONE)']
MULTIPLE = ['MULTIPLE PARTS (MORE THAN ONE MAJOR)']

# Task 2 frame: narrative plus body-part label, without the "?" code rows.
df_parts=df[['NARRATIVE', 'INJ_BODY_PART_CD', 'INJ_BODY_PART']].copy()
df_parts = df_parts[df_parts['INJ_BODY_PART_CD'] != '?'].copy()
del df_parts['INJ_BODY_PART_CD']

# Collapse raw labels into regions with a single `.loc` assignment per group.
# The previous chained indexing (`df[col][mask] = value`) may write to a
# temporary copy instead of the frame. Labels not listed in any group keep
# their raw value.
for region, raw_labels in (('Head', HEAD), ('Leg', LEG), ('Arm', ARM),
                           ('Body', BODY), ('Back', BACK),
                           ('Multiple Parts', MULTIPLE)):
    df_parts.loc[df_parts['INJ_BODY_PART'].isin(raw_labels), 'INJ_BODY_PART'] = region
df_parts['target'] = df_parts['INJ_BODY_PART']
del df_parts['INJ_BODY_PART']
# Save the body-part frame. This call used to pass `df_degree`, which wrote
# the task 1 data into the task 2 file.
# NOTE(review): stratified splitting needs enough rows per target class;
# verify that rare, unmapped labels do not break the split.
split_save(df_parts, path=args.injury_bodyparts_file)

### Dataset for task 3

In [87]:
# TODO

## Data Preprocessing and Preparation

#### Test lem and stem

In [88]:
# Stem and lemmatise the first narrative so both outputs can be compared.
sample_tokens = nltk.tokenize.word_tokenize(df['NARRATIVE'].values[0])
df_stem      = stem(sample_tokens, df=True)
df_lemmatize = lemmatize(sample_tokens, df=True)
df_stem['lemmatized'] = df_lemmatize['lemmatized']

In [89]:
# Tokens for which stemming and lemmatisation disagree.
df_stem.loc[df_stem['stemmed'] != df_stem['lemmatized']]

Unnamed: 0,original,stemmed,lemmatized
0,employee,employe,employee
1,was,wa,be
4,primary,primari,primary
13,employee,employe,employee
18,operating,oper,operate
24,machine,machin,machine


### Test Stopwords

In [90]:
# Derive corpus-specific stop words: terms appearing in more than 13% of the
# documents are excluded by the vectorizer and reported in `stop_words_`.
vectorizer = TfidfVectorizer(max_df=0.13, min_df=1)
_df = df[["NARRATIVE"]].copy()
# tokenize -> lemmatize -> stem, applied to every narrative
_df['NARRATIVE'] = _df['NARRATIVE'].apply(lambda x: stem(lemmatize(tokenize(x))))

stopwrods, X = SK_TFIDF_stopwords(_df['NARRATIVE'].apply(lambda x: ' '.join(x)), vectorizer)

# `get_feature_names` was removed from scikit-learn; prefer its replacement
# and keep the old call for older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

# sum tfidf frequency of each term through documents
sums = X.sum(axis=0)

# connecting term to its sums frequency
data = [(term, sums[0, col]) for col, term in enumerate(terms)]
print('Number of stopwords:', len(stopwrods))
print('Number of terms    :', len(terms))

ranking = pd.DataFrame(data, columns=['term','rank'])
print('Ascending')
print(ranking.sort_values('rank', ascending=True)[0:15])

print('\nDescending')
print(ranking.sort_values('rank', ascending=False)[0:15])

Number of stopwords: 19
Number of terms    : 3105
Ascending
               term      rank
1893  perpendicular  0.144312
198            babi  0.150171
36       acupunctur  0.150171
3006             wd  0.151109
2909    unknowingli  0.151109
2401        silicon  0.158104
2813            tow  0.158681
1860       particip  0.159118
904         exercis  0.159118
1302            ind  0.163037
2422            sky  0.167821
1038          forth  0.167821
825          easier  0.167821
2847         tricep  0.168713
2809           toss  0.168713

Descending
        term       rank
938     fall  52.162024
976   finger  43.838449
633      cut  43.770485
2232    rock  42.809985
2240    roof  42.396292
306     bolt  41.885435
2568    step  37.096267
1021    foot  35.736065
1813     out  35.322635
2607  strike  34.484985
1845    pain  34.337641
1914    piec  34.113770
2854   truck  32.382425
958     felt  31.701446
2725    that  31.464384




In [97]:
# Manually add a few extra tokens to the derived stop-word set.
stopwrods.update(('abb', 'abc', 'dia', 'yes', 'yee', 'that'))

In [98]:
# Inspect the stop-word set, including the manual additions.
print(stopwrods)

{'work', 'when', 'the', 'hi', 'that', 'and', 'right', 'have', 'hand', 'be', 'off', 'with', 'employe', 'caus', 'yes', 'abb', 'fell', 'slip', 'left', 'back', 'abc', 'while', 'from', 'yee', 'dia'}


# Functions for repeatable preprocessing and preparation

In [100]:
def tok_lem_sem(text):
    """Tokenise ``text``, lemmatise the tokens, then stem the lemmas."""
    return stem(lemmatize(tokenize(text)))

def removestop_join(words, stopwords):
    """Drop the entries of ``words`` found in ``stopwords`` and join the
    remaining ones with single spaces."""
    return ' '.join(word for word in words if word not in stopwords)

def prepreocessing(data, stop, target="NARRATIVE"):
    """Return a copy of ``data`` with the ``target`` text column preprocessed.

    Each text is tokenised, lemmatised and stemmed in a pool of worker
    processes (one per CPU core). Stop words in ``stop`` are then removed
    and the remaining tokens are joined back into one string.

    Args:
        data: DataFrame holding the text column; it is not modified.
        stop: collection of stop words to remove.
        target: name of the text column.
    """
    _df=data.copy()
    with mp.Pool(mp.cpu_count()) as pool:
        _df[target] = pool.map(tok_lem_sem, _df[target])
    # Only one column is read, so map over it directly rather than applying
    # row-wise over the whole frame. This also behaves on an empty frame.
    _df[target] = _df[target].map(lambda words: removestop_join(words, stop))
    return _df

In [101]:
# Preprocess the task 1 file and write it back to the same path. Re-running
# this cell preprocesses the already-processed text again.
degree_processed = prepreocessing(pd.read_csv(args.degree_injury_file), stopwrods)
degree_processed.to_csv(args.degree_injury_file, index=False)

In [102]:
# Preprocess the task 2 file and write it back to the same path. Re-running
# this cell preprocesses the already-processed text again.
bodyparts_processed = prepreocessing(pd.read_csv(args.injury_bodyparts_file), stopwrods)
bodyparts_processed.to_csv(args.injury_bodyparts_file, index=False)

# Binary Document Classification

In [11]:
class OHEVocabulary(object):
    """Bidirectional mapping between tokens and integer indices.

    Indices are handed out in insertion order starting at 0. When
    ``add_unk`` is true the unknown token is inserted at construction time
    and is returned by ``lookup_token`` for any token not in the vocabulary.
    """
    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        """
        Args:
            token_to_idx: existing token -> index mapping to start from. The
                dict is used as-is, not copied.
            add_unk: whether to register ``unk_token``.
            unk_token: the token standing in for unknown words.
        """
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token
                                for token, idx in self._token_to_idx.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token
        # -1 means "no UNK token". The previous default of 1 was a valid
        # index, so it could not be told apart from a real UNK position by
        # code that tests `unk_index >= 0`.
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def to_serializable(self):
        """Return a dict from which ``from_serializable`` can rebuild this object."""
        return {'token_to_idx': self._token_to_idx, 'add_unk': self._add_unk, 'unk_token': self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        """Rebuild a vocabulary from the dict produced by ``to_serializable``."""
        return cls(**contents)

    def add_token(self, token):
        """Register ``token`` if it is new and return its index."""
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Return the index of ``token``.

        Unknown tokens map to ``unk_index`` when the vocabulary was built
        with ``add_unk``; otherwise they raise ``KeyError``.
        """
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token stored at ``index``; raise ``KeyError`` if absent."""
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

class OHESequenceVocabulary(OHEVocabulary):
    """Vocabulary that also registers mask, unknown, begin and end tokens."""
    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):

        # Forward `unk_token` so the base class registers the same unknown
        # token as this class. Otherwise a custom `unk_token` would end up
        # next to a stray default "<UNK>" entry.
        super(OHESequenceVocabulary, self).__init__(token_to_idx,
                                                    unk_token=unk_token)

        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    def to_serializable(self):
        """Return a dict accepted by ``from_serializable`` / ``__init__``."""
        contents = super(OHESequenceVocabulary, self).to_serializable()
        # The base class emits `add_unk`, which this class's __init__ does
        # not accept. Leaving it in made `from_serializable` raise TypeError.
        contents.pop('add_unk', None)
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         'begin_seq_token': self._begin_seq_token,
                         'end_seq_token': self._end_seq_token})
        return contents

    def lookup_token(self, token):
        """Return the index of ``token``, or ``unk_index`` for unknown tokens."""
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

class OHEVectorizer(object):
    """Pairs a narrative vocabulary with a target vocabulary and converts
    narrative text into a fixed-length 0/1 vector."""
    def __init__(self, NARRATIVE_vocab, target_vocab):
        self.NARRATIVE_vocab = NARRATIVE_vocab
        self.target_vocab = target_vocab

    def vectorize(self, NARRATIVE):
        """Return a float32 vector with a 1 at the index of every token of
        ``NARRATIVE`` (split on single spaces), skipping tokens that are
        substrings of ``string.punctuation``."""
        encoding = np.zeros(len(self.NARRATIVE_vocab), dtype=np.float32)
        kept_tokens = [tok for tok in NARRATIVE.split(" ")
                       if tok not in string.punctuation]
        for tok in kept_tokens:
            encoding[self.NARRATIVE_vocab.lookup_token(tok)] = 1
        return encoding

    @classmethod
    def from_dataframe(cls, df, cutoff=25):
        """Build both vocabularies from ``df``.

        Targets are added in sorted order. A narrative token is added only
        if it occurs at least ``cutoff`` times across ``df.NARRATIVE``.
        """
        target_vocab = OHEVocabulary()
        for label in sorted(set(df.target)):
            target_vocab.add_token(label)

        # Count every non-punctuation token over all narratives.
        word_counts = Counter(
            tok
            for text in df.NARRATIVE
            for tok in text.split(" ")
            if tok not in string.punctuation
        )

        NARRATIVE_vocab = OHESequenceVocabulary()
        frequent_words = (word for word, count in word_counts.items()
                          if count >= cutoff)
        for word in frequent_words:
            NARRATIVE_vocab.add_token(word)

        return cls(NARRATIVE_vocab, target_vocab)

    @classmethod
    def from_serializable(cls, contents):
        """Rebuild a vectorizer from the dict produced by ``to_serializable``."""
        narrative_part = contents['NARRATIVE_vocab']
        target_part = contents['target_vocab']
        return cls(
            NARRATIVE_vocab=OHESequenceVocabulary.from_serializable(narrative_part),
            target_vocab=OHEVocabulary.from_serializable(target_part),
        )

    def to_serializable(self):
        """Return a dict holding the serialised form of both vocabularies."""
        serialised = {}
        serialised['NARRATIVE_vocab'] = self.NARRATIVE_vocab.to_serializable()
        serialised['target_vocab'] = self.target_vocab.to_serializable()
        return serialised

class OHEDataset(Dataset):
    """Torch ``Dataset`` over a frame with NARRATIVE, target and split columns.

    The frame is partitioned by its ``split`` column into train, val and
    test. ``set_split`` chooses which partition ``len()`` and indexing refer
    to. Items are dicts with the vectorised narrative (``x_data``) and the
    integer target (``y_target``).
    """
    def __init__(self, df, vectorizer):
        """
        Args:
            df: DataFrame with the columns NARRATIVE, target and split.
            vectorizer: object providing ``vectorize(text)`` and a
                ``target_vocab`` with ``lookup_token``.
        """
        self.df = df
        self._vectorizer = vectorizer

        # +1 if only using begin_seq, +2 if using both begin and end seq tokens
        measure_len = lambda context: len(context.split(" "))
        self._max_seq_length = max(map(measure_len, df.NARRATIVE)) + 2

        self.train_df = self.df[self.df.split=='train']
        self.train_size = len(self.train_df)
        self.val_df = self.df[self.df.split=='val']
        self.validation_size = len(self.val_df)
        self.test_df = self.df[self.df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),'val': (self.val_df, self.validation_size),'test': (self.test_df, self.test_size)}

        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, csv):
        """Read ``csv`` and build a fresh vectorizer from its contents."""
        df = pd.read_csv(csv)
        return cls(df, OHEVectorizer.from_dataframe(df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, news_csv, vectorizer_filepath):
        """Read ``news_csv`` and reuse a vectorizer saved with ``save_vectorizer``."""
        df = pd.read_csv(news_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        # Pass the loaded frame. The CSV path used to be passed here, which
        # failed inside __init__.
        return cls(df, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        """Load a serialised vectorizer from a JSON file."""
        with open(vectorizer_filepath) as fp:
            # Was `NameVectorizer`, a name that is not defined in this notebook.
            return OHEVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        """Write the vectorizer to ``vectorizer_filepath`` as JSON."""
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        """Return the vectorizer used by this dataset."""
        return self._vectorizer

    def set_split(self, split="train"):
        """Select the active partition: 'train', 'val' or 'test'."""
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """Return ``{'x_data': vector, 'y_target': int}`` for one row of the
        active partition."""
        row = self._target_df.iloc[index]
        NARRATIVE_vector = self._vectorizer.vectorize(row.NARRATIVE)
        # The -1 shift assumes index 0 of the target vocabulary is its UNK
        # token, as produced by OHEVectorizer.from_dataframe, so the real
        # classes start at 0.
        target_index = self._vectorizer.target_vocab.lookup_token(row.target)-1
        return {'x_data': NARRATIVE_vector,'y_target': target_index}

    def get_num_batches(self, batch_size):
        """Number of full batches of ``batch_size`` in the active partition."""
        return len(self) // batch_size

class OHEClassifier(nn.Module):
    """Feed-forward classifier: three hidden ReLU layers and a linear output."""
    def __init__(self, input_dim, hidden_dims, output_dim=1):
        """
        Args:
            input_dim: size of the input feature vector.
            hidden_dims: sequence with the sizes of the three hidden layers.
            output_dim: number of output logits. This argument used to be
                ignored in favour of a hard-coded 1; the default keeps that
                behaviour.
        """
        super(OHEClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dims[0])
        self.relu1 = nn.ReLU()

        self.fc2 = nn.Linear(hidden_dims[0], hidden_dims[1])
        self.relu2 = nn.ReLU()

        self.fc3 = nn.Linear(hidden_dims[1], hidden_dims[2])
        self.relu3 = nn.ReLU()

        self.fc4 = nn.Linear(hidden_dims[2], output_dim)

    def forward(self, x, apply_sigmoid=False):
        """Return the logits for ``x`` (probabilities if ``apply_sigmoid``).

        With a single output unit the trailing dimension is removed, so a
        ``(batch, features)`` input yields a ``(batch,)`` output.
        """
        out = self.relu1(self.fc1(x))
        out = self.relu2(self.fc2(out))
        out = self.relu3(self.fc3(out))
        out = self.fc4(out)

        if apply_sigmoid:
            out = torch.sigmoid(out)
        # Squeeze only the last dimension. A bare squeeze() also removed the
        # batch dimension for a batch of one, giving a 0-d tensor that no
        # longer matched the target's shape.
        return out.squeeze(-1)

In [42]:
# Runtime: fall back to CPU when CUDA is unavailable.
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")

# Seed NumPy / PyTorch before any weights are created so runs are repeatable.
set_seed_everywhere(args.seed, args.cuda)

train_state = make_train_state(args)

# dataset and vectorizer
dataset = OHEDataset.load_dataset_and_make_vectorizer(args.degree_injury_file)
vectorizer = dataset.get_vectorizer()
# model
classifier = OHEClassifier(input_dim=len(vectorizer.NARRATIVE_vocab), hidden_dims=[256, 256, 128])
classifier = classifier.to(args.device)
# loss and optimizer
loss_func = nn.BCEWithLogitsLoss()
# NOTE(review): this learning rate (1e-4) differs from args.learning_rate
# (1e-3), which is the value recorded in train_state. Confirm which one is
# intended.
optimizer = optim.Adam(classifier.parameters(), lr=0.0001)

In [43]:
# Size of the narrative vocabulary, i.e. the classifier's input dimension.
len(vectorizer.NARRATIVE_vocab)

332

In [44]:
# Train with one full-batch step per epoch (the batch size equals the split
# size) and checkpoint to "OHE.h5" whenever validation accuracy is at least
# as good as the best seen so far.
best_val = 0
for epoch_index in range(args.num_epochs):
    train_state['epoch_index'] = epoch_index
    # Iterate over training dataset
    # setup: batch generator, set loss and acc to 0, set train mode on
    dataset.set_split('train')
    batch_generator = generate_batches(dataset, batch_size=len(dataset), device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()
    for batch_index, batch_dict in enumerate(batch_generator):
        # the training routine is 5 steps:
        # step 1. zero the gradients
        optimizer.zero_grad()
        # step 2. compute the output
        y_pred = classifier(batch_dict['x_data'].float())
        # step 3. compute the loss
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        # running mean over the batches seen so far
        running_loss += (loss_batch-running_loss) / (batch_index + 1)
        # step 4. use loss to produce gradients
        loss.backward()
        # step 5. use optimizer to take gradient step
        optimizer.step()
        # compute the accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)

    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)

    dataset.set_split('val')
    batch_generator = generate_batches(dataset, batch_size=len(dataset), device=args.device)
    running_loss = 0.
    running_acc = 0.
    classifier.eval()

    # No gradients are needed for validation; skipping them avoids building
    # the autograd graph and leaves the computed values unchanged.
    with torch.no_grad():
        for batch_index, batch_dict in enumerate(batch_generator):
            # step 1. compute the output
            y_pred = classifier(batch_dict['x_data'].float())
            # step 2. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'].float())
            loss_batch = loss.item()
            running_loss += (loss_batch - running_loss) / (batch_index + 1)
            # step 3. compute the accuracy
            acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_batch - running_acc) / (batch_index + 1)
    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)
    if(best_val<=running_acc):
        torch.save({
            'epoch': epoch_index,
            'model_state_dict': classifier.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': running_loss,
            }, "OHE.h5")
        best_val = running_acc

    if epoch_index%5 == 0:
        print('Train loss', round(train_state['train_loss'][-1], 4), 'Train acc', round(train_state['train_acc'][-1], 4), 'Val loss', round(train_state['val_loss'][-1], 4), 'Val acc', round(train_state['val_acc'][-1], 4))

Train loss 0.6996 Train acc 44.7939 Val loss 0.699 Val acc 44.6541
Train loss 0.6975 Train acc 44.7939 Val loss 0.697 Val acc 44.6541
Train loss 0.6955 Train acc 44.7939 Val loss 0.6951 Val acc 44.6541
Train loss 0.6935 Train acc 44.8637 Val loss 0.6931 Val acc 45.283
Train loss 0.6914 Train acc 49.3361 Val loss 0.6911 Val acc 49.6855
Train loss 0.6891 Train acc 62.6136 Val loss 0.6888 Val acc 60.3774
Train loss 0.6862 Train acc 69.9511 Val loss 0.6861 Val acc 71.0692
Train loss 0.6826 Train acc 66.9462 Val loss 0.6828 Val acc 69.1824
Train loss 0.6781 Train acc 65.5486 Val loss 0.6786 Val acc 64.7799
Train loss 0.6724 Train acc 64.7799 Val loss 0.6734 Val acc 63.522
Train loss 0.6654 Train acc 64.2907 Val loss 0.667 Val acc 63.522
Train loss 0.6568 Train acc 64.6401 Val loss 0.6595 Val acc 62.8931
Train loss 0.6463 Train acc 65.5486 Val loss 0.6507 Val acc 62.2642
Train loss 0.6338 Train acc 67.0161 Val loss 0.6405 Val acc 65.4088
Train loss 0.619 Train acc 69.1824 Val loss 0.6288 Val

In [45]:
# Evaluate the best checkpoint on the held-out test split.
dataset.set_split('test')
batch_generator = generate_batches(dataset, batch_size=len(dataset),device=args.device)
running_loss = 0.
running_acc = 0.
# map_location lets a checkpoint saved on GPU be restored on a CPU-only
# machine, and the other way round.
checkpoint = torch.load("OHE.h5", map_location=args.device)
classifier.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
classifier.eval()
# Inference only: no gradients required.
with torch.no_grad():
    for batch_index, batch_dict in enumerate(batch_generator):
        # compute the output
        y_pred = classifier(x=batch_dict['x_data'].float())
        # compute the loss
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)
        # compute the accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)
train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [46]:
# Report the test metrics stored by the evaluation cell.
print(f"Test loss: {train_state['test_loss']:.3f}")
print(f"Test Accuracy: {train_state['test_acc']:.2f}")

Test loss: 0.554
Test Accuracy: 74.12


## Word embedding 

# Multi-class Document Classification

# NER

# Bonus

In [48]:
# Narratives shorter than 30 characters.
mask = df['NARRATIVE'].str.len().lt(30)
df.loc[mask, 'NARRATIVE'].values