<a href="https://colab.research.google.com/github/Alexjmsherman/nlp_practicum_cohort3_student/blob/master/Deep_Learning_for_NLP_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Deep Learning for NLP

##### Author: Alex Sherman | alsherman@deloitte.com


Agenda:
- Custom embeddings
- CNN
- LSTM
- Contextual Embeddings

In [1]:
import os
import numpy as np
from numpy import zeros
import pandas as pd
import requests
from zipfile import ZipFile 
from bs4 import BeautifulSoup
from gensim.utils import simple_preprocess
from gensim.models import KeyedVectors
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, \
    Embedding, Input, GlobalAveragePooling1D, GlobalMaxPooling1D, SpatialDropout1D, GlobalMaxPool1D
from keras.layers.embeddings import Embedding
from keras.models import Model, Sequential
from keras.initializers import Constant
from keras.utils import to_categorical

Using TensorFlow backend.


In [0]:
# file names for lesson

NIH_EXPORTER_CSV = r'exporter_train_data.csv'
URL = r'https://exporter.nih.gov/ExPORTER_Catalog.aspx'

In [3]:
# optional  - mount google drive to save data (to avoid repeating large file downloads)

from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


# Problem Definition

Predict the National Institutes of Health (NIH) Institute of Center (IC) from the Project Title of previously funded projects

### Download and Store Data 

##### WARNING - LARGE DOWNLOAD (1 HR+)
- It is recommended to skip this section as data is provided separately below

- If you want to run the code in this section you must uncomment !wget which is commented to avoid accidental large file downloads

In [0]:
# get the csv names of all project csvs at https://exporter.nih.gov

r = requests.get(URL)
b = BeautifulSoup(r.text)
table = b.find('table', attrs={'class':'header_band_bg'})

for row in table.find('table').find_all('tr', attrs={'class':'row_bg'}):
    row_cells = row.find_all('td')
    fname = row_cells[0].text.strip()
    csv_url = 'https://exporter.nih.gov/' + row_cells[4].find('a')['href']
    
    # download file
    #!wget {csv_url}     # UNCOMMENT TO DOWNLOAD FILES (COMMENTED OUT AS A SAFEGUARD)

In [5]:
%%time

# filter downloaded csvs to relevant columns and save data

# only include .zip files
nih_project_csvs = [f for f in os.listdir() if f.endswith('.zip')]

for ind, f in enumerate(nih_project_csvs):

    # view the data
    df = pd.read_csv(f, encoding='latin-1')

    # filter to relevant columns
    df = df[['ADMINISTERING_IC', 'FY',  'IC_NAME', 'PROJECT_TITLE']]
    df['SOURCE'] = f
    
    # only save column headers for the first file
    if ind == 0:
        df.to_csv(NIH_EXPORTER_CSV, index=False, mode='w+')
    else:
        df.to_csv(NIH_EXPORTER_CSV, index=False, header=None, mode='a')

CPU times: user 243 µs, sys: 39 µs, total: 282 µs
Wall time: 200 µs


In [6]:
# copy data to google drive
# NOTE - you will need to create these folders (e.g. nih_data) in google drive for this to work

!cp exporter.csv gdrive/My\ Drive/Colab\ Notebooks/nih_data/exporter_train_data.csv

cp: cannot stat 'exporter.csv': No such file or directory


### Load Data

In [0]:
# Load trainng data 

!cp gdrive/My\ Drive/Colab\ Notebooks/nih_data/exporter_train_data.csv .

df = pd.read_csv(NIH_EXPORTER_CSV, encoding='latin-1')

In [0]:
# load testing data

!cp gdrive/My\ Drive/Colab\ Notebooks/nih_data/test_data_RePORTER_PRJ_C_FY2017.csv .

test_df = pd.read_csv(
    'test_data_RePORTER_PRJ_C_FY2017.csv', 
    encoding='latin-1',
    header=0,
    names=['PROJECT_TITLE','IC_NUM']
)

### Clean Data

In [0]:
# remove test data from training data
test_file = df['SOURCE'] == 'RePORTER_PRJ_C_FY2017.zip'
test_titles = df['PROJECT_TITLE'].isin(test_df['PROJECT_TITLE'])
df = df[~(test_titles & test_file)]

In [0]:
# limit to the most freqent institutes/centers (IC)
# use same mapping order as testing data (from previous lesson)

top_ic_names = {
 'NATIONAL INSTITUTE OF MENTAL HEALTH': 0,
 'NATIONAL CANCER INSTITUTE': 1,
 'NATIONAL INSTITUTE OF ENVIRONMENTAL HEALTH SCIENCES': 2,
 'NATIONAL INSTITUTE OF GENERAL MEDICAL SCIENCES': 3,
 'NATIONAL INSTITUTE OF NEUROLOGICAL DISORDERS AND STROKE': 4,
 'NATIONAL INSTITUTE ON AGING': 5,
 'NATIONAL HEART, LUNG, AND BLOOD INSTITUTE': 6,
 'NATIONAL INSTITUTE OF DIABETES AND DIGESTIVE AND KIDNEY DISEASES': 7,
 'EUNICE KENNEDY SHRIVER NATIONAL INSTITUTE OF CHILD HEALTH & HUMAN DEVELOPMENT': 8,
 'NATIONAL EYE INSTITUTE': 9,
 'NATIONAL INSTITUTE ON DRUG ABUSE': 10,
 'NATIONAL INSTITUTE OF ALLERGY AND INFECTIOUS DISEASES': 11,
 'NATIONAL INSTITUTE OF ARTHRITIS AND MUSCULOSKELETAL AND SKIN DISEASES': 12
}

In [10]:
# filter data to most frequent ICs
df = df[df['IC_NAME'].isin(top_ic_names)]
df = df[df['PROJECT_TITLE'].notnull()]

# set the labels as a new column
df['IC_NUM'] = df['IC_NAME'].map(top_ic_names)

# create a map of IC nums to names for later reference
ic_name_map = {num:name for num, name in df[['IC_NUM','IC_NAME']].drop_duplicates().values}

# view data
print(df.shape)
df.head()

(1686687, 6)


Unnamed: 0,ADMINISTERING_IC,FY,IC_NAME,PROJECT_TITLE,SOURCE,IC_NUM
0,DA,2018,NATIONAL INSTITUTE ON DRUG ABUSE,HIV and Other Infectious Consequences of Subst...,RePORTER_PRJ_C_FY2019_009.zip,10
1,MH,2019,NATIONAL INSTITUTE OF MENTAL HEALTH,Predictive Coding as a Framework for Understan...,RePORTER_PRJ_C_FY2019_009.zip,0
2,HL,2018,"NATIONAL HEART, LUNG, AND BLOOD INSTITUTE",The role of the gut microbiome-host metabolome...,RePORTER_PRJ_C_FY2019_009.zip,6
3,AI,2019,NATIONAL INSTITUTE OF ALLERGY AND INFECTIOUS D...,Liver resident memory for malaria,RePORTER_PRJ_C_FY2019_009.zip,11
4,AI,2019,NATIONAL INSTITUTE OF ALLERGY AND INFECTIOUS D...,Novel Biomolecular and Biophysical Mechanisms ...,RePORTER_PRJ_C_FY2019_009.zip,11


### Preprocess data and create Train/Test splits

In [0]:
def preprocess_text(text):
    """ use gensim simple_preprocess to tokenize text. Join results back 
    into a clean text string
    
    :param text: string, text to preprocess
    :return clean_text: string, cleaned text
    """
    
    clean_tokens = simple_preprocess(text)
    clean_text = ' '.join(clean_tokens)
    
    return clean_text

In [12]:
%%time

# separate the features and response
X_train = df['PROJECT_TITLE'].apply(lambda x: preprocess_text(x))
y_train = df['IC_NUM']
X_test = test_df['PROJECT_TITLE'].apply(lambda x: preprocess_text(x))
y_test = test_df['IC_NUM']

# get a count of the number of possible categories to predict
num_classes = len(set(y_train))

# convert the training and testing dataset
y_train_array = to_categorical(y_train, num_classes)
y_test_array = to_categorical(y_test, num_classes)

CPU times: user 17.3 s, sys: 108 ms, total: 17.4 s
Wall time: 17.4 s


## Baseline Model

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import  RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [0]:
%%time 

# NLP Pipeline
pipe = Pipeline([
      ('tfidf', TfidfVectorizer())
    , ('clf', LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42))
])

# Hyperparameters to test
param_dist = {
       #  tfidf hyperparams
         'tfidf__max_features': [15000, 20000, 25000, 30000]
       , 'tfidf__ngram_range': [(1,1),(1,2)]
       
       #   logistic regression hyperparams
       ,  'clf__penalty':['l1','l2']
       ,  'clf__C':np.linspace(.01, 5)
}

# run experiments to determine best pipeline
grid = RandomizedSearchCV(
      pipe
    , param_distributions=param_dist
    , n_iter=5
    , cv=3
    , refit='neg_log_loss'
    , scoring=['accuracy','neg_log_loss','precision_macro','recall_macro','f1_macro']
    , return_train_score=True
    , error_score=0
    , n_jobs=-1
    , verbose=2
)

grid.fit(X_train, y_train)  # save testing data for final evaluation

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 207.9min finished


CPU times: user 37min 27s, sys: 20min 47s, total: 58min 15s
Wall time: 3h 59min 54s


In [0]:
# evaluate results
pd.DataFrame(grid.cv_results_)[[
  'mean_test_accuracy',
  'mean_test_f1_macro',
  'mean_test_neg_log_loss',
  'mean_test_precision_macro',
  'mean_test_recall_macro',
  'param_tfidf__max_features',
  'param_clf__C',
  'mean_fit_time',
  'mean_score_time'
]].sort_values('mean_test_accuracy', ascending=False)



Unnamed: 0,mean_test_accuracy,mean_test_f1_macro,mean_test_neg_log_loss,mean_test_precision_macro,mean_test_recall_macro,param_tfidf__max_features,param_clf__C,mean_fit_time,mean_score_time
0,0.77323,0.757626,-0.742971,0.787218,0.737926,20000,3.57429,2091.406558,53.016214
3,0.772749,0.756865,-0.745863,0.786829,0.737022,20000,3.26878,2093.881857,56.851987
4,0.768783,0.750875,-0.768219,0.783904,0.72993,20000,1.84306,1720.262235,47.069106
1,0.757975,0.737643,-0.807757,0.774065,0.715571,15000,1.1302,1354.656613,56.13321
2,0.625693,0.553095,-1.479332,0.661726,0.515852,15000,0.01,213.455879,55.952687


# Full Embedding and Model Pipeline Class

In [0]:
class EmbeddingModel:
    
    def __init__(self, X_train, X_test, y_train, y_test, 
                 max_num_words=20000, max_seq_length=50, 
                 embedding_size=50, embedding_dir=None):
                
        # set tokenizer params
        self.max_num_words = max_num_words
        self.max_seq_length = max_seq_length
        self.vocab_size = None
        self.tokenizer = None

        # format data
        self.num_classes = len(set(y_train))        
        self.y_train_array = to_categorical(y_train, self.num_classes)
        self.y_test_array = to_categorical(y_test, self.num_classes)
        self.X_train = X_train
        self.X_test = X_test
        self.X_train_sequence = self.encode_text(X_train, train=True)
        self.X_test_sequence = self.encode_text(X_test, train=False)
                
        # set embedding params
        self.embedding_dir = embedding_dir
        self.embedding_size = embedding_size
        self.embeddings_index = None
        self.embedding_matrix = None
        
        # set model params
        self.model = None
    
    def setup_model_pipeline(self):
        self.create_embeddings_index()
        self.create_embedding_matrix()
        print('model pipeline set-up complete')

    def encode_text(self, text, train=False):
        if train:
            self.tokenizer = Tokenizer(num_words=self.max_num_words)
            self.tokenizer.fit_on_texts(text)

        encoded_docs = self.tokenizer.texts_to_sequences(text)
        padded_docs = pad_sequences(
            encoded_docs,
            maxlen=self.max_seq_length,
            padding='post'
        )

        print(f'completed tokenizing and padding text - train: {train}')
        self.vocab_size = len(self.tokenizer.word_index) + 1
        return padded_docs

    def create_embeddings_index(self):
        embeddings_index = {}

        with open(self.embedding_dir, 'rb') as f:
            for line in f:
                values = line.split()
                word = values[0].decode('utf-8')
                embedding = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = embedding

        print('completed creating embedding index')
        self.embeddings_index = embeddings_index

    def create_embedding_matrix(self):
        embedding_matrix = zeros((self.vocab_size, self.embedding_size))

        for word, i in self.tokenizer.word_index.items():    
            embedding_vector = self.embeddings_index.get(word)

            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        print('completed creating embedding matrix')
        self.embedding_matrix = embedding_matrix

    def get_embedding_layer(self):
        embedding = Embedding(
            input_dim=self.vocab_size, 
            output_dim=self.embedding_size,                                    
            input_length=self.max_seq_length,
            embeddings_initializer=Constant(self.embedding_matrix),
            trainable=False                                   
        )

        return embedding

    def fit(self, model=None, epochs=10):
        if model:
            print('using custom model')
        else:
            # default model if a custom model is not provided
            model = Sequential()
            model.add(self.get_embedding_layer())
            model.add(Flatten())
            model.add(Dense(self.num_classes, activation='softmax'))
            model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
        
        # train model
        model.fit(
            self.X_train_sequence, 
            self.y_train_array,
            epochs=epochs,
            shuffle=True,
            validation_data=(self.X_test_sequence, self.y_test_array)
        )

        print('completed training model')
        self.model = model
        
    def predict(self, X):
        encoded_text = self.encode_text(X, train=False)
        y_pred = self.model.predict_classes(encoded_text)
        
        return y_pred

## Download Glove Embeddings

In [0]:
GLOVE_ZIP = 'glove.840B.300d.zip'
GLOVE_DIR = 'glove.840B.300d.txt'
EMBEDDING_SIZE = 300

In [16]:
# uncomment below for 2GB GLoVe Embeddings
!wget http://nlp.stanford.edu/data/glove.840B.300d.zip

--2019-07-16 14:34:48--  http://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.840B.300d.zip [following]
--2019-07-16 14:34:48--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2019-07-16 14:34:49--  http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip

In [17]:
%%time

from zipfile import ZipFile 
    
# extract all embedding files from the zip 
with ZipFile(GLOVE_ZIP, 'r') as z:  
    z.extractall()     

CPU times: user 39.8 s, sys: 6.62 s, total: 46.4 s
Wall time: 49.5 s


In [18]:
!rm -rf glove.840B.300d.zip  # delete zip after extracting embeddings
!ls

exporter_train_data.csv  sample_data
gdrive			 test_data_RePORTER_PRJ_C_FY2017.csv
glove.840B.300d.txt


## Test Embedding Model with Glove Embeddings

In [19]:
# instantiate nlp model pipeline
embedding_model = EmbeddingModel(
    X_train=X_train, 
    X_test=X_test, 
    y_train=y_train, 
    y_test=y_test,
    max_num_words=10000,
    max_seq_length=60,
    embedding_dir=GLOVE_DIR,
    embedding_size=EMBEDDING_SIZE
)

# set-up and train model
embedding_model.setup_model_pipeline()

completed tokenizing and padding text - train: True
completed tokenizing and padding text - train: False
completed creating embedding index
completed creating embedding matrix
model pipeline set-up complete


In [0]:
embedding_model.fit(epochs=10)

W0701 18:48:42.567846 139943884109696 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0701 18:48:42.627224 139943884109696 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0701 18:48:43.094040 139943884109696 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0701 18:48:43.110794 139943884109696 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0701 18:48:43.130071 139943884109696 deprecation_wrappe

Train on 1686687 samples, validate on 15341 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
completed training model


# Use content specific Embeddings

##### bioasq

"We applied word2vec to a corpus of 10,876,004 English abstracts of biomedical articles from PubMed. The resulting vectors of 1,701,632 distinct words (types) are now publicly available from http://bioasq.lip6.fr/tools/BioASQword2vec/. File size: 1.3GB (compressed), 3.5GB (uncompressed)."

SOURCE: http://bioasq.org/news/bioasq-releases-continuous-space-word-vectors-obtained-applying-word2vec-pubmed-abstracts

In [20]:
%%time

!wget http://bioasq.lip6.fr/tools/BioASQword2vec  # download bio word embeddings
!mv BioASQword2vec BioASQword2vec.tar.gz          # update the downloaded file to the correct .tag.gz name
!tar -xvzf BioASQword2vec.tar.gz                  # unzip the file
!ls

--2019-07-16 14:40:41--  http://bioasq.lip6.fr/tools/BioASQword2vec
Resolving bioasq.lip6.fr (bioasq.lip6.fr)... 132.227.201.38
Connecting to bioasq.lip6.fr (bioasq.lip6.fr)|132.227.201.38|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://participants-area.bioasq.org/tools/BioASQword2vec [following]
--2019-07-16 14:40:41--  http://participants-area.bioasq.org/tools/BioASQword2vec
Resolving participants-area.bioasq.org (participants-area.bioasq.org)... 143.233.226.90
Connecting to participants-area.bioasq.org (participants-area.bioasq.org)|143.233.226.90|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://participants-area.bioasq.org/tools/BioASQword2vec/ [following]
--2019-07-16 14:40:42--  http://participants-area.bioasq.org/tools/BioASQword2vec/
Reusing existing connection to participants-area.bioasq.org:80.
HTTP request sent, awaiting response... 200 OK
Length: 1341997858 (1.2G) [application/

In [0]:
class BioasqEmbeddingModel(EmbeddingModel):

    # override the EmbeddingModel's create_embeddings_index to read in bioasq embeddings
    def create_embeddings_index(self):

        # read in a file with all the learned tokens
        with open(r'word2vecTools/types.txt', 'r') as f:
            tokens = [line.strip() for line in f]

        # read in a file with the associated embeddings for the tokens
        with open('word2vecTools/vectors.txt', 'rb') as f:
            embeddings = [np.asarray(embedding.split(), dtype='float32') for embedding in f]

        # create a dict of the word --> embedding mappings
        embeddings_index = {word:embedding for word, embedding in zip(tokens, embeddings)}

        print('completed creating pubmed embedding index')
        self.embeddings_index = embeddings_index

In [22]:
%%time

# set-up model pipeline
bioasq_model = BioasqEmbeddingModel(
    X_train=X_train, 
    X_test=X_test, 
    y_train=y_train,
    y_test=y_test,
    max_num_words=20000,
    max_seq_length=75,
    embedding_size=200
)
bioasq_model.setup_model_pipeline()

completed tokenizing and padding text - train: True
completed tokenizing and padding text - train: False
completed creating pubmed embedding index
completed creating embedding matrix
model pipeline set-up complete
CPU times: user 2min 4s, sys: 4.07 s, total: 2min 8s
Wall time: 2min 8s


In [0]:
%%time

# default model if a custom model is not provided
model = Sequential()
model.add(bioasq_model.get_embedding_layer())
model.add(Flatten())
model.add(Dense(bioasq_model.num_classes, activation='softmax'))
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

# train model
bioasq_model.fit(epochs=10, model=model)

using custom model
Train on 1686687 samples, validate on 15341 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
completed training model
CPU times: user 28min 2s, sys: 2min 39s, total: 30min 41s
Wall time: 21min 57s


##### Biomedical natural language processing (Pubmed, PMC, and Wikipedia combined embeddings)

NOTE: This embedding file is 4GB

"The openly available biomedical literature contains over 5 billion words in publication abstracts and full texts. Recent advances in unsupervised language processing methods have made it possible to make use of such large unannotated corpora for building statistical language models and inducing high quality vector space representations, which are, in turn, of utility in many tasks such as text classification, named entity recognition and query expansion. In this study, we introduce the first set of such language resources created from analysis of the entire available biomedical literature, including a dataset of all 1- to 5-grams and their probabilities in these texts and new models of word semantics. We discuss the opportunities created by these resources and demonstrate their application. All resources introduced in this study are available under open licenses at http://bio.nlplab.org."

SOURCE: http://bio.nlplab.org/#word-vector-tools

PUBLICATION: http://bio.nlplab.org/pdf/pyysalo13literature.pdf

In [20]:
# The following code will only work after the embedding has been downloaded and saved
# Copy embeddings from Google Drive to local Colab
!cp gdrive/My\ Drive/Colab\ Notebooks/nih_data/wikipedia-pubmed-and-PMC-w2v.bin .
!ls

exporter_train_data.csv  test_data_RePORTER_PRJ_C_FY2017.csv
gdrive			 wikipedia-pubmed-and-PMC-w2v.bin
sample_data


In [0]:
# download bio.nlplab embeddings
!wget http://evexdb.org/pmresources/vec-space-models/wikipedia-pubmed-and-PMC-w2v.bin

--2019-07-14 14:17:12--  http://evexdb.org/pmresources/vec-space-models/wikipedia-pubmed-and-PMC-w2v.bin
Resolving evexdb.org (evexdb.org)... 130.232.253.44
Connecting to evexdb.org (evexdb.org)|130.232.253.44|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4416560851 (4.1G) [application/octet-stream]
Saving to: ‘wikipedia-pubmed-and-PMC-w2v.bin’


2019-07-14 15:27:45 (1019 KB/s) - ‘wikipedia-pubmed-and-PMC-w2v.bin’ saved [4416560851/4416560851]



In [0]:
# OPTIONAL - move embeddings to personal Google Drive (to avoid large repeated download)
!cp wikipedia-pubmed-and-PMC-w2v.bin gdrive/My\ Drive/Colab\ Notebooks/nih_data

In [0]:
class PubmedEmbeddingModel(EmbeddingModel):

    # override the EmbeddingModel's create_embeddings_index to read in pubmed embeddings
    def create_embeddings_index(self):

        embedding_path = r'wikipedia-pubmed-and-PMC-w2v.bin'
        word_vectors = KeyedVectors.load_word2vec_format(embedding_path, binary=True)

        # create a dict of the word --> embedding mappings
        embeddings_index = {word: word_vectors.get_vector(word) for word in word_vectors.index2word}

        print('completed creating pubmed embedding index')
        self.embeddings_index = embeddings_index

In [22]:
%%time

# set-up model pipeline
pubmed_model = PubmedEmbeddingModel(
    X_train=X_train, 
    X_test=X_test, 
    y_train=y_train,
    y_test=y_test,
    max_num_words=25000,
    max_seq_length=75,
    embedding_size=200
)
pubmed_model.setup_model_pipeline()

completed tokenizing and padding text - train: True
completed tokenizing and padding text - train: False


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


completed creating pubmed embedding index
completed creating embedding matrix
model pipeline set-up complete
CPU times: user 5min 52s, sys: 8.57 s, total: 6min
Wall time: 5min 25s


In [0]:
%%time

pubmed_model.fit(epochs=10)

W0714 16:12:39.076960 140437015357312 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0714 16:12:39.130147 140437015357312 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0714 16:12:39.388870 140437015357312 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0714 16:12:39.412070 140437015357312 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0714 16:12:39.433096 140437015357312 deprecation_wrappe

Train on 1686687 samples, validate on 15341 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
completed training model
CPU times: user 29min 41s, sys: 2min 49s, total: 32min 31s
Wall time: 23min 14s


# 4. build a deep learning model (e.g. convolutional neural network), ending in a softmax output.

### Convolutional Neural Networ (CNN)

In [0]:
# define model

def build_cnn_model(model):
    # keras sequential creates models layer-by-layer 
    # doesn't create models that share layers or have multiple inputs/outputs
    cnn_model = Sequential()
    
    # load the pretrained embedding into the model
    cnn_model.add(model.get_embedding_layer())

    # create a 1D (Conv1D) convolutional layer for text (2D is for images)
    # filters: the number of features to extract from the text
    # kernel_size: the window size (how many words to look at per feature)
    cnn_model.add(Conv1D(filters=1024, kernel_size=6, activation='relu'))    
    
    # final pooling before dense layer
    cnn_model.add(GlobalMaxPooling1D())
    
    # dense layers for a feedforward neural network
    cnn_model.add(Dense(model.num_classes, activation='softmax'))
    
    # compile model to set the optimizer, loss, and metrics
    cnn_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return cnn_model

In [22]:
# set-up and train model
cnn_model = build_cnn_model(embedding_model)
embedding_model.fit(epochs=15, model=cnn_model)

using custom model
Train on 1686687 samples, validate on 15341 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
completed training model


In [25]:
cnn_model = build_cnn_model(bioasq_model)
bioasq_model.fit(epochs=20, model=cnn_model)

using custom model
Train on 1686687 samples, validate on 15341 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
completed training model


In [27]:
cnn_model = build_cnn_model(pubmed_model)
pubmed_model.fit(epochs=15, model=cnn_model)

using custom model
Train on 1686687 samples, validate on 15341 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
 268288/1686687 [===>..........................] - ETA: 4:10 - loss: 0.5110 - acc: 0.9341Buffered data was truncated after reaching the output size limit.

### LSTM

In [0]:
# define model
def build_lstm(model):
    lstm_model = Sequential()
    lstm_model.add(model.get_embedding_layer())
    lstm_model.add(LSTM(32))
    lstm_model.add(Dense(model.num_classes, activation='softmax'))
    lstm_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return lstm_model

In [0]:
# set-up and train model
lstm_model = build_lstm(embedding_model)
embedding_model.fit(epochs=5, model=lstm_model)

using custom model
Train on 1686687 samples, validate on 15341 samples
Epoch 1/5
Epoch 2/5
 150432/1686687 [=>............................] - ETA: 1:11:50 - loss: 0.9507 - acc: 0.7146Buffered data was truncated after reaching the output size limit.

In [0]:
# set-up and train model
lstm_model = build_lstm(bioasq_model)
bioasq_model.fit(epochs=6, model=lstm_model)

using custom model
Train on 1686687 samples, validate on 15341 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
completed training model


In [0]:
lstm_model = build_lstm(pubmed_model)
pubmed_model.fit(epochs=10, model=lstm_model)

W0714 20:27:45.688020 140011109181312 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0714 20:27:45.776172 140011109181312 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0714 20:27:46.110307 140011109181312 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0714 20:27:46.404247 140011109181312 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0714 20:27:46.430250 140011109181312 deprecation_wrappe

using custom model


W0714 20:27:47.276360 140011109181312 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Train on 1686687 samples, validate on 15341 samples
Epoch 1/10
Epoch 2/10
 153632/1686687 [=>............................] - ETA: 46:39 - loss: 0.9787 - acc: 0.7063Buffered data was truncated after reaching the output size limit.

In [0]:
lstm_model = build_lstm(pubmed_model)
pubmed_model.fit(epochs=25, model=lstm_model)

using custom model
Train on 46022 samples, validate on 15341 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
completed training model


# FLAIR 

#### NOTE: Due to Colab 12GB memory constraint, you should probably reset your environment before running the below code or it will likely crash

In [0]:
# install FLAIR

!pip3 install https://download.pytorch.org/whl/cpu/torch-1.1.0-cp36-cp36m-linux_x86_64.whl
!pip3 install https://download.pytorch.org/whl/cpu/torchvision-0.3.0-cp36-cp36m-linux_x86_64.whl
!pip install flair
!pip install allennlp

Collecting flair
[?25l  Downloading https://files.pythonhosted.org/packages/4e/3a/2e777f65a71c1eaa259df44c44e39d7071ba8c7780a1564316a38bf86449/flair-0.4.2-py3-none-any.whl (136kB)
[K     |████████████████████████████████| 143kB 3.3MB/s 
[?25hCollecting sqlitedict>=1.6.0 (from flair)
  Downloading https://files.pythonhosted.org/packages/0f/1c/c757b93147a219cf1e25cef7e1ad9b595b7f802159493c45ce116521caff/sqlitedict-1.6.0.tar.gz
Collecting mpld3==0.3 (from flair)
[?25l  Downloading https://files.pythonhosted.org/packages/91/95/a52d3a83d0a29ba0d6898f6727e9858fe7a43f6c2ce81a5fe7e05f0f4912/mpld3-0.3.tar.gz (788kB)
[K     |████████████████████████████████| 798kB 45.8MB/s 
Collecting deprecated>=1.2.4 (from flair)
  Downloading https://files.pythonhosted.org/packages/88/0e/9d5a1a8cd7130c49334cce7b8167ceda63d6a329c8ea65b626116bc9e9e6/Deprecated-1.2.6-py2.py3-none-any.whl
Collecting regex (from flair)
[?25l  Downloading https://files.pythonhosted.org/packages/6f/4e/1b178c38c9a1a184288f72065

In [0]:
import flair
from flair.data import Corpus
from flair.datasets import ClassificationCorpus
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.embeddings import WordEmbeddings, FlairEmbeddings, ELMoEmbeddings, TransformerXLEmbeddings, DocumentRNNEmbeddings
import pandas as pd

# directory to download pretrained models
flair.cache_root = '/tmp/embeddings'

# this is the directory in which the train, test and dev files reside
!mkdir flair_data
data_folder = 'flair_data'
train = 'flair_data/train.txt'
test = 'flair_data/test.txt'
dev = 'flair_data/dev.txt'

In [0]:
%%time

# separate the features and response
X_train = df['PROJECT_TITLE'].apply(lambda x: preprocess_text(x))
y_train = df['IC_NUM']
X_test = test_df['PROJECT_TITLE'].apply(lambda x: preprocess_text(x))
y_test = test_df['IC_NUM']

# get a count of the number of possible categories to predict
num_classes = len(set(y_train))

# convert the training and testing dataset
y_train_array = to_categorical(y_train, num_classes)
y_test_array = to_categorical(y_test, num_classes)

CPU times: user 16.9 s, sys: 206 ms, total: 17.1 s
Wall time: 17.1 s


In [0]:
def format_data_for_flair(X, y, filename):
  """ create a .txt file with data formatted for flair (i.e. fasttext format) """
  
  # combine X and y into a single dataframe
  flair_df = pd.DataFrame(list(zip(X, y)), columns=['X','y'])
  
  # format data for flair (e.g. __label__<1> text)
  formatted_data = [f"__label__<{str(y)}> " + x for x, y in flair_df[['X','y']].values]
  
  # write to local txt file
  with open(filename, 'w+') as f:
      # write data
      for line in formatted_data[:-1]:
          # skip empty lines
          if len(line.strip().split(' ')) < 2:
             continue
          f.write(line + '\n')
      # do not add a '\n' for the last line
      f.write(formatted_data[-1])
          
format_data_for_flair(X_train, y_train, train)
format_data_for_flair(X_test, y_test, test)
format_data_for_flair(X_test, y_test, dev)

In [0]:
!head -n5 flair_data/train.txt

__label__<10> hiv and other infectious consequences of substance abuse
__label__<0> predictive coding as framework for understanding psychosis
__label__<6> the role of the gut microbiome host in heart failure related insulin resistance
__label__<11> liver resident memory for malaria
__label__<11> novel biomolecular and biophysical mechanisms of influenza virus infection


In [0]:
# copy data to google drive
# NOTE - you will need to create these folders (e.g. nih_data) in google drive for this to work

!cp flair_train.txt gdrive/My\ Drive/Colab\ Notebooks/nih_data/train.txt
!cp flair_test.txt gdrive/My\ Drive/Colab\ Notebooks/nih_data/test.txt

## Download Data formatted for FLAIR

# FLAIR NLP Modeling Pipeline

1. load corpus containing training, test and dev data
2. create the label dictionary
3. make a list of word embeddings
4. initialize document embedding by passing list of word embeddings
5. create the text classifier
6. initialize the text classifier trainer
7. start the training
8. plot training curves (optional)

In [0]:
# load corpus containing training, test and dev data
corpus = ClassificationCorpus(data_folder)
corpus

2019-07-10 00:38:20,322 Reading data from flair_data
2019-07-10 00:38:20,323 Train: flair_data/train.txt
2019-07-10 00:38:20,324 Dev: flair_data/dev.txt
2019-07-10 00:38:20,325 Test: flair_data/test.txt


<flair.datasets.ClassificationCorpus at 0x7f812a0e5d68>

In [0]:
# create the label dictionary
label_dict = corpus.make_label_dictionary()

2019-07-10 00:43:11,255 {'<1>', '<5>', '<8>', '<10>', '<11>', '<4>', '<7>', '<0>', '<2>', '<3>', '<6>', '<12>', '<9>'}


In [0]:
def flair_model(word_embeddings, hidden_size=512, learning_rate=0.1, max_epochs=5):
    """ train a classifier in FLAIR
    :param word_embeddings: selected word embeddings to use in model
    :param hidden_size: size of hidden layer
    :param learning rate: model learning rate
    :param max_epochs: epochs to train model
    """
  
    # model hyperparams
    print(f'MODEL METADATA:\n    hidden_size={hidden_size} | learning_rate={learning_rate} | max_epochs={max_epochs}\n\n')
    
    # initialize document embedding by passing list of word embeddings
    # Can choose between many RNN types 
    # (GRU by default, to change use rnn_type parameter)
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=hidden_size,
        reproject_words=True,
        reproject_words_dimension=256
    )

    # create the text classifier
    classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

    # initialize the text classifier trainer
    trainer = ModelTrainer(classifier, corpus)

    # start the training
    trainer.train(
        'flair',
        learning_rate=learning_rate,
        mini_batch_size=32,
        max_epochs=max_epochs
    )

    # plot training curves (optional)
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_training_curves('flair/loss.tsv')
    plotter.plot_weights('flair/weights.txt')

In [0]:
# make a list of word embeddings
word_embeddings = [WordEmbeddings('glove')]
flair_model(word_embeddings)

2019-07-10 00:47:44,743 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmposs83xtv


100%|██████████| 160000128/160000128 [00:08<00:00, 19432496.42B/s]

2019-07-10 00:47:53,468 copying /tmp/tmposs83xtv to cache at /tmp/embeddings/embeddings/glove.gensim.vectors.npy





2019-07-10 00:47:53,769 removing temp file /tmp/tmposs83xtv
2019-07-10 00:47:54,259 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim not found in cache, downloading to /tmp/tmpgcux1wgf


100%|██████████| 21494764/21494764 [00:01<00:00, 12446714.67B/s]

2019-07-10 00:47:56,459 copying /tmp/tmpgcux1wgf to cache at /tmp/embeddings/embeddings/glove.gensim
2019-07-10 00:47:56,487 removing temp file /tmp/tmpgcux1wgf



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


MODEL METADATA:
    hidden_size=512 | learning_rate=0.1 | max_epochs=5


2019-07-10 00:48:05,507 ----------------------------------------------------------------------------------------------------
2019-07-10 00:48:05,508 Evaluation method: MICRO_F1_SCORE
2019-07-10 00:48:05,922 ----------------------------------------------------------------------------------------------------
2019-07-10 00:48:06,652 epoch 1 - iter 0/52697 - loss 2.70994735
2019-07-10 00:50:05,564 epoch 1 - iter 5269/52697 - loss 1.54137151
2019-07-10 00:52:04,626 epoch 1 - iter 10538/52697 - loss 1.44024168
2019-07-10 00:54:02,436 epoch 1 - iter 15807/52697 - loss 1.39074753
2019-07-10 00:55:59,797 epoch 1 - iter 21076/52697 - loss 1.35826660
2019-07-10 00:58:00,169 epoch 1 - iter 26345/52697 - loss 1.33330112
2019-07-10 00:59:57,729 epoch 1 - iter 31614/52697 - loss 1.31519084
2019-07-10 01:01:54,970 epoch 1 - iter 36883/52697 - loss 1.30087566
2019-07-10 01:03:52,057 epoch 1 - iter 42152/52697 - loss 1.28735382
201

In [0]:
word_embeddings = [ELMoEmbeddings('pubmed')]
flair_model(word_embeddings)

100%|██████████| 336/336 [00:00<00:00, 84825.22B/s]
100%|██████████| 374434792/374434792 [00:09<00:00, 39818727.70B/s]


MODEL METADATA:
    hidden_size=512 | learning_rate=0.1 | max_epochs=5


2019-07-10 02:29:21,139 ----------------------------------------------------------------------------------------------------
2019-07-10 02:29:21,140 Evaluation method: MICRO_F1_SCORE
2019-07-10 02:29:21,596 ----------------------------------------------------------------------------------------------------
2019-07-10 02:29:22,425 epoch 1 - iter 0/52697 - loss 2.53972983
2019-07-10 02:37:46,702 epoch 1 - iter 5269/52697 - loss 1.16806028
2019-07-10 02:46:08,859 epoch 1 - iter 10538/52697 - loss 1.11165910
2019-07-10 02:54:30,411 epoch 1 - iter 15807/52697 - loss 1.09069902
2019-07-10 03:02:53,373 epoch 1 - iter 21076/52697 - loss 1.07789306
2019-07-10 03:11:14,741 epoch 1 - iter 26345/52697 - loss 1.06892765
2019-07-10 03:19:37,729 epoch 1 - iter 31614/52697 - loss 1.06106354
2019-07-10 03:27:59,663 epoch 1 - iter 36883/52697 - loss 1.05375273
2019-07-10 03:36:22,962 epoch 1 - iter 42152/52697 - loss 1.04840894
201

In [0]:
word_embeddings = [ELMoEmbeddings('original')]
flair_model(word_embeddings)

100%|██████████| 336/336 [00:00<00:00, 82087.96B/s]
100%|██████████| 374434792/374434792 [00:09<00:00, 40326719.09B/s]


MODEL METADATA:
    hidden_size=512 | learning_rate=0.1 | max_epochs=5


2019-07-10 09:41:40,652 ----------------------------------------------------------------------------------------------------
2019-07-10 09:41:40,653 Evaluation method: MICRO_F1_SCORE
2019-07-10 09:41:41,070 ----------------------------------------------------------------------------------------------------
2019-07-10 09:41:41,975 epoch 1 - iter 0/52697 - loss 2.76070738
2019-07-10 09:50:12,604 epoch 1 - iter 5269/52697 - loss 1.46818150
2019-07-10 09:58:37,828 epoch 1 - iter 10538/52697 - loss 1.37977914
2019-07-10 10:07:02,311 epoch 1 - iter 15807/52697 - loss 1.34140393
2019-07-10 10:15:29,523 epoch 1 - iter 21076/52697 - loss 1.31820293
2019-07-10 10:23:53,804 epoch 1 - iter 26345/52697 - loss 1.30037777
2019-07-10 10:32:19,894 epoch 1 - iter 31614/52697 - loss 1.28788041
