<a href="https://colab.research.google.com/github/Alexjmsherman/nlp_practicum_cohort3_instructor/blob/master/lessons/lesson_6_deep_learning_for_nlp/Deep%20Learning%20for%20NLP.%202ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Deep Learning for NLP

##### Author: Alex Sherman | alsherman@deloitte.com


Agenda:


In [0]:
import os
import numpy as np
import pandas as pd
from numpy import array
from numpy import asarray
from numpy import zeros
from IPython.core.display import display, HTML
from IPython.display import Image
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
import seaborn as sns
from gensim.utils import simple_preprocess
from gensim.models import KeyedVectors
from zipfile import ZipFile 

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, \
    Embedding, Input, GlobalAveragePooling1D, GlobalMaxPooling1D, SpatialDropout1D, GlobalMaxPool1D
from keras.layers.embeddings import Embedding
from keras.models import Model, Sequential
from keras.initializers import Constant
from keras.utils import to_categorical

Using TensorFlow backend.


# Problem Definition

Predict the National Institutes of Health (NIH) Institute or Center (IC) from the Project Title of previously funded projects

In [1]:
# download NIH Project Data
!wget https://exporter.nih.gov/CSVs/final/RePORTER_PRJ_C_FY2017.zip

--2019-06-16 00:54:42--  https://exporter.nih.gov/CSVs/final/RePORTER_PRJ_C_FY2017.zip
Resolving exporter.nih.gov (exporter.nih.gov)... 165.112.228.197, 2607:f220:404:1101:165:112:228:197
Connecting to exporter.nih.gov (exporter.nih.gov)|165.112.228.197|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 58433518 (56M) [application/x-zip-compressed]
Saving to: ‘RePORTER_PRJ_C_FY2017.zip’


2019-06-16 00:56:03 (710 KB/s) - ‘RePORTER_PRJ_C_FY2017.zip’ saved [58433518/58433518]



In [0]:
REPORTER_DATA_PATH = r'RePORTER_PRJ_C_FY2017.zip'

# an IC is kept as a prediction target only if it has more than this many projects
MIN_PROJECTS_PER_IC = 1500

# load the NIH RePORTER export
df = pd.read_csv(
    REPORTER_DATA_PATH,
    encoding='latin-1'  # common encoding to handle messy data
)

# filter to relevant columns
df = df[['ADMINISTERING_IC', 'FY',  'IC_NAME', 'PROJECT_TITLE']]

# number of projects per IC (Series indexed by IC name)
ic_counts = df['IC_NAME'].value_counts()

# convert IC counts to a dataframe (kept for inspection)
top_ic = ic_counts.reset_index()

# filter to top ICs
# NOTE: the filter is applied to the Series rather than to `top_ic` because the column
# names produced by `value_counts().reset_index()` differ between pandas versions
top_ic_names = ic_counts[ic_counts > MIN_PROJECTS_PER_IC].index

# keep only projects from the top ICs; copy so the column added below is written
# to an independent frame instead of a slice of the original
df = df[df['IC_NAME'].isin(top_ic_names)].copy()

# set the labels as a new column
df['IC_NUM'] = df['ADMINISTERING_IC'].factorize()[0]

# create a map of IC nums to names for later reference
ic_name_map = {num:name for num, name in df[['IC_NUM','IC_NAME']].drop_duplicates().values}

# Preprocess data and create Train/Test splits

In [0]:
def preprocess_text(text):
    """ tokenize text with gensim's simple_preprocess and glue the tokens back together

    :param text: string, text to preprocess
    :return: string, the tokens returned by simple_preprocess joined by single spaces
    """

    return ' '.join(simple_preprocess(text))

In [0]:
# response: numeric IC label; features: cleaned project titles
y = df['IC_NUM']
X = df['PROJECT_TITLE'].apply(preprocess_text)

# shuffled split, stratified on the label, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    shuffle=True,
    random_state=42,
)

# number of distinct categories present in the training labels
num_classes = len(set(y_train))

# one-hot encode the training and testing labels
y_train_array, y_test_array = (
    to_categorical(labels, num_classes) for labels in (y_train, y_test)
)

# Full Embedding and Model Pipeline Class

In [0]:
class EmbeddingModel:
    """ pipeline that tokenizes text, loads pre-trained word embeddings and trains a keras classifier

    Usage: instantiate (fits the tokenizer and pads the text), call setup_model_pipeline()
    to build the embedding matrix, then fit() and predict(). Subclasses may override
    create_embeddings_index() to read embeddings stored in another format.
    """

    def __init__(self, X_train, X_test, y_train, y_test,
                 max_num_words=20000, max_seq_length=50, embedding_size=50, embedding_dir=None):
        """ store the configuration, one-hot encode the labels and tokenize/pad the text

        :param X_train: iterable of strings, training text used to fit the tokenizer
        :param X_test: iterable of strings, held-out text encoded with the fitted tokenizer
        :param y_train: class labels for X_train (presumably integers 0..n-1 - verify against caller)
        :param y_test: class labels for X_test
        :param max_num_words: int, `num_words` passed to the keras Tokenizer
        :param max_seq_length: int, `maxlen` passed to pad_sequences
        :param embedding_size: int, dimension of each embedding vector
        :param embedding_dir: string, path of the embeddings text file read by create_embeddings_index
        """

        # set tokenizer params
        self.max_num_words = max_num_words
        self.max_seq_length = max_seq_length
        self.vocab_size = None
        self.tokenizer = None

        # format data
        self.num_classes = len(set(y_train))
        self.y_train_array = to_categorical(y_train, self.num_classes)
        self.y_test_array = to_categorical(y_test, self.num_classes)
        self.X_train = X_train
        self.X_test = X_test
        # the training text must be encoded first: it is what fits the tokenizer
        self.X_train_sequence = self.encode_text(X_train, train=True)
        self.X_test_sequence = self.encode_text(X_test, train=False)

        # set embedding params
        self.embedding_dir = embedding_dir
        self.embedding_size = embedding_size
        self.embeddings_index = None
        self.embedding_matrix = None

        # set model params
        self.model = None

    def setup_model_pipeline(self):
        """ load the embeddings index and build the embedding matrix used by the embedding layer """
        self.create_embeddings_index()
        self.create_embedding_matrix()
        print('model pipeline set-up complete')

    def encode_text(self, text, train=False):
        """ convert text into padded sequences of token ids

        :param text: iterable of strings, documents to encode
        :param train: bool, when True a new tokenizer is created and fit on `text` first
        :return: the padded sequences returned by pad_sequences
        """
        if train:
            self.tokenizer = Tokenizer(num_words=self.max_num_words)
            self.tokenizer.fit_on_texts(text)

        encoded_docs = self.tokenizer.texts_to_sequences(text)
        padded_docs = pad_sequences(
            encoded_docs,
            maxlen=self.max_seq_length,
            padding='post'
        )

        print(f'completed tokenizing and padding text - train: {train}')
        # +1 leaves row 0 free: the loop in create_embedding_matrix never writes to it
        self.vocab_size = len(self.tokenizer.word_index) + 1
        return padded_docs

    def create_embeddings_index(self):
        """ read the embeddings file into self.embeddings_index as {word: float32 vector}

        Each non-blank line is split on whitespace: the first field is the word,
        the remaining fields are the vector components.

        :raises ValueError: if no embedding_dir was provided
        """
        if self.embedding_dir is None:
            raise ValueError('embedding_dir must be set to the path of an embeddings text file')

        embeddings_index = {}

        # the file is read as bytes and only the word is decoded
        with open(self.embedding_dir, 'rb') as f:
            for line in f:
                values = line.split()
                if not values:
                    # tolerate blank lines (e.g. a trailing newline at the end of the file)
                    continue
                word = values[0].decode('utf-8')
                embeddings_index[word] = np.asarray(values[1:], dtype='float32')

        print('completed creating embedding index')
        self.embeddings_index = embeddings_index

    def create_embedding_matrix(self):
        """ build self.embedding_matrix with one row per tokenizer word id

        Words without a pre-trained vector keep an all-zero row.
        """
        embedding_matrix = zeros((self.vocab_size, self.embedding_size))

        for word, i in self.tokenizer.word_index.items():
            embedding_vector = self.embeddings_index.get(word)

            # place the vector in the row matching the tokenizer's id for the word
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        print('completed creating embedding matrix')
        self.embedding_matrix = embedding_matrix

    def get_embedding_layer(self):
        """ create a frozen keras Embedding layer initialized with the embedding matrix

        :return: keras Embedding layer (trainable=False)
        :raises RuntimeError: if the embedding matrix has not been built yet
        """
        if self.embedding_matrix is None:
            raise RuntimeError('call setup_model_pipeline() before requesting the embedding layer')

        embedding = Embedding(
            input_dim=self.vocab_size,
            output_dim=self.embedding_size,
            input_length=self.max_seq_length,
            embeddings_initializer=Constant(self.embedding_matrix),
            trainable=False
        )

        return embedding

    def _build_default_model(self):
        """ baseline classifier: embedding layer -> flatten -> softmax over the classes """
        model = Sequential()
        model.add(self.get_embedding_layer())
        model.add(Flatten())
        model.add(Dense(self.num_classes, activation='softmax'))
        model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    def fit(self, model=None, epochs=10):
        """ train a model on the training sequences, validating on the test sequences

        :param model: compiled keras model, optional; the default baseline is used when omitted
        :param epochs: int, number of training epochs
        """
        # compare against None explicitly instead of relying on the truthiness of a model object
        if model is not None:
            print('using custom model')
        else:
            # default model if a custom model is not provided
            model = self._build_default_model()

        # train model
        model.fit(
            self.X_train_sequence,
            self.y_train_array,
            epochs=epochs,
            validation_data=(self.X_test_sequence, self.y_test_array)
        )

        print('completed training model')
        self.model = model

    def predict(self, X):
        """ predict the class id of each document in X

        :param X: iterable of strings, documents to classify
        :return: numpy array holding the index of the highest-scoring class per document
        :raises RuntimeError: if no model has been trained yet
        """
        if self.model is None:
            raise RuntimeError('call fit() before predict()')

        encoded_text = self.encode_text(X, train=False)

        # argmax over model.predict works for any model exposing predict(), whereas
        # `predict_classes` only exists on older Sequential models
        class_scores = self.model.predict(encoded_text)
        y_pred = np.argmax(class_scores, axis=-1)

        return y_pred

## Download Glove Embeddings

In [28]:
# download the 2GB GloVe embeddings (comment out the line below to skip the download)
!wget http://nlp.stanford.edu/data/glove.840B.300d.zip

--2019-06-16 01:12:17--  http://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.840B.300d.zip [following]
--2019-06-16 01:12:17--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2019-06-16 01:12:18--  http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip

In [29]:
!ls

glove.840B.300d.zip  RePORTER_PRJ_C_FY2017.zip	sample_data


#### Prepare Glove Embedding Matrix

In [30]:
# Glove Word Embeddings
GLOVE_DIR = 'glove.840B.300d.txt'
EMBEDDING_SIZE = 300
GLOVE_FILE_NAME = "glove.840B.300d.zip"

# extract the archive only when the text file is not on disk yet, so this cell can be
# re-run after the zip has been deleted (a later cell removes it)
if not os.path.exists(GLOVE_DIR):
    # opening the zip file in READ mode
    with ZipFile(GLOVE_FILE_NAME, 'r') as z:

        # print all the contents of the zip file
        z.printdir()

        # extracting all the files
        z.extractall()

# Store all embeddings {'token': n-dimensional embedding_series}
embeddings_index = {}

with open(GLOVE_DIR, 'rb') as f:
    for line in f:
        values = line.split()
        if not values:
            # skip blank lines
            continue
        word = values[0].decode('utf-8')
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print(f'Found {len(embeddings_index)} word vectors.')

# fit a tokenizer on the training text here so the cell does not depend on `tokenizer`
# and `vocab_size` left over from an earlier kernel session
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocab_size = len(tokenizer.word_index) + 1  # +1: row 0 is not assigned to any word

# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, EMBEDDING_SIZE))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)

    # add each word in the embedding_matrix in the slot for the tokenizer's word id
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

File Name                                             Modified             Size
glove.840B.300d.txt                            2015-10-24 10:35:30   5646236541
Found 2196016 word vectors.


In [38]:
!rm -rf glove.840B.300d.zip
!ls

BioASQword2vec	glove.840B.300d.txt  RePORTER_PRJ_C_FY2017.zip	sample_data


## Test Embedding Model with Glove Embeddings

In [54]:
# build the GloVe-backed pipeline; the constructor tokenizes and pads the train/test text
embedding_model = EmbeddingModel(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    embedding_dir=GLOVE_DIR,
    embedding_size=EMBEDDING_SIZE,
    max_num_words=10000,
    max_seq_length=75,
)

# load the embeddings index and build the embedding matrix (no training happens here)
embedding_model.setup_model_pipeline()

completed tokenizing and padding text - train: True
completed tokenizing and padding text - train: False
completed creating embedding index
completed creating embedding matrix
model pipeline set-up complete


In [55]:
embedding_model.fit(epochs=10)

Train on 46022 samples, validate on 15341 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
completed training model


# Use content specific Embeddings

##### bioasq

"We applied word2vec to a corpus of 10,876,004 English abstracts of biomedical articles from PubMed. The resulting vectors of 1,701,632 distinct words (types) are now publicly available from http://bioasq.lip6.fr/tools/BioASQword2vec/. File size: 1.3GB (compressed), 3.5GB (uncompressed)."

SOURCE: http://bioasq.org/news/bioasq-releases-continuous-space-word-vectors-obtained-applying-word2vec-pubmed-abstracts

In [50]:
%%time

!wget http://bioasq.lip6.fr/tools/BioASQword2vec  # download bio word embeddings
!mv BioASQword2vec BioASQword2vec.tar.gz          # rename the downloaded file to the correct .tar.gz name
!tar -xvzf BioASQword2vec.tar.gz                  # unzip the file
!ls

word2vecTools/toolkit.py
word2vecTools/vectors.txt
word2vecTools/
word2vecTools/README_BioASQ_word_vectors.pdf
word2vecTools/types.txt
word2vecTools/train_vectors.sh
BioASQword2vec.tar.gz  RePORTER_PRJ_C_FY2017.zip  word2vecTools
glove.840B.300d.txt    sample_data
CPU times: user 296 ms, sys: 365 ms, total: 662 ms
Wall time: 49.4 s


In [0]:
class BioasqEmbeddingModel(EmbeddingModel):
    """ EmbeddingModel that reads the BioASQ word2vec release (tokens and vectors in two files) """

    # tokens and vectors are stored separately; line N of one file pairs with line N of the other
    types_path = r'word2vecTools/types.txt'
    vectors_path = 'word2vecTools/vectors.txt'

    # override the EmbeddingModel's create_embeddings_index to read in bioasq embeddings
    def create_embeddings_index(self):
        """ build self.embeddings_index as {token: float32 vector} from the two bioasq files """

        embeddings_index = {}

        # walk both files in lockstep so the full token and vector lists are never
        # held in memory next to the final dict; the encoding is explicit so decoding
        # the tokens does not depend on the platform default
        with open(self.types_path, 'r', encoding='utf-8') as tokens_file, \
                open(self.vectors_path, 'rb') as vectors_file:
            for token, vector in zip(tokens_file, vectors_file):
                embeddings_index[token.strip()] = np.asarray(vector.split(), dtype='float32')

        print('completed creating pubmed embedding index')
        self.embeddings_index = embeddings_index

In [52]:
%%time

# build the BioASQ-backed pipeline (200-dimensional vectors), then load its embeddings
bioasq_model = BioasqEmbeddingModel(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    embedding_size=200,
    max_num_words=20000,
    max_seq_length=75,
)
bioasq_model.setup_model_pipeline()

completed tokenizing and padding text - train: True
completed tokenizing and padding text - train: False
completed creating pubmed embedding index
completed creating embedding matrix
model pipeline set-up complete
CPU times: user 59.9 s, sys: 3.33 s, total: 1min 3s
Wall time: 1min 3s


In [53]:
%%time

# explicit baseline passed in as a custom model: frozen embedding layer -> flatten -> softmax
model = Sequential([
    bioasq_model.get_embedding_layer(),
    Flatten(),
    Dense(bioasq_model.num_classes, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

# train the baseline through the pipeline
bioasq_model.fit(model=model, epochs=10)

using custom model
Train on 46022 samples, validate on 15341 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
completed training model
CPU times: user 59.2 s, sys: 5.79 s, total: 1min 4s
Wall time: 45.8 s


##### Biomedical natural language processing (Pubmed, PMC, and Wikipedia combined embeddings)

NOTE: This embedding file is 4GB

"The openly available biomedical literature contains over 5 billion words in publication abstracts and full texts. Recent advances in unsupervised language processing methods have made it possible to make use of such large unannotated corpora for building statistical language models and inducing high quality vector space representations, which are, in turn, of utility in many tasks such as text classification, named entity recognition and query expansion. In this study, we introduce the first set of such language resources created from analysis of the entire available biomedical literature, including a dataset of all 1- to 5-grams and their probabilities in these texts and new models of word semantics. We discuss the opportunities created by these resources and demonstrate their application. All resources introduced in this study are available under open licenses at http://bio.nlplab.org."

SOURCE: http://bio.nlplab.org/#word-vector-tools
PUBLICATION: http://bio.nlplab.org/pdf/pyysalo13literature.pdf

In [56]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
!wget http://evexdb.org/pmresources/vec-space-models/wikipedia-pubmed-and-PMC-w2v.bin  # download embeddings
!cp wikipedia-pubmed-and-PMC-w2v.bin gdrive/My\ Drive                                  # copy embeddings to personal Google Drive (to avoid repeating the large download)

--2019-06-14 15:05:14--  http://evexdb.org/pmresources/vec-space-models/wikipedia-pubmed-and-PMC-w2v.bin
Resolving evexdb.org (evexdb.org)... 130.232.253.44
Connecting to evexdb.org (evexdb.org)|130.232.253.44|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4416560851 (4.1G) [application/octet-stream]
Saving to: ‘wikipedia-pubmed-and-PMC-w2v.bin’


2019-06-14 16:15:50 (1018 KB/s) - ‘wikipedia-pubmed-and-PMC-w2v.bin’ saved [4416560851/4416560851]



In [0]:
class PubmedEmbeddingModel(EmbeddingModel):
    """ EmbeddingModel that reads the binary word2vec file of Wikipedia/PubMed/PMC embeddings """

    # location of the binary word2vec file
    embedding_path = r'wikipedia-pubmed-and-PMC-w2v.bin'

    # override the EmbeddingModel's create_embeddings_index to read in pubmed embeddings
    def create_embeddings_index(self):
        """ build self.embeddings_index as {word: vector} from the binary word2vec file """

        word_vectors = KeyedVectors.load_word2vec_format(self.embedding_path, binary=True)

        # gensim 4 renamed the vocabulary list from `index2word` to `index_to_key`;
        # use whichever attribute the installed version provides
        vocabulary = getattr(word_vectors, 'index_to_key', None)
        if vocabulary is None:
            vocabulary = word_vectors.index2word

        # create a dict of the word --> embedding mappings
        embeddings_index = {word: word_vectors.get_vector(word) for word in vocabulary}

        print('completed creating pubmed embedding index')
        self.embeddings_index = embeddings_index

In [0]:
%%time

# build the PubMed/PMC/Wikipedia-backed pipeline (200-dimensional vectors), then load its embeddings
pubmed_model = PubmedEmbeddingModel(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    embedding_size=200,
    max_num_words=25000,
    max_seq_length=75,
)
pubmed_model.setup_model_pipeline()

completed tokenizing and padding text - train: True
completed tokenizing and padding text - train: False


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


completed creating pubmed embedding index
completed creating embedding matrix
model pipeline set-up complete
CPU times: user 1min 12s, sys: 6.74 s, total: 1min 19s
Wall time: 1min 19s


In [0]:
%%time

pubmed_model.fit(epochs=15)

Train on 46022 samples, validate on 15341 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
completed training model
CPU times: user 1min 22s, sys: 7.99 s, total: 1min 30s
Wall time: 1min 4s


# 4. build a deep learning model (e.g. convolutional neural network), ending in a softmax output.

### Convolutional Neural Network (CNN)

In [0]:
# define model

def build_cnn_model(model):
    """ assemble and compile a stacked Conv1D classifier on top of the pipeline's embedding layer

    :param model: pipeline object providing get_embedding_layer() and num_classes
    :return: compiled keras Sequential model ending in a softmax over the classes
    """
    # (filters, pool size, dropout rate) of each convolutional stage, in order
    conv_stages = (
        (512, 4, .5),
        (256, 1, .4),
        (128, 1, .3),
        (64, 1, .2),
    )

    network = Sequential()
    network.add(model.get_embedding_layer())

    # every stage uses a kernel of size 1 followed by max pooling and dropout
    for n_filters, pool_size, drop_rate in conv_stages:
        network.add(Conv1D(n_filters, 1, activation='relu'))
        network.add(MaxPooling1D(pool_size))
        network.add(Dropout(drop_rate))

    # collapse the sequence dimension before the dense head
    network.add(GlobalAveragePooling1D())

    # two identical hidden layers
    for _ in range(2):
        network.add(Dense(units=64, activation='relu'))

    network.add(Dense(model.num_classes, activation='softmax'))
    network.compile(
        loss='categorical_crossentropy',
        optimizer='rmsprop',
        metrics=['accuracy'],
    )

    return network

In [62]:
# set-up and train model
# CNN built on the GloVe pipeline's embedding layer, trained through the pipeline's fit()
cnn_model = build_cnn_model(embedding_model)
embedding_model.fit(epochs=15, model=cnn_model)

W0616 01:49:31.710656 140041243158400 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3976: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.

W0616 01:49:31.735905 140041243158400 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


using custom model
Train on 46022 samples, validate on 15341 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
completed training model


In [63]:
# same CNN architecture on the BioASQ pipeline's embedding layer; rebinds `cnn_model`
cnn_model = build_cnn_model(bioasq_model)
bioasq_model.fit(epochs=15, model=cnn_model)

using custom model
Train on 46022 samples, validate on 15341 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
completed training model


In [0]:
# same CNN architecture on the PubMed pipeline's embedding layer; rebinds `cnn_model`
# requires `pubmed_model` to have been created and set up in the current kernel session
cnn_model = build_cnn_model(pubmed_model)
pubmed_model.fit(epochs=15, model=cnn_model)

NameError: ignored

### LSTM

In [0]:
# define model
def build_lstm(model):
    """ assemble and compile a single-layer LSTM classifier on top of the pipeline's embedding layer

    :param model: pipeline object providing get_embedding_layer() and num_classes
    :return: compiled keras Sequential model ending in a softmax over the classes
    """
    layers = [
        model.get_embedding_layer(),
        LSTM(64),
        Dense(model.num_classes, activation='softmax'),
    ]

    network = Sequential(layers)
    network.compile(
        loss='categorical_crossentropy',
        optimizer='rmsprop',
        metrics=['accuracy'],
    )

    return network

In [0]:
# set-up and train model
# LSTM built on the GloVe pipeline's embedding layer, trained through the pipeline's fit()
lstm_model = build_lstm(embedding_model)
embedding_model.fit(epochs=4, model=lstm_model)

using custom model
Train on 46022 samples, validate on 15341 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
completed training model


In [0]:
# set-up and train model
# same LSTM architecture on the BioASQ pipeline's embedding layer; rebinds `lstm_model`
# requires `bioasq_model` to have been created and set up in the current kernel session
lstm_model = build_lstm(bioasq_model)
bioasq_model.fit(epochs=25, model=lstm_model)

NameError: ignored

In [0]:
# same LSTM architecture on the PubMed pipeline's embedding layer; rebinds `lstm_model`
lstm_model = build_lstm(pubmed_model)
pubmed_model.fit(epochs=25, model=lstm_model)

using custom model
Train on 46022 samples, validate on 15341 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
completed training model


# FLAIR 

#### NOTE: Because of Colab's 12GB memory limit, reset the runtime before running the code below; otherwise the session will likely crash

In [1]:
!pip3 install https://download.pytorch.org/whl/cpu/torch-1.1.0-cp36-cp36m-linux_x86_64.whl
!pip3 install https://download.pytorch.org/whl/cpu/torchvision-0.3.0-cp36-cp36m-linux_x86_64.whl
!pip install flair
!pip install allennlp

Collecting botocore==1.12.169 (from awscli>=1.11.91->allennlp)
[?25l  Downloading https://files.pythonhosted.org/packages/28/ac/a43d37f371f5854514128d7c54887176b8df3bc9925a25e5096298033f93/botocore-1.12.169-py2.py3-none-any.whl (5.5MB)
[K     |████████████████████████████████| 5.6MB 9.1MB/s 
Installing collected packages: botocore
  Found existing installation: botocore 1.12.165
    Uninstalling botocore-1.12.165:
      Successfully uninstalled botocore-1.12.165
Successfully installed botocore-1.12.169


In [0]:
from flair.data import Corpus
from flair.datasets import ClassificationCorpus
from flair.data import Corpus
from flair.datasets import TREC_6
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
import pandas as pd

# this is the folder in which train, test and dev files reside
data_folder = '/tmp'

## Download Data formatted for FLAIR

In [0]:
def load_data(url, fname):
    """ load data from GitHub for FLAIR

    Reads a tab-separated file into a dataframe and writes the first column of every
    kept row to `fname`, one row per line, without a newline after the last line.
    Rows are kept when the first column is a string with at least two
    space-separated tokens.

    :param url: string, GitHub url (or any path pandas can read) with data to load
    :param fname: string, output file name to write loaded data
    """

    # open the data from GitHub into a dataframe
    # NOTE(review): read_csv treats the first line of the input as a header by default,
    # so that line never reaches the output - confirm the source files start with a header
    df = pd.read_csv(url, encoding='latin-1', sep='\t')

    # skip empty lines; the isinstance check also drops missing (NaN) values,
    # which have no .strip()
    kept_lines = [
        row[0] for row in df.values
        if isinstance(row[0], str) and len(row[0].strip().split(' ')) >= 2
    ]

    # joining guarantees there is no new line at the end of the file, even when the
    # last row of the dataframe is one of the skipped rows; the encoding is explicit
    # so the output does not depend on the platform default
    with open(fname, 'w', encoding='utf-8') as f:
        f.write('\n'.join(kept_lines))

    print(f'loaded data: {fname}')

In [7]:
# (source url, local output path) for each split, in the order TRAIN -> DEV -> TEST
# NOTE(review): the `token` query parameters look like access tokens embedded in the urls;
# confirm they should be committed with the notebook
flair_splits = [
    # TRAIN
    (r'https://raw.githubusercontent.com/Alexjmsherman/nlp_practicum_cohort3_instructor/master/lessons/lesson_6_deep_learning_for_nlp/train.txt?token=ABXRUPUPYBPGSGSNICM66JC5B3PPM',
     '/tmp/train.txt'),
    # DEV
    (r'https://raw.githubusercontent.com/Alexjmsherman/nlp_practicum_cohort3_instructor/master/lessons/lesson_6_deep_learning_for_nlp/dev.txt?token=ABXRUPUSWL2K5JS453CAQYC5B3PLS',
     '/tmp/dev.txt'),
    # TEST
    (r'https://raw.githubusercontent.com/Alexjmsherman/nlp_practicum_cohort3_instructor/master/lessons/lesson_6_deep_learning_for_nlp/test.txt?token=ABXRUPR2ZCYZCH6AUMRHTBC5B3PPS',
     '/tmp/test.txt'),
]

# download every split and write it to its local file
for url, fname in flair_splits:
    load_data(url, fname)

loaded data: /tmp/train.txt
loaded data: /tmp/dev.txt
loaded data: /tmp/test.txt


In [8]:
# 1. load corpus containing training, test and dev data
corpus: Corpus = ClassificationCorpus(data_folder)
  
corpus

2019-06-16 02:14:09,389 Reading data from /tmp
2019-06-16 02:14:09,390 Train: /tmp/train.txt
2019-06-16 02:14:09,391 Dev: /tmp/dev.txt
2019-06-16 02:14:09,392 Test: /tmp/test.txt


<flair.datasets.ClassificationCorpus at 0x7fb6a7ffcc18>

In [9]:
# 2. create the label dictionary
label_dict = corpus.make_label_dictionary()

2019-06-16 02:14:16,672 {'<6>', '<5>', '<11>', '<9>', '<2>', '<10>', '<7>', '<1>', '<0>', '<3>', '<8>', '<4>', '<12>'}


In [0]:
from flair.visual.training_curves import Plotter

# 3. word-level embeddings that feed the document encoder
word_embeddings = [
    WordEmbeddings('glove'),

    # comment in flair embeddings for state-of-the-art results
    # FlairEmbeddings('news-forward'),
    # FlairEmbeddings('news-backward'),
]

# 4. document embedding built from the list of word embeddings
# Can choose between many RNN types (GRU by default, to change use rnn_type parameter)
document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
    word_embeddings,
    reproject_words=True,
    reproject_words_dimension=256,
    hidden_size=512,
)

# 5. text classifier on top of the document embedding
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

# 6. trainer tying the classifier to the corpus
trainer = ModelTrainer(classifier, corpus)

# 7. run the training; outputs are written under the given folder
trainer.train(
    'resources/taggers/ag_news',
    max_epochs=5,
    learning_rate=0.1,
    mini_batch_size=32,
    anneal_factor=0.5,
    patience=5,
)

# 8. plot training curves (optional)
plotter = Plotter()
plotter.plot_training_curves('resources/taggers/ag_news/loss.tsv')
plotter.plot_weights('resources/taggers/ag_news/weights.txt')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


2019-06-16 02:14:33,154 ----------------------------------------------------------------------------------------------------
2019-06-16 02:14:33,155 Evaluation method: MICRO_F1_SCORE
2019-06-16 02:14:33,716 ----------------------------------------------------------------------------------------------------
2019-06-16 02:14:34,386 epoch 1 - iter 0/1079 - loss 2.71045065
2019-06-16 02:14:37,317 epoch 1 - iter 107/1079 - loss 2.22840287
2019-06-16 02:14:40,508 epoch 1 - iter 214/1079 - loss 2.12121158
2019-06-16 02:14:43,758 epoch 1 - iter 321/1079 - loss 2.06519325
2019-06-16 02:14:46,507 epoch 1 - iter 428/1079 - loss 2.02248667
2019-06-16 02:14:49,758 epoch 1 - iter 535/1079 - loss 1.98986950
2019-06-16 02:14:53,035 epoch 1 - iter 642/1079 - loss 1.95880347
2019-06-16 02:14:55,808 epoch 1 - iter 749/1079 - loss 1.93211463
2019-06-16 02:14:59,053 epoch 1 - iter 856/1079 - loss 1.90895802
2019-06-16 02:15:02,228 epoch 1 - iter 963/1079 - loss 1.88603325
2019-06-16 02:15:04,926 epoch 1 - 

In [0]:
# 3. make a list of word embeddings
# NOTE(review): the function defined further down in this cell is also named
# `word_embeddings`, so this list is rebound to the function once the cell has run;
# one of the two names should be changed
word_embeddings = [WordEmbeddings('glove'),

                   # comment in flair embeddings for state-of-the-art results
                   # FlairEmbeddings('news-forward'),
                   # FlairEmbeddings('news-backward'),
                   ]


def word_embeddings(word_embeddings):
    # 4. initialize document embedding by passing list of word embeddings
    # Can choose between many RNN types (GRU by default, to change use rnn_type parameter)
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256
    )

    # 5. create the text classifier
    classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

    # 6. initialize the text classifier trainer
    trainer = ModelTrainer(classifier, corpus)

    # 7. start the training
    trainer.train('resources/taggers/ag_news',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=5,
                  max_epochs=5)

    # 8. plot training curves (optional)
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_training_curves('resources/taggers/ag_news/loss.tsv')
    plotter.plot_weights('resources/taggers/ag_news/weights.txt')