## Training a Neural Entity Extractor using PubMed Word Embeddings

### Step 1

#### Copy the Embeddings from source location to destination location <br>

In [None]:
# From Azure ML CLI
# (1) upgrade tensorflow to the GPU version
# !conda install tensorflow-gpu
# (2) install fastparquet
# !conda install fastparquet

In [41]:
import tensorflow
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
tensorflow.device('/gpu:0')

[name: "/cpu:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 14594599266161431555
, name: "/gpu:0"
device_type: "GPU"
memory_limit: 174063616
locality {
  bus_id: 1
}
incarnation: 14645120121624889074
physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 1852:00:00.0"
]


<contextlib._GeneratorContextManager at 0x10d2f625438>

#### Generate the Embedding Matrix from parquet files on the Container linked to your Spark Cluster

In [2]:
#Generate the Embedding Matrix by reading the TSV files
from azure.storage.blob import BlockBlobService
import numpy as np
import datetime

import os
import shutil
import fastparquet
import pickle
import os 

window_size = 5
vector_size = 50
min_count =400

home_dir = "C:\dl4nlp"
print("home_dir = {}".format(home_dir))

#Specify the string to look for in blob names from your container
embedding_relative_path = "Models/word2vec_pubmed_model_vs_{}_ws_{}_mc_{}_parquet_files".\
    format(vector_size, window_size, min_count)
        
data_folder = "Data/Drugs_and_Diseases/"
train_file_path = os.path.join(data_folder, "train_out.txt")
test_file_path = os.path.join(data_folder, "test.txt")

#Azure BLOB Storage account information
storage_account_name = '76f8577bf451dsvm'
storage_account_key ='5DPDh+p3Xbg9BfS9d/OSrtQ/Utrat1Rr/NRrGU+x3cRYPZYi6B92WEWUIkM28Z8cGRsRz0cuSGb2mjyBCB0QXg=='
storage_container_name ='dl4nlp-container'
timestart = datetime.datetime.now()

block_blob_service = BlockBlobService(account_name = storage_account_name, account_key = storage_account_key)

#Specify the path where to store the downloaded files

embedding_full_path = os.path.join(home_dir, embedding_relative_path)
print("embedding_full_path= {}".format(embedding_full_path))

if os.path.exists(embedding_full_path):
    shutil.rmtree(embedding_full_path)

os.makedirs(embedding_full_path)
        
num_parquet_files = 0
generator = block_blob_service.list_blobs(storage_container_name)
for blob in generator:      
    if embedding_relative_path in blob.name and blob.name.endswith(".parquet"):              
        num_parquet_files = num_parquet_files +1
        filename = blob.name.split("/")[-1]
        block_blob_service.get_blob_to_path(storage_container_name, blob.name, os.path.join(embedding_full_path,filename))      
        
print ("Reading {} parquet files".format(num_parquet_files))
timeend = datetime.datetime.now()
timedelta = round((timeend-timestart).total_seconds() / 60, 2)
print ("Time taken to execute above cell: " + str(timedelta) + " mins")

home_dir = C:\dl4nlp
embedding_full_path= C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files
Reading 1000 parquet files
Time taken to execute above cell: 1.48 mins


Save the embedding matrix into a pickle file

In [3]:
import pandas 
import datetime
timestart = datetime.datetime.now()

print ("Embedding vector size =", vector_size)

embedding_pickle_file = os.path.join(home_dir, "Models/w2vmodel_pubmed_vs_{}_ws_{}_mc_{}.pkl" \
 .format(vector_size, window_size, min_count))

Word2Vec_Model = {}

print("Reading the Parquet embedding files ....")
files = os.listdir(embedding_full_path)
for index, filename in enumerate(files):
    if "part" in filename:        
        parquet_file_path = os.path.join(embedding_full_path,filename)
        print("reading {}".format(parquet_file_path))

        try:
            pfile = fastparquet.ParquetFile(parquet_file_path) 
            # convert to pandas dataframe
            df =  pfile.to_pandas()    
#             df = pandas.read_csv(tsv_full_path, sep='\t')
        
            #print(df.head())    
            arr = list(df.values)                 
            for ind, vals in enumerate(arr):
                word = vals[0]
                word_vec = vals[-vector_size:]
                word_vec = np.array(word_vec)
                Word2Vec_Model[word] = word_vec.astype('float32')
        except:
            print("Skip {}".format(filename))
            
#save the embedding matrix into a pickle file
print("save the embedding matrix of {} entries into a pickle file".format(len(Word2Vec_Model)))
pickle.dump(Word2Vec_Model, open(embedding_pickle_file, "wb")) 
    
timeend = datetime.datetime.now()
timedelta = round((timeend-timestart).total_seconds() / 60, 2)
print ("Time taken to execute above cell: " + str(timedelta) + " mins")

Embedding vector size = 50
Reading the Parquet embedding files ....
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00000-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00001-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00002-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00003-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00004-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00005-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00006-858f31b5-52a6-

reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00062-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00063-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00064-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00065-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00066-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00067-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00068-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_

reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00124-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00125-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00126-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00127-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00128-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00129-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00130-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_

reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00186-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00187-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00188-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00189-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00190-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00191-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00192-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_

reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00247-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00248-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00249-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00250-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00251-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00252-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00253-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_

reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00307-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00308-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00309-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00310-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00311-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00312-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00313-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_

reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00369-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00370-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00371-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00372-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00373-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00374-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00375-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_

reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00431-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00432-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00433-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00434-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00435-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00436-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00437-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_

reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00490-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00491-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00492-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00493-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00494-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00495-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00496-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_

reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00551-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00552-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00553-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00554-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00555-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00556-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00557-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_

reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00611-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00612-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00613-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00614-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00615-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00616-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00617-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_

reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00674-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00675-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00676-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00677-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00678-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00679-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00680-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_

reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00736-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00737-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00738-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00739-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00740-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00741-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00742-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_

reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00799-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00800-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00801-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00802-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00803-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00804-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00805-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_

reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00859-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00860-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00861-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00862-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00863-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00864-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00865-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_

reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00920-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00921-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00922-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00923-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00924-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00925-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00926-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_

reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00983-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00984-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00985-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00986-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00987-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00988-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_pubmed_model_vs_50_ws_5_mc_400_parquet_files\part-00989-858f31b5-52a6-49cf-b541-9fc814fb1662.gz.parquet
reading C:\dl4nlp\Models/word2vec_

#### Copy the Training Data, Testing Data, Evaluation Script to destination location

In [4]:
len(Word2Vec_Model)

126855

In [10]:
from azure.storage.blob import BlockBlobService
import os
block_blob_service = BlockBlobService(account_name = storage_account_name, account_key = storage_account_key)

generator = block_blob_service.list_blobs(storage_container_name)

if not os.path.exists(os.path.join(home_dir, data_folder)):
    os.makedirs(os.path.join(home_dir, data_folder))
    
local_train_file_path  = os.path.join(home_dir, train_file_path)
local_test_file_path  = os.path.join(home_dir, test_file_path)

block_blob_service.get_blob_to_path(storage_container_name, train_file_path, local_train_file_path)
block_blob_service.get_blob_to_path(storage_container_name, test_file_path, local_test_file_path)


<azure.storage.blob.models.Blob at 0x23f75fbb748>

#### Prepare the Training and Testing Data in the correct format for Keras

In [28]:
# %%writefile Data_Preparation2.py
from keras.preprocessing import sequence
import numpy as np
#Pytnon 2
#import cPickle as cpickle

#Pytnon 3
import _pickle as cPickle

class DataPreparation:

#     def __init__ (self, num_classes, seq_length, train_file=None, test_file=None, embeddings_file=None, vector_size = 100):
    def __init__ (self, num_classes, seq_length, embeddings_file=None, vector_size = 100):
        # Some constants
        self.DEFAULT_N_CLASSES = num_classes
        self.DEFAULT_N_FEATURES = vector_size
        self.DEFAULT_MAX_SEQ_LENGTH = seq_length
        
        # Other stuff
        self.wordvecs = None
        self.word_to_ix_map = {}
        self.n_features = 0
        self.n_tag_classes = 0
        self.n_sentences_all = 0
        self.tag_vector_map = {}
        
        self.max_sentence_len_train = 0
        self.max_sentence_len_test = 0
        self.max_sentence_len = 0
        
#         self.all_X_train = []
#         self.all_Y_train = []
#         self.all_X_test = []
#         self.all_Y_test = []
        self.unk_words = []
        
        self.load_embedding_lookup_table(embeddings_file)
#         self.read_and_parse_data(train_file, test_file, embeddings_file)
            
#     def get_data (self):
#         return (self.all_X_train, self.all_Y_train, \
#                 self.all_X_test, self.all_Y_test, \
#                 self.wordvecs)
    
    def decode_prediction_sequence (self, pred_seq):
        
        pred_tags = []
        for class_prs in pred_seq:
            class_vec = np.zeros(self.DEFAULT_N_CLASSES, dtype=np.int32)
            class_vec[np.argmax(class_prs)] = 1
            if tuple(class_vec.tolist()) in self.tag_vector_map:
                pred_tags.append(self.tag_vector_map[tuple(class_vec.tolist())])
            else:
                print(tuple(class_vec.tolist()))
        return pred_tags
    
    def load_embedding_lookup_table (self, embeddings_file):
        
        ###Load the Word2Vec Model###
        print("Loading the W2V model from file {}".format(embeddings_file))
        #W2V_model = cPickle.load(open(embeddings_file, "rb"))
        with open(embeddings_file, 'rb') as f:
            W2V_model = cPickle.load(f, encoding='bytes')                     
            
        vocab = list(W2V_model.keys())       
        
        self.word_to_ix_map = {}
        self.wordvecs = []
        
        ###Create LookUp Table for words and their word vectors###
        print("Creating the lookup table")
        for index, word in enumerate(vocab):
            self.word_to_ix_map[word] = index
            self.wordvecs.append(W2V_model[vocab[index]])
            
           
        self.wordvecs = np.array(self.wordvecs)
        print("Number of entries in the lookup table = {}".format(len(self.wordvecs)))
        self.n_features = len(self.wordvecs[0])
        print("embedding size = {}".format(self.n_features))
        
        # Add a zero vector for the Paddings
        self.wordvecs = np.vstack((self.wordvecs, np.zeros(self.DEFAULT_N_FEATURES)))
        zero_vec_pos = self.wordvecs.shape[0] - 1
        
        print("Done")
        return (self.wordvecs)
    
    
    
    ##########################  READ TRAINING DATA  ######################### 
    def read_and_parse_training_data (self, train_file, skip_unknown_words = False):
        
        print("Loading the training data from file {}".format(train_file))
        with open(train_file, 'r') as f_train:
            
            self.n_tag_classes = self.DEFAULT_N_CLASSES
            self.tag_vector_map = {}    # For storing one hot vector notation for each Tag
            tag_class_id = 0            # Used to put 1 in the one hot vector notation
            raw_data_train = []
            raw_words_train = []
            raw_tags_train = []        

            # Process all lines in the file
            for line in f_train:
                line = line.strip()
                if not line:
                    raw_data_train.append( (tuple(raw_words_train), tuple(raw_tags_train)))
                    raw_words_train = []
                    raw_tags_train = []
                    continue
                
                word, tag = line.split('\t')
                
                raw_words_train.append(word)
                raw_tags_train.append(tag)
                
                if tag not in self.tag_vector_map:
                    one_hot_vec = np.zeros(self.DEFAULT_N_CLASSES, dtype=np.int32)
                    one_hot_vec[tag_class_id] = 1
                    self.tag_vector_map[tag] = tuple(one_hot_vec)
                    self.tag_vector_map[tuple(one_hot_vec)] = tag
                    tag_class_id += 1
                    
        print("number of training examples = " + str(len(raw_data_train)))
        
        #Adding a None Tag
        one_hot_vec = np.zeros(self.DEFAULT_N_CLASSES, dtype = np.int32)
        one_hot_vec[tag_class_id] = 1
        self.tag_vector_map['NONE'] = tuple(one_hot_vec)
        self.tag_vector_map[tuple(one_hot_vec)] = 'NONE'
        tag_class_id += 1
        
        self.n_sentences_all = len(raw_data_train)

        # Find the maximum sequence length for Training data
        self.max_sentence_len_train = 0
        for seq in raw_data_train:
            if len(seq[0]) > self.max_sentence_len_train:
                self.max_sentence_len_train = len(seq[0])                
        
         ############## Create Train Vectors################
        all_X_train, all_Y_train = [], []
        
        self.unk_words = []
        count = 0
        for word_seq, tag_seq in raw_data_train:  
            
            elem_wordvecs, elem_tags = [], []            
            for ix in range(len(word_seq)):
                w = word_seq[ix]
                t = tag_seq[ix]
                w = w.lower()
                if w in self.word_to_ix_map :
                    count += 1
                    elem_wordvecs.append(self.word_to_ix_map[w])
                    elem_tags.append(self.tag_vector_map[t])

                elif "UNK" in self.word_to_ix_map :
                    elem_wordvecs.append(self.word_to_ix_map["UNK"])
                    elem_tags.append(self.tag_vector_map[t])
                
                else:
                    w = "UNK"       
                    new_wv = 2 * np.random.randn(self.DEFAULT_N_FEATURES) - 1 # sample from normal distribution
                    norm_const = np.linalg.norm(new_wv)
                    new_wv /= norm_const
                    self.wordvecs = np.vstack((self.wordvecs, new_wv))
                    self.word_to_ix_map[w] = self.wordvecs.shape[0] - 1
                    elem_wordvecs.append(self.word_to_ix_map[w])
                    elem_tags.append(list(self.tag_vector_map[t]))

            
            # Pad the sequences for missing entries to make them all the same length
            nil_X = zero_vec_pos
            nil_Y = np.array(self.tag_vector_map['NONE'])
            pad_length = self.max_sentence_len - len(elem_wordvecs)
            all_X_train.append( ((pad_length)*[nil_X]) + elem_wordvecs)
            all_Y_train.append( ((pad_length)*[nil_Y]) + elem_tags)

        all_X_train = np.array(all_X_train)
        all_Y_train = np.array(all_Y_train)
        
        print("UNK WORD COUNT = " + str(len(self.unk_words)))
        print("Found WORDS COUNT = " + str(count))
        print("TOTAL WORDS = " + str(count+len(self.unk_words)))    
        
        print("Done")
        
        return (all_X_train, all_Y_train)
    
    
    
    ##########################  READ TEST DATA  ######################### 
    def read_and_parse_test_data (self, test_file, skip_unknown_words = False):
        
        print("Loading test data from file {}".format(test_file))
        with open(test_file, 'r') as f_test:
            
            self.n_tag_classes = self.DEFAULT_N_CLASSES
            tag_class_id = 0 
            raw_data_test = []
            raw_words_test = []
            raw_tags_test = []        

            # Process all lines in the file
            for line in f_test:
                line = line.strip()
                if not line:
                    raw_data_test.append( (tuple(raw_words_test), tuple(raw_tags_test)))
                    raw_words_test = []
                    raw_tags_test = []
                    continue
                
                word, tag = line.split('\t') 
                
                if tag not in self.tag_vector_map:
                    print("added")
                    one_hot_vec = np.zeros(self.DEFAULT_N_CLASSES, dtype=np.int32)
                    one_hot_vec[tag_class_id] = 1
                    self.tag_vector_map[tag] = tuple(one_hot_vec)
                    self.tag_vector_map[tuple(one_hot_vec)] = tag
                    tag_class_id += 1
                
                raw_words_test.append(word)
                raw_tags_test.append(tag)                
                                    
        print("number of test examples = " + str(len(raw_data_test)))
        self.n_sentences_all = len(raw_data_test)

#         # Find the maximum sequence length for Test Data
#         self.max_sentence_len_test = 0
#         for seq in raw_data_test:
#             if len(seq[0]) > self.max_sentence_len_test:
#                 self.max_sentence_len_test = len(seq[0])
                
#         #Find the maximum sequence length in both training and Testing dataset
#         self.max_sentence_len = max(self.max_sentence_len_train, self.max_sentence_len_test)               
        
        ########################Create TEST Feature Vectors##########################
        self.all_X_test, self.all_Y_test = [], []
        
        for word_seq, tag_seq in raw_data_test:  
            
            elem_wordvecs, elem_tags = [], []            
            for ix in range(len(word_seq)):
                w = word_seq[ix]
                t = tag_seq[ix]
                w = w.lower()
                if w in self.word_to_ix_map:
                    count += 1
                    elem_wordvecs.append(self.word_to_ix_map[w])
                    elem_tags.append(self.tag_vector_map[t])
                    
                elif "UNK" in self.word_to_ix_map :
                    self.unk_words.append(w)
                    elem_wordvecs.append(self.word_to_ix_map["UNK"])
                    elem_tags.append(self.tag_vector_map[t])
                    
                else:
                    self.unk_words.append(w)
                    w = "UNK"
                    self.word_to_ix_map[w] = self.wordvecs.shape[0] - 1
                    elem_wordvecs.append(self.word_to_ix_map[w])
                    elem_tags.append(self.tag_vector_map[t])
                
            # Pad the sequences for missing entries to make them all the same length
            nil_X = zero_vec_pos
            nil_Y = np.array(self.tag_vector_map['NONE'])
            pad_length = self.max_sentence_len - len(elem_wordvecs)
            all_X_test.append( ((pad_length)*[nil_X]) + elem_wordvecs)
            all_Y_test.append( ((pad_length)*[nil_Y]) + elem_tags)

        all_X_test = np.array(all_X_test)
        all_Y_test = np.array(all_Y_test)
        
        print("UNK WORD COUNT = " + str(len(self.unk_words)))
        print("Found WORDS COUNT = " + str(count))
        print("TOTAL WORDS = " + str(count+len(self.unk_words)))         
        
        print("Done")
        
        return (all_X_test, all_Y_test)
                                         
        
        
    ##########################  READ UNLABELED DATA  ######################### 
    def read_and_parse_unlabeled_data (self, data_file, skip_unknown_words = False):
        
        print("Loading unlabeled data from file {}".format(data_file))
        with open(data_file, 'r') as f_data:
            
            self.n_tag_classes = self.DEFAULT_N_CLASSES
            tag_class_id = 0 
            raw_data_test = []
            raw_words_test = []
#             raw_tags_test = []        

            # Process all lines in the file
            for line in f_data:
                line = line.strip()
                if not line:
                    #flush
                    raw_data_test.append( tuple(raw_words_test) )
                    raw_words_test = []
#                     raw_tags_test = []
                    continue
                
#                 word, tag = line.split('\t') 
                word = line
                
#                 if tag not in self.tag_vector_map:
#                     print("added")
#                     one_hot_vec = np.zeros(self.DEFAULT_N_CLASSES, dtype=np.int32)
#                     one_hot_vec[tag_class_id] = 1
#                     self.tag_vector_map[tag] = tuple(one_hot_vec)
#                     self.tag_vector_map[tuple(one_hot_vec)] = tag
#                     tag_class_id += 1
                
                raw_words_test.append(word)
#                 raw_tags_test.append(tag)                
                                    
        print("number of unlabeled examples = " + str(len(raw_data_test)))
        self.n_sentences_all = len(raw_data_test)

#         # Find the maximum sequence length for Test Data
#         self.max_sentence_len_test = 0
#         for seq in raw_data_test:
#             if len(seq[0]) > self.max_sentence_len_test:
#                 self.max_sentence_len_test = len(seq[0])
                
#         #Find the maximum sequence length in both training and Testing dataset
#         self.max_sentence_len = max(self.max_sentence_len_train, self.max_sentence_len_test)               
        
        ########################Create UNLABELED DATA Feature Vectors##########################
        all_X_data = []
        
        for word_seq in raw_data_test:  
            
            elem_wordvecs = []            
            for ix in range(len(word_seq)):
                w = word_seq[ix]
                
                w = w.lower()
                if w in self.word_to_ix_map:
                    count += 1
                    elem_wordvecs.append(self.word_to_ix_map[w])                
                    
                elif "UNK" in self.word_to_ix_map :
                    self.unk_words.append(w)
                    elem_wordvecs.append(self.word_to_ix_map["UNK"])                    
                    
                else:
                    self.unk_words.append(w)
                    w = "UNK"
                    self.word_to_ix_map[w] = self.wordvecs.shape[0] - 1
                    elem_wordvecs.append(self.word_to_ix_map[w])                    
                
            # Pad the sequences for missing entries to make them all the same length
            nil_X = zero_vec_pos
            
            pad_length = self.max_sentence_len - len(elem_wordvecs)
            all_X_data.append( ((pad_length)*[nil_X]) + elem_wordvecs)            

        all_X_data = np.array(all_X_data)        
        
        print("UNK WORD COUNT = " + str(len(self.unk_words)))
        print("Found WORDS COUNT = " + str(count))
        print("TOTAL WORDS = " + str(count+len(self.unk_words)))         
        
        print("Done")
        
        return (all_X_data)
 

#### Step 3 Create the Neural Network Model in Keras

In [29]:
# %%writefile Entity_Extractor.py
from keras.preprocessing import sequence
from keras.models import load_model
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers.core import Activation
from keras.regularizers import l2
from keras.layers.wrappers import TimeDistributed
from keras.layers.wrappers import Bidirectional
from keras.layers.normalization import BatchNormalization
from keras.layers import Embedding
from keras.layers.core import Dropout
import numpy as np
import pandas as pd
import sys
import keras.backend as K
from sklearn.metrics import confusion_matrix, classification_report

# For reproducibility
np.random.seed(42)

class EntityExtractor:

    def __init__ (self, reader):
        
        self.reader = reader
        self.model = None
#         self.all_X_train, self.all_Y_train, self.all_X_test, self.all_Y_test, self.wordvecs = \
#             reader.get_data()
#         self.train_X = self.all_X_train
#         self.train_Y = self.all_Y_train
        
#         self.test_X = self.all_X_test
#         self.test_Y = self.all_Y_test
        
    def load (self, filepath):
        self.model = load_model(filepath)
        
    def save (self, filepath):
        self.model.save(filepath)

    def print_summary (self):
        print(self.model.summary())        
   
    def train (self, train_file, network_type = 'unidirectional', \
               num_epochs = 1, batch_size = 50, dropout = 0.2, reg_alpha = 0.0, \
               num_hidden_units = 150, num_layers = 1):
        
        self.train_X, self.train_Y = read_and_parse_training_data(train_file)
#         self.train_X = self.all_X_train
#         self.train_Y = self.all_Y_train
        
#         self.test_X = self.all_X_test
#         self.test_Y = self.all_Y_test

        print("Data Shape: ")
        print(train_X.shape)
        print(train_Y.shape)        
        
        dropout = 0.2                
                
        self.model = Sequential()        
        self.model.add(Embedding(self.wordvecs.shape[0], self.wordvecs.shape[1], \
                                 input_length = train_X.shape[1], \
                                 weights = [self.wordvecs], trainable = False))
        
        if network_type == 'unidirectional':
            # uni-directional LSTM
            self.model.add(LSTM(num_hidden_units, return_sequences = True))
        else:
            # bi-directional LSTM
            self.model.add(Bidirectional(LSTM(num_hidden_units, return_sequences = True)))
        
        self.model.add(Dropout(dropout))

        for i in range(1, layers):
            if network_type == 'unidirectional':
                # uni-directional LSTM
                self.model.add(LSTM(num_hidden_units, return_sequences = True))
            else:
                # bi-directional LSTM
                self.model.add(Bidirectional(LSTM(num_hidden_units, return_sequences = True)))
        
            self.model.add(Dropout(dropout))

        self.model.add(TimeDistributed(Dense(train_Y.shape[2], activation='softmax')))

        self.model.compile(loss='categorical_crossentropy', optimizer='adam')
        print(self.model.summary())

        self.model.fit(train_X, train_Y, epochs = num_epochs, batch_size = batch_size)
                
              
    def score_model(self, data_file):
        
        data_X = read_and_parse_unlabeled_data(data_file)
        print("Data Shape: ")        
        print(data_X.shape)
        
        predicted_tags= []
        ind = 0
        for x in data_X:
            tags = self.model.predict(np.array([x]), batch_size=1)[0]
            predicted_tags = self.reader.decode_prediction_sequence(tags)
            ind += 1
            ### To see Progress ###
            if ind%500 == 0: 
                print("Sentence" + str(ind))
        
        predicted_tags = np.array(predicted_tags)
        return (predicted_tags)
    
    def evaluate_model(self, test_file):
        target = open("Pubmed_Output.txt", 'w')
        
        test_X, test_Y = read_and_parse_test_data(test_file)
        
        print("Data Shape: ")        
        print(test_X.shape)
        print(test_Y.shape)
        
        predicted_tags= []
        test_data_tags = []
        ind = 0
        for x,y in zip(test_X, test_Y):
            tags = self.model.predict(np.array([x]), batch_size=1)[0]
            pred_tags = self.reader.decode_prediction_sequence(tags)
            test_tags = self.reader.decode_prediction_sequence(y)
            ind += 1
            ### To see Progress ###
            if ind%500 == 0: 
                print("Sentence" + str(ind))

            pred_tag_wo_none = []
            test_tags_wo_none = []
            
            for index, test_tag in enumerate(test_tags):
                if test_tag != "NONE":
                    if pred_tags[index] == "B-Chemical":
                        pred_tag_wo_none.append("B-Drug")
                    elif pred_tags[index] == "I-Chemical":
                        pred_tag_wo_none.append("I-Drug")
                    elif pred_tags[index] == 'None':
                        pred_tag_wo_none.append('O')
                    else:
                        pred_tag_wo_none.append(pred_tags[index])
                        
                    if test_tag == "B-Chemical":
                        test_tags_wo_none.append("B-Drug")
                    elif test_tag == "I-Chemical":
                        test_tags_wo_none.append("I-Drug")
                    else:                        
                        test_tags_wo_none.append(test_tag)
            
            for wo in pred_tag_wo_none:
                target.write(str(wo))
                target.write("\n")
            target.write("\n")
            
            for i,j in zip(pred_tags, test_tags):
                if i != "NONE" and j != "NONE":
                    test_data_tags.append(j)
                    predicted_tags.append(i)

        target.close()
        
        predicted_tags = np.array(predicted_tags)
        test_data_tags = np.array(test_data_tags)
        print(classification_report(test_data_tags, predicted_tags))

        simple_conf_matrix = confusion_matrix(test_data_tags,predicted_tags)
        all_tags = sorted(list(set(test_data_tags)))
        conf_matrix = pd.DataFrame(columns = all_tags, index = all_tags)
        for x,y in zip(simple_conf_matrix, all_tags):
            conf_matrix[y] = x
        conf_matrix = conf_matrix.transpose()
        
        return conf_matrix

#### Step 4 Train the network on the prepared data and obtain the predictions on the test set

In [34]:
import os
os.getcwd()

'C:\\Users\\hacker\\AppData\\Local\\Temp\\2\\azureml_runs\\NLP_DL_EntityRecognition_1504996997761'

In [23]:
# from Data_Preparation2 import Data_Preparation2
# from Entity_Extractor import Entity_Extractor
#import cPickle as cp
from keras.models import load_model
import numpy as np

# TRAIN_FILEPATH = "Drugs_and_Diseases//train_out.txt"
# TEST_FILEPATH = "Drugs_and_Diseases//test.txt"
network_type= 'unidirectional'
# network_type= 'bidirectional'
vector_size = 50
num_classes = 7 + 1
seq_length = 613
num_layers = 1
num_epochs = 1

if __name__ == "__main__":
    print("Running on BIO-NLP data\n\n")    

    # Read the data
    print("Initializing data...")
    reader = Data_Preparation(num_classes, seq_length, local_train_file_path, local_test_file_path,\
                              embedding_pickle_file, vector_size)
        
    # Train the model
    print("Training model... epochs = {0}, layers = {1}".format(num_epochs, num_layers))
    model_file_path = os.path.join(home_dir,'Models/lstm_{}_model_lyrs_{}_epchs_{}_vs_{}_ws_{}_mc_{}.h5'.\
                  format(network_type, num_layers, num_epochs, vector_size,window_size, min_count ))
        
    K.clear_session()
    with K.get_session() as sess:        
        K.set_session(sess)
        graphr = K.get_session().graph
        with graphr.as_default():
            entityExtractor = Entity_Extractor(reader)
            entityExtractor.train(network_type = 'unidirectional', epochs=num_epochs, layers=num_layers)    

            #Save the model
            entityExtractor.save(model_file_path)
            
            # Evaluate the model
            print("Evaluating model...")
            confusion_matrix = entityExtractor.evaluate_model()
            print(confusion_matrix)

            print("Done.")     
            
                
        K.clear_session()
        K.set_session(None)

Running on BIO-NLP data


Initializing data...
Loading the W2V model from file C:\dl4nlp\Models/w2vmodel_pubmed_vs_50_ws_5_mc_400.pkl
Creating the lookup table
Number of entries in the lookup table = 126855
embedding size = 50
Loading the training data from file C:\dl4nlp\Data/Drugs_and_Diseases/train_out.txt
number of training examples = 15380
Loading test data from file C:\dl4nlp\Data/Drugs_and_Diseases/test.txt
number of test examples = 5075
UNK WORD COUNT = 20711
Found WORDS COUNT = 399059
TOTAL WORDS = 419770
Training model... epochs = 1, layers = 1
Data Shapes
(15380, 613)
(15380, 613, 8)
(5075, 613)
(5075, 613, 8)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 613, 50)           6342850   
_________________________________________________________________
lstm_1 (LSTM)                (None, 613, 150)          120600    
_________________________________________

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

  B-Disease       0.00      0.00      0.00      4049
     B-Drug       0.71      0.31      0.43      5297
  I-Disease       0.00      0.00      0.00      2218
     I-Drug       0.00      0.00      0.00       574
          O       0.91      1.00      0.95    105355

avg / total       0.85      0.91      0.87    117493

           B-Disease  B-Drug  I-Disease  I-Drug       O
B-Disease          0      71          0       0    3978
B-Drug             0    1643          0       0    3654
I-Disease          0       4          0       0    2214
I-Drug             0     107          0       0     467
O                  0     494          0       0  104861
Done.


NameError: name 'self' is not defined

#### Step 5 Generate the output of the model in the correct format for evaluation

In [15]:
# #save then load the model
# model.save('./outputs/my_model.h5')  # creates a HDF5 file 'my_model.h5'

# # returns a compiled model
# # identical to the previous one
# model_reloaded = load_model('./outputs/my_model.h5')

# # calculate predictions
# predictions = model_reloaded.predict(X)
# print('model reloaded')

model_file_path = os.path.join(home_dir,'./Models/lstm_model_lyrs_{}_epchs_{}_vs_{}_{}.h5'.\
                  format(num_layers, num_epochs, vector_size, network_type))
    
entityExtractor.save(model_file_path)

RuntimeError: Attempted to use a closed Session.

Step 5 Generate the output of the model in the correct format for evaluation

In [11]:
file1 = open("Pubmed_Output.txt")
file2 = open("Drugs_and_Diseases//test.txt")
target = open("Drugs_and_Diseases//eval2.txt", "w")

list1 = []
list2 = []

for line in file1:
    list1.append(line)
    
for line in file2:
    list2.append(line)
    
for ind, line in enumerate(list2):
    x = line.split("\t")
    if len(x) == 1:
        target.write("\n")
    else:
        target.write(x[0])
        target.write("\t")
        if list1[ind] == "NONE":
            target.write("O")
        else:
            target.write(list1[ind])
    ind += 1
    
file1.close()
file2.close()
target.close()

#### Evaluate the model predictions on the test data

In [12]:
!./Drugs_and_Diseases/evalD_a_D.pl Drugs_and_Diseases/eval2.txt Drugs_and_Diseases/test.txt #with Embedding Layer

                              Biomedical Entity Recognition Performance (Genaral)                                         
                                                                                         number(recall/precision/f-score) 
+------------------+---------------------------------+---------------------------------+---------------------------------+
|                  |          complete match         |       right boundary match      |       left boundary match       |
+------------------+---------------------------------+---------------------------------+---------------------------------+
|  Disease  (4083) | 2775 (67.96% / 70.36% / 69.14%) | 3211 (78.64% / 81.41% / 80.00%) | 2969 (72.72% / 75.28% / 73.98%) |
+------------------+---------------------------------+---------------------------------+---------------------------------+
|   Drug    (5392) | 4237 (78.58% / 75.43% / 76.97%) | 4299 (79.73% / 76.54% / 78.10%) | 4336 (80.42% / 77.19% / 78.77%) |
+-------