In [None]:
import pandas as pd
import pickle
import re, string
import numpy as np
from tqdm import tqdm
import random

# Medical adaptation of the FastText model

In [None]:
# This part was conducted using the gensim package implementation:  https://radimrehurek.com/gensim/models/fasttext.html
# The pretrained FastText vectors are available for download at:  https://fasttext.cc/docs/en/crawl-vectors.html
# optional code:
import os
import urllib.request
url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.he.300.bin.gz'
filename = './models/cc.he.300.bin.gz'
# urlretrieve does not create missing directories, so make sure the target folder exists.
os.makedirs(os.path.dirname(filename), exist_ok=True)
# Skip the download when the file is already on disk, so re-running the notebook
# does not fetch it again.
if not os.path.exists(filename):
    # Download to a temporary name and rename on success, so an interrupted
    # download is never mistaken for a complete file on the next run.
    partial_filename = filename + '.part'
    urllib.request.urlretrieve(url, partial_filename)
    os.replace(partial_filename, filename)

In [None]:
# Load the pretrained Facebook FastText model (full model, so training can be continued on it)
from gensim.models.fasttext import load_facebook_model

pretrained_model_path = './models/cc.he.300.bin.gz'
prev_model = load_facebook_model(pretrained_model_path)

In [None]:
# Load pretraining data
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
# The context manager closes the file handle as soon as the object is read.
with open('./data/pretraining_data.pickle', 'rb') as pickle_file:
    pretraining_data = pickle.load(pickle_file)

In [None]:
# Build the training corpus: one list of whitespace-separated tokens per row of the
# pretraining data, with punctuation characters and digits stripped out first.
regex_num = r'\d'
regex = re.compile('[%s]' % re.escape(string.punctuation))


def _tokenize(text):
    """Remove punctuation, then digits, from `text` and split it on whitespace."""
    without_punctuation = regex.sub('', text)
    without_digits = re.sub(regex_num, '', without_punctuation)
    return without_digits.split()


prep_text = [_tokenize(row.text) for row in pretraining_data.itertuples()]

In [None]:
# Continue training the pretrained model on the new corpus.
# Step 1: extend the existing vocabulary with the words found in the new data.
prev_model.build_vocab(prep_text, update=True)
# Step 2: train for 5 epochs, using the corpus size recorded on the model.
prev_model.train(
    prep_text,
    total_examples=prev_model.corpus_count,
    epochs=5,
)

In [None]:
# Save the updated model; the LSTM section below loads it again from this path.
prev_model.save('./models/medical_fast_text.model')

# Train an LSTM model using the FastText embedding

## Encode the training data

In [None]:
# Reload the adapted model that was saved by the first part of the notebook
from gensim.models import FastText

fasttext_model = FastText.load('./models/medical_fast_text.model')

In [None]:
# Load the training and test data (stored together as a two-element pickle)
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
# The context manager closes the file handle as soon as the object is read.
with open('./data/data_for_model.pickle', 'rb') as pickle_file:
    train_data_final_exported, test_data_final_exported = pickle.load(pickle_file)

In [None]:
# Model inputs: the "X" column of the training frame as an array
X_train = train_data_final_exported['X'].values

In [None]:
# Labels: the "clf" column of the training frame as an array
y_train = train_data_final_exported['clf'].values

In [None]:
# Pad every sequence with the 'nan_word' placeholder up to 11 tokens.
# Sequences that already hold 11 or more tokens are copied unchanged.
X_train_final = []
for x in X_train:
    new_x = x.copy()
    # A non-positive multiplier yields an empty list, so long sequences get no padding.
    new_x.extend(['nan_word'] * (11 - len(new_x)))
    X_train_final.append(new_x)

In [None]:
# Translate the words into vectors using the fasttext model. Padding is encoded using a zero vector.
# The vector size is read from the model instead of being hard-coded, so the padding
# always matches the embedding width.
embedding_dim = fasttext_model.wv.vector_size
# Use a float array for the padding rather than a list of Python ints, so padded rows do
# not mix integer and float entries (assumes the model's vectors are float32 - TODO confirm).
padding_vector = np.zeros(embedding_dim, dtype=np.float32)

X_train_fasttext = []
for x in X_train_final:
    # One (sequence_length, embedding_dim) matrix per padded sequence.
    row = np.array([
        padding_vector if word == 'nan_word' else fasttext_model.wv[word]
        for word in x
    ])
    X_train_fasttext.append(row)

In [None]:
# Combine the per-sequence matrices into one array along a new leading axis
X_train_fasttext = np.stack(X_train_fasttext, axis=0)

In [None]:
# Bundle the encoded inputs with their labels so they can be stored together
train_data = [X_train_fasttext, y_train]

In [None]:
# Persist the encoded training data. The context manager flushes and closes the file,
# so the pickle is complete on disk before it is read back.
# The file name must stay in sync with the path used when loading it.
with open('./data/train_fasttexyt_encoded.pickle', 'wb') as pickle_file:
    pickle.dump(train_data, pickle_file)

## Train an LSTM model

In [None]:
# Reload the encoded training data written by the encoding section.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
# The context manager closes the file handle as soon as the object is read.
with open('./data/train_fasttexyt_encoded.pickle', 'rb') as pickle_file:
    train_data = pickle.load(pickle_file)

In [None]:
# Tensorflow configuration for improved stability:
# enable memory growth on every visible GPU and report how many GPUs were found.

import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized;
        # the error is printed rather than raised so the notebook keeps running.
        print(e)

In [None]:
# Unpack the stored pair into encoded inputs and labels
X_train, y_train = train_data

In [None]:
# Hold out 20% of the training data as the validation cohort (fixed seed for a repeatable split).
# X_train / y_train are overwritten here, so re-running this cell on its own would split
# the already reduced training set again - unpack train_data first in that case.
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [None]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input,Bidirectional,Dropout,Multiply
from tensorflow.keras.layers import LSTM
import tensorflow as tf
# Short alias for the Keras callbacks module (EarlyStopping, ModelCheckpoint, TensorBoard are used from it below)
callbacks = tf.keras.callbacks

In [None]:
# The model tensorflow architecture:
#   input -> bidirectional LSTM (50 units) -> dropout 0.3 -> dense 10 (relu) -> dense 5 (relu) -> dense 1 (sigmoid)

# The input shape is taken from the encoded training array (all axes after the sample axis).
inputA = Input(shape=(X_train.shape[1], X_train.shape[2]))

# Only the final LSTM output is kept (return_sequences=False).
recurrent_layer = LSTM(50, return_sequences=False)
x = Bidirectional(recurrent_layer)(inputA)
x = Dropout(0.3)(x)
x = Dense(10, activation='relu')(x)

prefinal = Dense(5, activation='relu')(x)
# A single sigmoid unit, matching the binary cross-entropy loss used for training.
final = Dense(1, activation='sigmoid')(prefinal)

model = tf.keras.Model(inputs=[inputA], outputs=final)

In [None]:
# Hyperparameter search for best model - tested versus the validation cohort
#
# Every configuration is trained on a freshly initialised network. Re-fitting one shared
# model would start each run from the previous run's weights and make the comparison
# between configurations meaningless.

from itertools import product

from sklearn.metrics import roc_auc_score

# Hyperparameters searched
learning_rates = [0.0001, 0.001, 0.01]
optimizers = ['RMSprop', 'Adam', 'SGD']
weight_initializations = ['random_normal', 'random_uniform', 'glorot_uniform']
batch_sizes = [128, 256, 512]
n_repeats = 3  # every configuration is trained once per seed
epochs = 2000  # upper bound only; EarlyStopping below decides when training ends
checkpoint_path = './models/fast_text_check.hdf5'
best_weights_path = './models/fast_text_best.hdf5'


def _build_candidate(weight_init):
    """Return a new, untrained network with the notebook's architecture.

    The layer stack mirrors the architecture cell (bidirectional LSTM 50 -> dropout 0.3
    -> dense 10 -> dense 5 -> dense 1) and must be kept in sync with it.
    `weight_init` is the name of a Keras initializer and is applied as the kernel
    initializer of the LSTM and of every dense layer.
    """
    inputs = Input(shape=(X_train.shape[1], X_train.shape[2]))
    hidden = Bidirectional(
        LSTM(50, return_sequences=False, kernel_initializer=weight_init)
    )(inputs)
    hidden = Dropout(0.3)(hidden)
    hidden = Dense(10, activation='relu', kernel_initializer=weight_init)(hidden)
    hidden = Dense(5, activation='relu', kernel_initializer=weight_init)(hidden)
    outputs = Dense(1, activation='sigmoid', kernel_initializer=weight_init)(hidden)
    return tf.keras.Model(inputs=[inputs], outputs=outputs)


# Running a grid-search over the parameters and keeping the best model
max_auc = 0
best_config = None
counter = len(learning_rates) * len(optimizers) * len(weight_initializations) * len(batch_sizes) * n_repeats
search_space = product(range(n_repeats), learning_rates, optimizers, weight_initializations, batch_sizes)

for i, lr, optimizer_name, weight_init, batch_size in tqdm(search_space, total=counter):
    # Seed Python, NumPy and TensorFlow: weight initialisation and batch shuffling
    # are not controlled by Python's `random` module alone.
    random.seed(i)
    np.random.seed(i)
    tf.random.set_seed(i)

    candidate = _build_candidate(weight_init)
    opt = getattr(tf.keras.optimizers, optimizer_name)(learning_rate=lr)
    candidate.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

    # Callbacks are created per run so that no best-value / patience state is
    # carried over from the previous configuration.
    early_stopCB = callbacks.EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='min')
    mcp_save = callbacks.ModelCheckpoint(checkpoint_path, save_best_only=True,
                                         monitor='val_loss', mode='min')
    tbCB = callbacks.TensorBoard(log_dir='Graph', histogram_freq=0, write_graph=True, write_images=True)

    hist = candidate.fit(x=X_train, y=y_train,
                         batch_size=batch_size,
                         epochs=epochs,
                         verbose=0,
                         callbacks=[early_stopCB, mcp_save, tbCB],
                         validation_data=(X_val, y_val),  # Keras documents a tuple here
                         shuffle=True)

    # Score the epoch with the lowest validation loss, not the last epoch.
    candidate.load_weights(checkpoint_path)
    # Flatten the (n_samples, 1) prediction column to the 1-D scores the metric expects.
    roc_auc = roc_auc_score(y_val, candidate.predict(X_val).ravel())
    if roc_auc > max_auc:
        max_auc = roc_auc
        best_config = (lr, optimizer_name, weight_init, batch_size)
        print(f"Max AUC: {max_auc} | Learning Rate: {lr} | Optimizer: {optimizer_name} | Weight Initialization: {weight_init} | Batch Size: {batch_size}")
        candidate.save_weights(best_weights_path)

# Leave `model` holding the best weights found, so code that uses `model` after
# the search works with the selected network.
if best_config is not None:
    model.load_weights(best_weights_path)
