# Preparations

In [None]:
import pandas as pd
import re,string
import numpy as np
from tqdm import tqdm
import pickle
import math
import matplotlib.pyplot as plt

In [None]:
from sklearn.metrics import confusion_matrix, roc_curve, auc

In [None]:
import tensorflow as tf
# tf.keras.backend.set_floatx('float16')
# Turn on memory growth for every physical GPU that TensorFlow can see.
# If no GPU is listed, nothing is configured.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        # Best effort: the error is only printed and the notebook continues.
        print(e)

In [None]:
# Load the pickled (train, test) pair used throughout this notebook.
# The context manager closes the file handle once the data is loaded.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open('./data/data_for_model.pickle','rb') as data_file:
    train_data_final_exported,test_data_final_exported= pickle.load(data_file)

# Testing using the fine-tuned AlephBERT model

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
# Checkpoint id of the base AlephBERT model; it is only used to load the tokenizer.
general_model_checkpoint = "onlplab/alephbert-base"

In [None]:
# The tokenizer comes from the base checkpoint, not from the fine-tuned model directory.
# NOTE(review): confirm the fine-tuned model was trained with this same tokenizer.
tokenizer = AutoTokenizer.from_pretrained(general_model_checkpoint)

In [None]:
# Keep only the raw text and the label column for the AlephBERT evaluation.
test=test_data_final_exported[['row_text','clf']].copy()
# Ground-truth labels, in the same row order as `test`.
y_test=test.clf.values

In [None]:
from datasets import Features, Value, ClassLabel,load_dataset,Dataset
# Dataset schema: `row_text` as a string and `clf` as a two-class label.
features_load = Features({'row_text': Value('string'), 'clf': ClassLabel(num_classes=2)})

In [None]:
# Convert the pandas frame to a `datasets.Dataset` with the schema above.
# The pandas index is dropped first -- presumably so it is not carried over as an extra column.
test_df=Dataset.from_pandas(test.reset_index(drop=True),features=features_load)

In [None]:
def tokenize_function(example):
    """Run the module-level ``tokenizer`` on the ``row_text`` entry of ``example``.

    ``example`` is whatever ``Dataset.map`` hands over (a batch when called
    with ``batched=True``). The tokenizer output is returned unchanged; no
    padding or truncation options are passed here.
    """
    texts = example['row_text']
    encoded = tokenizer(texts)
    return encoded

In [None]:
# Tokenize the whole test set; batched=True passes batches of rows to tokenize_function.
tokenized_test = test_df.map(tokenize_function, batched=True)

In [None]:
from transformers import DataCollatorWithPadding
# Collator that pads each batch and returns TensorFlow tensors.
# tokenize_function applies no padding itself, so padding happens here.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [None]:
# Build the batched tf.data pipeline that is passed to model.predict.
tf_test_dataset = tokenized_test.to_tf_dataset(
    columns=["input_ids",'token_type_ids',"attention_mask"],
    label_cols=["clf"],
    # Keep the original row order so predictions line up with y_test.
    shuffle=False,
    collate_fn=data_collator,
    batch_size=32,
)

In [None]:
# Local directory from which the fine-tuned classifier is loaded.
finetuned_model = './models/aleph_bert_finetuned'

In [None]:
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers.schedules import PolynomialDecay
import random
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

# NOTE(review): batch_size and num_epochs are not referenced anywhere else in
# the visible notebook -- they look like leftovers from the training notebook.
batch_size = 32
num_epochs = 5
# Global Keras policy: set before the model is built, and it stays active for
# every layer created afterwards in this session (including the LSTM model below).
tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Load the fine-tuned two-label classifier from the local model directory.
model = TFAutoModelForSequenceClassification.from_pretrained(finetuned_model, num_labels=2)
# NOTE(review): only predict() is called on this model in the visible code, so
# the schedule, optimizer and loss below appear to be needed only for compile().
lr_scheduler = PolynomialDecay(
    initial_learning_rate=1e-5, end_learning_rate=0.0, decay_steps=300
)
opt = Adam(learning_rate=lr_scheduler)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=opt, loss=loss)

In [None]:
# Raw logits for every test row; the column at index 1 is read as the positive class below.
preds = model.predict(tf_test_dataset)["logits"]

In [None]:
def logit_to_prob(logit):
    """Map a logit (log-odds) to a probability with the logistic function.

    The computation is split by sign so that ``math.exp`` is only ever called
    on a non-positive argument. Calling ``math.exp(logit)`` directly raises
    ``OverflowError`` for large positive logits.

    Args:
        logit: A real-valued log-odds score (Python or NumPy scalar).

    Returns:
        The probability ``1 / (1 + exp(-logit))`` as a float in ``[0, 1]``.
    """
    if logit >= 0:
        # exp(-logit) <= 1 here, so it cannot overflow.
        return 1 / (1 + math.exp(-logit))
    # exp(logit) < 1 here; for very negative logits it underflows to 0.0.
    odds = math.exp(logit)
    return odds / (1 + odds)

# The classifier emits two logits and was compiled with softmax cross-entropy,
# so P(class 1) = softmax(logits)[1] = sigmoid(logit_1 - logit_0).
# Applying the sigmoid to logit_1 alone ignores logit_0 and is not that probability.
y_prob_bert=[logit_to_prob(float(row[1])-float(row[0])) for row in preds]

In [None]:
# ROC curve of the AlephBERT scores against the labels; thresholds are discarded.
fpr, tpr, _=roc_curve(y_test,y_prob_bert)
# Area under that ROC curve.
roc_auc=auc(fpr, tpr)

In [None]:
# Plot the AlephBERT ROC curve together with the chance diagonal.
lw = 2
curve_label = 'ROC curve (area = %0.3f)'% roc_auc
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=lw, label=curve_label)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.title('Receiver operating characteristic example')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")
plt.show()

# Testing using the FastText-LSTM model

In [None]:
from gensim.models import FastText
# Load the saved gensim FastText model that provides the word vectors used below.
fasttext_model=FastText.load('./models/medical_fast_text.model')

In [None]:
# Column X holds the per-row word sequences (presumably token lists; they are
# copied and appended to below).
X_test=test_data_final_exported.X.values
# Re-read from the same clf column that was used for the AlephBERT evaluation.
y_test=test_data_final_exported.clf.values

In [None]:
# Right-pad every token list with the 'nan_word' placeholder up to 11 entries.
# Sequences that already hold 11 or more tokens are copied unchanged.
X_test_final=[]
for tokens in X_test:
    padded=tokens.copy()
    padded.extend(['nan_word']*(11-len(padded)))
    X_test_final.append(padded)

In [None]:
# Replace every token with its FastText vector; the padding placeholder
# becomes a 300-long vector of zeros.
X_test_fasttext=[]
for padded_tokens in X_test_final:
    vectors=[
        [0]*300 if token=='nan_word' else fasttext_model.wv[token]
        for token in padded_tokens
    ]
    X_test_fasttext.append(np.array(vectors))
    

In [None]:
# Stack the per-row matrices into one array; np.stack requires all rows to have the same shape.
X_test_fasttext=np.stack(X_test_fasttext)

In [None]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input,Bidirectional,Dropout,Multiply
from tensorflow.keras.layers import LSTM
import tensorflow as tf
# Alias for the Keras callbacks module. NOTE(review): not used in the visible code.
callbacks = tf.keras.callbacks

In [None]:
# The model architecture
# Rebuilt layer by layer so that the saved weights can be loaded into it below.
# NOTE(review): it must match the architecture used when the weights file was saved.

# Input shape per sample is taken from axes 1 and 2 of the stacked test array.
inputA = Input(shape=(X_test_fasttext.shape[1],X_test_fasttext.shape[2],))
# return_sequences=False: only the final output of the bidirectional LSTM is passed on.
x = Bidirectional(LSTM(50, return_sequences=False))(inputA)
x=Dropout(0.3)(x)
x=Dense(10, activation='relu')(x)
prefinal=Dense(5, activation='relu')(x)
# Single sigmoid unit: one score per sample.
final = Dense(1, activation='sigmoid')(prefinal)
# Rebinds `model`, which previously referred to the AlephBERT classifier.
model = tf.keras.Model(inputs=[inputA], outputs=final)

In [None]:
# Compile the FastText-LSTM model and load its trained weights from disk.
# NOTE(review): only predict() is called afterwards in the visible code, so the
# optimizer settings appear to matter only for compile().
opt = tf.keras.optimizers.RMSprop(learning_rate=0.001)
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
model.load_weights('./models/fast_text_best.hdf5')

In [None]:
# Scores from the FastText-LSTM model. The network ends in Dense(1), so this is
# presumably an (n_samples, 1) array rather than a flat vector.
fast_text_results=model.predict(X_test_fasttext)
# ROC curve and its area for the FastText-LSTM scores; thresholds are discarded.
fpr, tpr, _=roc_curve(y_test,fast_text_results)
roc_auc=auc(fpr, tpr)

In [None]:
# Plot the FastText-LSTM ROC curve together with the chance diagonal.
lw = 2
plt.figure()
plt.plot(
    fpr,
    tpr,
    color='darkorange',
    lw=lw,
    label='ROC curve (area = %0.3f)'% roc_auc,
)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

# Joined model

In [None]:
# Ensemble score: the plain average of the two models' positive-class scores.
# The FastText-LSTM output is flattened first, so the result is a 1-D float
# array instead of a list of one-element arrays.
fasttext_prob=np.ravel(fast_text_results).astype(float)
bert_prob=np.asarray(y_prob_bert, dtype=float)
# zip() would silently truncate to the shorter input; fail loudly instead.
if fasttext_prob.shape!=bert_prob.shape:
    raise ValueError(
        'FastText and AlephBERT produced a different number of predictions: '
        '%d vs %d' % (fasttext_prob.size, bert_prob.size)
    )
mean_prob=(fasttext_prob+bert_prob)/2

In [None]:
# Turn the ensemble scores into hard labels: 1 when the score reaches 0.5, otherwise 0.
y_pred_joined=[]
for score in mean_prob:
    y_pred_joined.append(1 if score>=0.5 else 0)

In [None]:
# ROC curve and its area for the averaged (ensemble) scores; thresholds are discarded.
fpr, tpr, _=roc_curve(y_test, mean_prob)
roc_auc=auc(fpr, tpr)

In [None]:
# Plot the ensemble ROC curve together with the chance diagonal.
lw = 2
ensemble_label = 'ROC curve (area = %0.3f)'% roc_auc
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=lw, label=ensemble_label)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('AUC of the ensemble model')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")
plt.show()

# Compare predicted vs. physician-labeled IrAEs

In [None]:
# Reload a fresh copy of the test frame; the train part of the pair is discarded.
# The context manager closes the file handle once the data is loaded.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open('./data/data_for_model.pickle','rb') as data_file:
    _, test_data=pickle.load(data_file)

In [None]:
# Attach the ensemble labels as a new column. This relies on y_pred_joined being
# in the same row order as test_data; both come from the same pickle file.
test_data['predicted']=y_pred_joined

## Predicted results including the use of clustering

In [None]:
# Rows that the prediction models flagged as a positive IrAE mention, reduced to
# the patient, IrAE category, note and entry-date columns.
flagged_rows=test_data.predicted==1
IrAE_predicted=test_data.loc[flagged_rows,['PatNum','itis_category','Note_id','Entry_Date']].copy()

In [None]:
# Remove cases where the same IrAE is mentioned twice in the same note:
# rows are de-duplicated over (PatNum, itis_category, Note_id, Entry_Date).
IrAE_predicted=IrAE_predicted.drop_duplicates()

In [None]:
# A patient counts as having an IrAE only when at least two de-duplicated rows
# mention that condition. Maps each IrAE category to the qualifying patients.
predicted_IrAE_dict={}
for category in IrAE_predicted.itis_category.unique():
    in_category=IrAE_predicted.itis_category==category
    rows_per_patient=IrAE_predicted.loc[in_category].groupby(by=['PatNum']).count().reset_index()
    repeated=rows_per_patient.itis_category>1
    predicted_IrAE_dict[category]=rows_per_patient.loc[repeated,'PatNum'].unique()

## Results according to physician labeling

In [None]:
# Rows that a physician labeled as containing an IrAE, reduced to the patient,
# IrAE category, note and entry-date columns.
labeled_rows=test_data.clf==1
IrAE_true=test_data.loc[labeled_rows,['PatNum','itis_category','Note_id','Entry_Date']].copy()

In [None]:
# Remove cases where the same IrAE is mentioned twice in the same note:
# rows are de-duplicated over (PatNum, itis_category, Note_id, Entry_Date).
IrAE_true=IrAE_true.drop_duplicates()

In [None]:
# A patient counts as having an IrAE only when at least two de-duplicated rows
# mention that condition. Maps each IrAE category to the qualifying patients.
true_IrAE_dict={}
for category in IrAE_true.itis_category.unique():
    category_rows=IrAE_true.loc[IrAE_true.itis_category==category]
    rows_per_patient=category_rows.groupby(by=['PatNum']).count().reset_index()
    qualifying=rows_per_patient.loc[rows_per_patient.itis_category>1,'PatNum']
    true_IrAE_dict[category]=qualifying.unique()

## Sensitivity, specificity, accuracy and F1 score

In [None]:
# Patient-level evaluation: for every physician-labeled IrAE category, compare
# the set of patients labeled as having it with the set predicted to have it.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open('./data/train_test_patients.pickle','rb') as patients_file:
    _,test_patients=pickle.load(patients_file)
all_patients=len(test_patients)


def _ratio(numerator, denominator):
    """Return numerator/denominator, or NaN when the denominator is zero."""
    return numerator/denominator if denominator else float('nan')


results=[]
for IrAE in true_IrAE_dict:
    # Sets give O(1) membership tests; both inputs are already unique patient ids.
    real=set(true_IrAE_dict[IrAE])
    # A labeled category may have no predicted entry at all: treat that as
    # "no patient predicted" instead of raising KeyError.
    predicted=set(predicted_IrAE_dict.get(IrAE, ()))
    TP=len(real & predicted)
    FP=len(predicted)-TP
    FN=len(real)-TP
    TN=all_patients-len(real)-FP
    # Undefined ratios (empty denominator) are reported as NaN, not ZeroDivisionError.
    sensitivity=_ratio(TP, TP+FN)
    specificity=_ratio(TN, TN+FP)
    # 2*TP/(2*TP+FP+FN) equals 2*P*R/(P+R) wherever the latter is defined, and
    # is 0 (not an error) when there are no true positives.
    f1=_ratio(2*TP, 2*TP+FP+FN)
    accuracy=_ratio(TN+TP, all_patients)
    results.append([IrAE,sensitivity, specificity,f1,accuracy])

results=pd.DataFrame(results, columns=['IrAE','sensitivity', 'specificity','f1','accuracy'])

In [None]:
# Show the per-IrAE patient-level metrics table.
print(results)