In [None]:
import pandas as pd
import numpy as np
import pickle

Prepare the information on the previously examined patients, in order to request additional hospital data covering the period after the initial follow-up.  

In [None]:
# Load the patient/drug table; 'min_date' and 'max_added' are parsed as datetimes
# and the first CSV column becomes the index.
id_to_drug = pd.read_csv(
    r'./data/Id_to_drug.csv',
    index_col=0,
    parse_dates=['min_date', 'max_added'],
)

In [None]:
# Independent copy of the PatNum and max_added columns.
future_dates = id_to_drug.loc[:, ['PatNum', 'max_added']].copy()

In [None]:
# Every patient gets the same cutoff date (kept as a plain string).
future_dates = future_dates.assign(cutoff_date='2022-10-31')

In [None]:
# Write the three request columns (plus the index) to a CSV in the working directory.
future_dates.loc[:, ['PatNum', 'max_added', 'cutoff_date']].to_csv(
    'to_extract_additional_notes.csv'
)

In [None]:
# This file was sent to the hospital to retrieve all notes written for the relevant patients between max_added and cutoff_date.
# The file returned by the hospital is named after_treatment_data.csv.

# Get notes potentially containing IrAEs

In [None]:
# Load the notes returned by the hospital; low_memory=False reads the file in one
# pass instead of in chunks.
after_treatment_data = pd.read_csv(
    './data/after_treatment_data.csv',
    index_col=0,
    low_memory=False,
)

In [None]:
# Keep only the notes that actually have text.
after_treatment_data = after_treatment_data[after_treatment_data['Description_Text'].notna()]

In [None]:
def get_words(text_file):
    """Find vocabulary words that fuzzily match irAE terms in Hebrew or English.

    Parameters
    ----------
    text_file : pandas.DataFrame
        Notes table with a ``Description_Text`` column. Rows whose text is not
        a string are skipped.

    Returns
    -------
    collections.defaultdict(list)
        Maps each English category name (e.g. ``'colitis'``) to the list of
        distinct words from the notes whose ``fuzz.token_set_ratio`` against
        the Hebrew or English spelling of that category is at least 80.
    """
    import re, string
    from collections import defaultdict
    from thefuzz import process, fuzz

    # Get all words in the database (punctuation stripped, whitespace-split).
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    all_words = set()
    for row in text_file.itertuples():
        text = row.Description_Text
        # Skip non-text values explicitly. The previous bare `except: pass`
        # either raised on an unbound variable (first row) or silently re-used
        # the previous row's text.
        if not isinstance(text, str):
            continue
        all_words.update(punctuation.sub('', text).split())

    # Set iteration order changes between interpreter runs; a sorted list keeps
    # the `limit=500` truncation reproducible when scores are tied.
    vocabulary = sorted(all_words)

    # Get similar words to IrAEs in Hebrew and English
    itis_words={'קוליטיס':'colitis', 'colitis':'colitis','דרמטיטיס':'dermatitis', 'dermatitis':'dermatitis', 
           'הפטיטיס':'hepatitis','hepatitis':'hepatitis', 'מיאסטניה':'myasthenia','myasthenia':'myasthenia',
           'מיוקרדיטיס':'myocarditis','myocarditis':'myocarditis','פנאומוניטיס':'pneumonitis','pneumonitis':'pneumonitis',
           'תירואידיטיס':'thyroiditis','thyroiditis':'thyroiditis'}
    similar_words_dict = defaultdict(list)
    for key, category in itis_words.items():
        candidates = process.extract(key, vocabulary, limit=500, scorer=fuzz.token_set_ratio)
        bucket = similar_words_dict[category]  # creates the key even when nothing matches
        for match in candidates:
            word, score = match[0], match[1]
            # The Hebrew and English spellings share one category; avoid duplicates.
            if score >= 80 and word not in bucket:
                bucket.append(word)
    return similar_words_dict

In [None]:
def get_row_for_word(search_words, df,itis_category, n_words=5):
    """Extract a context window around every occurrence of the search words.

    Parameters
    ----------
    search_words : iterable of str
        Words to look for (exact match after punctuation removal).
    df : pandas.DataFrame
        Must provide ``PatNum``, ``Entry_Date`` and ``Description_Text``.
        Rows whose ``Description_Text`` is not a string are skipped.
    itis_category : str
        Category label copied into every output row.
    n_words : int, default 5
        Number of words kept on each side of the hit for ``X`` / ``row_text``;
        ``row_context`` uses twice that many.

    Returns
    -------
    pandas.DataFrame
        One row per hit with columns ``PatNum``, ``Entry_Date``,
        ``Description_Text``, ``X`` (list of window words), ``row_text``
        (window joined by spaces), ``row_context``, ``itis_category``, ``word``.
    """
    import re,string
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    # Set membership keeps the inner loop O(1) per word instead of scanning a list.
    targets = set(search_words)
    columns = ['PatNum', 'Entry_Date', 'Description_Text','X', 'row_text', 'row_context', 'itis_category','word']

    hits = []
    for row in df.itertuples():
        text = row.Description_Text
        # Same tolerance as get_words: ignore values that are not text.
        if not isinstance(text, str):
            continue
        words = punctuation.sub('', text).split()
        n_total = len(words)
        for i, word in enumerate(words):
            if word not in targets:
                continue
            # Narrow window: the hit plus n_words on each side, clipped to the note.
            X = words[max(0, i - n_words):min(i + n_words + 1, n_total)]
            row_text = " ".join(X)
            # Wide window: twice as many words on each side.
            row_context = ' '.join(words[max(0, i - n_words * 2):min(i + n_words * 2 + 1, n_total)])
            hits.append([row.PatNum, row.Entry_Date, row.Description_Text, X, row_text, row_context, itis_category, word])

    return pd.DataFrame(hits, columns=columns)

# Functions for running the prediction using both the fine-tuned AlephBERT model and the FastText-based model

In [None]:
def run_the_joined_prediction(test_data):
    """Classify candidate phrases with the AlephBERT + FastText/LSTM ensemble.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Needs a ``row_text`` column (phrase as a string, fed to AlephBERT) and
        an ``X`` column (list of the phrase's words, fed to FastText).

    Returns
    -------
    list of int
        One label per input row: 1 when the mean of the two model
        probabilities is >= 0.5, otherwise 0. An empty input yields ``[]``.

    Notes
    -----
    Loads models from ``./models/`` and sets the global Keras mixed-precision
    policy to ``'mixed_float16'``; that policy stays active after the call.
    """
    # Nothing to classify: return before the heavy imports and model loading
    # (np.stack below would otherwise fail on an empty list).
    if len(test_data) == 0:
        return []

    import math
    import tensorflow as tf
    from datasets import Dataset, Features, Value
    from gensim.models import FastText
    from tensorflow.keras.layers import Bidirectional, Dense, Input, LSTM
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.optimizers.schedules import PolynomialDecay
    from transformers import AutoTokenizer, DataCollatorWithPadding, TFAutoModelForSequenceClassification

    batch_size = 32
    window_len = 11      # words per phrase fed to the LSTM (2 * 5 context words + the hit)
    embedding_dim = 300  # length of the zero vectors used to pad short phrases

    #   Helper functions
    def logit_to_prob(logit):
        # Numerically stable sigmoid: never exponentiates a large positive number.
        if logit >= 0:
            return 1.0 / (1.0 + math.exp(-logit))
        odds = math.exp(logit)
        return odds / (1.0 + odds)

    def tokenize_function(example):
        return tokenizer(example['row_text'])

    #  AlephBERT model
    finetuned_model = './models/aleph_bert_finetuned'
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
        except RuntimeError as e:
            # Report the GPU configuration problem and carry on.
            print(e)
    general_model_checkpoint = "onlplab/alephbert-base"
    tokenizer = AutoTokenizer.from_pretrained(general_model_checkpoint)
    features_load = Features({'row_text': Value('string')})
    test_df = Dataset.from_pandas(test_data[['row_text']].reset_index(drop=True), features=features_load)
    tokenized_test = test_df.map(tokenize_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
    tf_test_dataset = tokenized_test.to_tf_dataset(
        columns=["input_ids", 'token_type_ids', "attention_mask"],
        shuffle=False,  # keep predictions aligned with the input rows
        collate_fn=data_collator,
        batch_size=batch_size,
    )

    tf.keras.mixed_precision.set_global_policy('mixed_float16')
    bert_model = TFAutoModelForSequenceClassification.from_pretrained(finetuned_model, num_labels=2)
    # compile() is retained from the original pipeline; only predict() is called here.
    lr_scheduler = PolynomialDecay(
        initial_learning_rate=1e-5, end_learning_rate=0.0, decay_steps=300
    )
    bert_model.compile(
        optimizer=Adam(learning_rate=lr_scheduler),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    )
    preds = bert_model.predict(tf_test_dataset)["logits"]
    # NOTE(review): the head has two logits, so sigmoid(logit_1) is not the softmax
    # class probability; kept as-is - confirm it matches how the 0.5 threshold was validated.
    y_prob_bert = [logit_to_prob(x) for x in preds[:, 1]]

    ##  FastText model
    fasttext_model = FastText.load('./models/medical_fast_text.model')
    X_test_fasttext = []
    for phrase_words in test_data['X']:
        row = [fasttext_model.wv[word] for word in phrase_words]
        # Right-pad short phrases with zero vectors up to the fixed window length.
        row.extend([[0] * embedding_dim] * (window_len - len(row)))
        X_test_fasttext.append(row)
    X_test_fasttext = np.stack(X_test_fasttext)

    # The model LSTM architecture
    inputA = Input(shape=(X_test_fasttext.shape[1], X_test_fasttext.shape[2],))
    x = Bidirectional(LSTM(50, return_sequences=False))(inputA)
    x = Dense(10, activation='relu')(x)
    prefinal = Dense(5, activation='relu')(x)
    final = Dense(1, activation='sigmoid')(prefinal)
    lstm_model = tf.keras.Model(inputs=[inputA], outputs=final)
    lstm_model.compile(
        optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    lstm_model.load_weights('./models/fast_text_best.hdf5')
    fast_text_results = lstm_model.predict(X_test_fasttext)

    # predict() returns shape (N, 1); flatten to N float64 scalars so the average
    # does not depend on NumPy's array/scalar promotion rules.
    fast_text_probs = np.asarray(fast_text_results, dtype=float).reshape(-1)
    mean_prob = [(ft_prob + bert_prob) / 2 for ft_prob, bert_prob in zip(fast_text_probs, y_prob_bert)]
    y_pred_joined = [1 if prob >= 0.5 else 0 for prob in mean_prob]
    return y_pred_joined

# Run the whole process and time it

In [None]:
%%time
# Step 1 - vocabulary words that resemble each irAE term.
similar_words_dict = get_words(after_treatment_data)

# Step 2 - extract the phrases around those words, one parallel job per irAE category.
from joblib import Parallel, delayed


def process_key(key):
    """Return the context windows found for one irAE category."""
    return get_row_for_word(similar_words_dict[key], after_treatment_data, key, n_words=5)


final_rows = pd.concat(
    Parallel(n_jobs=-1)(delayed(process_key)(category) for category in similar_words_dict)
)

# Step 3 - classify every extracted phrase with the joined model.
final_data_for_prediction = final_rows.loc[
    :, ['PatNum', 'Entry_Date', 'row_text', 'X', 'itis_category']
].copy()
results = run_the_joined_prediction(final_data_for_prediction)
final_data_for_prediction['clf'] = results

# Step 4 - an irAE counts as positive for a patient only when it is predicted
# positive more than once in that patient's EHR.
final_data_for_prediction = pd.merge(
    final_data_for_prediction,
    id_to_drug[['PatNum', 'drug']],
    on='PatNum',
    how='left',
)
final_data_for_prediction_pos = final_data_for_prediction[
    final_data_for_prediction['clf'] == 1
].copy()
grouped_pos = (
    final_data_for_prediction_pos
    .groupby(['drug', 'itis_category', 'PatNum'])['Entry_Date']
    .count()
    .reset_index(name='DateCount')
)
grouped_pos = grouped_pos[grouped_pos['DateCount'] > 1].copy()

# Number of positive patients per drug and irAE category.
print(grouped_pos.groupby(['drug', 'itis_category'])['PatNum'].count())