In [None]:
import pandas as pd
import numpy as np
import random
import pickle
from tqdm import tqdm
from thefuzz import process,fuzz

In [None]:
# Create the data and models folders if they do not exist.
import os

data_dir = './data'
models_dir = './models'

# exist_ok=True makes the creation idempotent and avoids the check-then-create
# race of testing os.path.exists() before calling os.makedirs().
os.makedirs(data_dir, exist_ok=True)
os.makedirs(models_dir, exist_ok=True)

# load immunotherapy data and split into train and test cohorts

In [None]:
# Load the immunotherapy CSV. low_memory=False makes pandas parse the file in one
# pass instead of in chunks, which avoids mixed-dtype inference per column.
all_immunotherapy = pd.read_csv(
    './data/all_immunotherapy.csv',
    sep=',',
    low_memory=False,
)

In [None]:
# Lower-case the note text; missing values stay NaN.
all_immunotherapy['Description_Text'] = all_immunotherapy['Description_Text'].str.lower()

In [None]:
# Keep only the rows that have note text.
# NOTE(review): this step was originally marked "to delete" - confirm whether it should stay.
all_immunotherapy = all_immunotherapy.loc[all_immunotherapy['Description_Text'].notna()]

In [None]:
# Distinct patient identifiers, in order of first appearance in the table.
all_patients = all_immunotherapy.PatNum.unique()

In [None]:
# Split the patients (not the individual notes) into test and train cohorts:
# 20% of the patients are sampled for test, the rest are used for training.
random.seed(42)  # fixed seed: reproducible sampling for a given patient list
test_patients = random.sample(list(all_patients), k=int(len(all_patients) * 0.2))

# Set membership keeps this filter linear in the number of patients; the order of
# `all_patients` is preserved in `train_patients`.
test_patient_set = set(test_patients)
train_patients = [x for x in all_patients if x not in test_patient_set]

# Persist the split; the context manager closes the file handle after the dump.
with open('./data/train_test_patients.pickle', 'wb') as split_file:
    pickle.dump((train_patients, test_patients), split_file)

In [None]:
# Select the notes of each cohort by patient id; .copy() gives each cohort its own
# independent frame.
train_text = all_immunotherapy[all_immunotherapy['PatNum'].isin(train_patients)].copy()
test_text = all_immunotherapy[all_immunotherapy['PatNum'].isin(test_patients)].copy()

# Detect all potential IrAE mentions in the text

In [None]:
# Get all words in the database
import re,string

all_words = set()
# Character class matching any single ASCII punctuation character
regex = re.compile('[%s]' % re.escape(string.punctuation))

# itertuples() has no length, so pass total= to let tqdm show real progress
for row in tqdm(all_immunotherapy.itertuples(), total=len(all_immunotherapy)):
    text = row.Description_Text
    if not isinstance(text, str):
        # Non-text entries (e.g. NaN) cannot be tokenised: report and skip them
        # explicitly instead of silently re-using the previous row's text.
        print(text)
        continue
    # Strip punctuation, then split on whitespace to get the words of this note
    all_words.update(regex.sub('', text).split())

In [None]:
# Extract all the words in the corpus that are similar to IrAE terms in Hebrew and English, providing high sensitivity.
# The fuzzy matching also allows for the misspellings that are common in EHRs.

# Search term (Hebrew or English spelling) -> canonical IrAE category
itis_words={'קוליטיס':'colitis', 'colitis':'colitis','דרמטיטיס':'dermatitis', 'dermatitis':'dermatitis', 
           'הפטיטיס':'hepatitis','hepatitis':'hepatitis', 'מיאסטניה':'myasthenia','myasthenia':'myasthenia',
           'מיוקרדיטיס':'myocarditis','myocarditis':'myocarditis','פנאומוניטיס':'pneumonitis','pneumonitis':'pneumonitis',
           'תירואידיטיס':'thyroiditis','thyroiditis':'thyroiditis'}

from collections import defaultdict

MAX_CANDIDATES = 500   # number of best-scoring candidates requested per search term
MIN_SIMILARITY = 80    # minimal fuzzy score for a candidate to be kept

# Category -> list of corpus words that resemble any of its search terms
similar_words_dict = defaultdict(list)

# Iterating a set of strings has no stable order across interpreter runs (string
# hash randomization), which would make ties at the candidate limit and the order
# of the results vary between runs. Sorting once gives a deterministic choice list.
sorted_words = sorted(all_words)

for key, category in itis_words.items():
    similar_words = process.extract(key, sorted_words, limit=MAX_CANDIDATES, scorer=fuzz.token_set_ratio)
    # Each result starts with (matched word, score); keep the sufficiently similar ones
    similar_words_dict[category] += [res[0] for res in similar_words if res[1] >= MIN_SIMILARITY]

# Extract all lines with IrAE mentions in the train and test cohorts

In [None]:
import re
import string

def get_row_for_word(search_words, df, itis_category, n_words=5):
    """Collect every occurrence of the search words in the notes of ``df``.

    Punctuation is removed from each ``Description_Text`` and the text is split
    on whitespace; every token that is contained in ``search_words`` yields one
    output row.

    Parameters
    ----------
    search_words : iterable of str
        Words to look for (exact token match).
    df : pandas.DataFrame
        Must provide the columns ``PatNum``, ``Entry_Date``, ``Description_Text``
        and ``Note_id``.
    itis_category : str
        Category name stored unchanged in the ``itis_category`` column.
    n_words : int, default 5
        Number of tokens kept on each side of a hit for ``X`` / ``row_text``;
        ``row_context`` uses twice that many.

    Returns
    -------
    pandas.DataFrame
        One row per hit with the columns ``PatNum``, ``Entry_Date``,
        ``Description_Text`` (original text), ``Note_id``, ``X`` (list of window
        tokens), ``row_text`` (``X`` joined by spaces), ``row_context`` (wider
        window joined by spaces), ``itis_category`` and ``word`` (matched token).
        Empty (but with these columns) when nothing matches.
    """
    columns = ['PatNum', 'Entry_Date', 'Description_Text','Note_id','X', 'row_text', 'row_context', 'itis_category','word']

    # Build a hash-based lookup once: testing every token against a list is linear
    # in the list length, and a one-shot iterable would be exhausted by the first
    # test. Strings and set/dict inputs are kept as-is so `in` keeps its meaning.
    if isinstance(search_words, (str, set, frozenset, dict)):
        lookup = search_words
    else:
        lookup = frozenset(search_words)

    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    result = []

    for row in df.itertuples():
        tokens = punctuation.sub('', row.Description_Text).split()
        n_tokens = len(tokens)

        for i, word in enumerate(tokens):
            if word not in lookup:
                continue
            # Immediate window: n_words tokens on each side, clipped to the note
            X = tokens[max(0, i - n_words):min(i + n_words + 1, n_tokens)]
            row_text = " ".join(X)
            # Wider window (twice as many tokens per side) for additional context
            row_context = ' '.join(tokens[max(0, i - n_words * 2):min(i + n_words * 2 + 1, n_tokens)])

            result.append([row.PatNum, row.Entry_Date, row.Description_Text, row.Note_id, X, row_text, row_context, itis_category, word])

    return pd.DataFrame(result, columns=columns)


## Train data

In [None]:
# Collect the mentions found in the train notes, one frame per IrAE category, and stack them
train_data_final = pd.concat([
    get_row_for_word(words, train_text, category, n_words=5)
    for category, words in tqdm(similar_words_dict.items())
])

In [None]:
# Remove exact duplicates of the immediate text for the IrAE for the same patient, keeping the first mention
train_data_final = (
    train_data_final
    .sort_values(by=['PatNum', 'Entry_Date'], ascending=True)
    .drop_duplicates(subset=['PatNum', 'row_text'], keep='first')
    .copy()
)

## Test data

In [None]:
# Collect the mentions found in the test notes, one frame per IrAE category, and stack them
test_data_final = pd.concat([
    get_row_for_word(words, test_text, category, n_words=5)
    for category, words in tqdm(similar_words_dict.items())
])

In [None]:
# Remove exact duplicates of the immediate text for the IrAE for the same patient, keeping the first mention
test_data_final = (
    test_data_final
    .sort_values(by=['PatNum', 'Entry_Date'], ascending=True)
    .drop_duplicates(subset=['PatNum', 'row_text'], keep='first')
    .copy()
)

# Labeling

In [None]:
import random
random.seed(42)

train_len = len(train_data_final)
test_len = len(test_data_final)
all_len = train_len + test_len

# Create a shuffled index that will be used later to merge the labeled notes back
indices = list(range(all_len))
random.shuffle(indices)

# The ids are a random permutation of 0..all_len-1: the first train_len ids go to
# the train rows and the remaining ones to the test rows
train_data_final['label_id'] = indices[:train_len]
test_data_final['label_id'] = indices[train_len:]


In [None]:
# Join the train and test data for labeling, ordered by the shuffled label_id
to_label = pd.concat(
    [frame[['row_text', 'row_context', 'label_id']] for frame in (train_data_final, test_data_final)]
).sort_values(by=['label_id'], ascending=True)
# The file for labeling
to_label.to_csv('./data/data_for_labeling.csv')

# Load labeled data and prepare the final data files

In [None]:
# Load the final labeled data; the first CSV column is used as the index
labeled = pd.read_csv(
    './data/labeled_data.csv',
    index_col=0,
    sep=',',
)

In [None]:
# Attach the `clf` column of the labeled file to the train mentions via label_id;
# the left join keeps every train row even if no label is found for it
train_data_final_exported = pd.merge(
    train_data_final[['PatNum', 'Entry_Date', 'Note_id', 'row_text', 'X', 'itis_category', 'label_id', 'word']],
    labeled[['label_id', 'clf']],
    how='left',
    on=['label_id'],
)

In [None]:
# Attach the `clf` column of the labeled file to the test mentions via label_id;
# the left join keeps every test row even if no label is found for it
test_data_final_exported = pd.merge(
    test_data_final[['PatNum', 'Entry_Date', 'Note_id', 'row_text', 'X', 'itis_category', 'label_id', 'word']],
    labeled[['label_id', 'clf']],
    how='left',
    on=['label_id'],
)

In [None]:
# The final labeled data: a (train, test) tuple of the exported frames
data_for_model = (train_data_final_exported, test_data_final_exported)
import pickle
# The context manager closes the file handle even if pickling fails part-way,
# instead of leaving an open handle behind.
with open('./data/data_for_model.pickle', 'wb') as model_data_file:
    pickle.dump(data_for_model, model_data_file)