In [1]:
import pandas as pd
from datetime import datetime
from dateutil import relativedelta
import random
import pickle
from tqdm import tqdm_notebook  

In [2]:
event_types={
    "px_medications_injections_infusions_and_other_forms_":0,
    "dx_brnch_lng_ca":1,
    "px_pathology":2,
    "px_therapeutic_radiology":3,
    "px_laboratory_chemistry_and_hematology":4,
    "px_ct_scan_abdomen":5,
    "dx_ot_perint_dx":6,
    "px_other_diagnostic_procedures_on_lung_and_bronchus":7,
    "dx_2ndary_malig":8,
    "px_ct_scan_chest":9,
    "dx_anemia":10,
    "px_radioisotope_scan_and_function_studies":11,
    "px_other_ct_scan":12,
    "px_magnetic_resonance_imaging":13,
    "px_diagnostic_bronchoscopy_and_biopsy_of_bronchus":14,
    "px_lobectomy_or_pneumonectomy":15,
    "px_dme_and_supplies":16,
    "rx_antineoplastic_agents":17,
    "dx_fx_arm":18,
    "px_incision_of_pleura_thoracentesis_chest_drainage":19    
}

In [3]:
from datetime import date

def timediff(date1,date2):
    end_date = (date2)
    start_date = (date1)
    num_days = (end_date-start_date).days 
    if (num_days==0):
        return None
    return num_days/30.5

In [4]:
def get_all_patients(df_name):
    unique_patients=set()
    df1=pd.read_csv(df_name)
    for i in range(len(df1)):
        unique_patients.add(df1.loc[i,"person_id"])
    return unique_patients

In [5]:
# dates = df[df['person_id']=='a']['event_date'].tolist()
# dates.sort(key=lambda date: datetime.strptime(date, "%d/%m/%Y"))

In [6]:
def get_data(df,patient_id):
    
    patient_df = df[df['person_id']==patient_id]

    patient_df['event_date'] = pd.to_datetime(patient_df['event_date'])

    patient_df = patient_df.sort_values(by='event_date')

    start_date = patient_df['event_date'].min()

    patient_df['last_event_dt'] = patient_df['event_date'].shift(1).fillna(method='bfill')

    patient_df[['event_date','last_event_dt']]

    patient_df['time_since_last_event'] = patient_df.apply(lambda x: timediff(x['last_event_dt'],x['event_date']),axis=1)

    patient_df[['event_date','last_event_dt','time_since_last_event']]

    patient_df['time_since_start'] = patient_df.apply(lambda x: timediff(start_date,x['event_date']),axis=1)
    
    patient_df['type_event'] = patient_df['event_label'].apply(lambda x: event_types[x])

    patient_df.drop_duplicates(subset =["time_since_start","event_label"], inplace = True) 

    patient_df['idx_event'] = [i for i in range(1,patient_df.shape[0]+1)]

    patient_df = patient_df[['person_id','idx_event','type_event','time_since_start','time_since_last_event']].rename(columns={'person_id':'patient_id'})

    patient_df['time_since_start'].fillna(0,inplace=True)

    patient_df['time_since_last_event'].fillna(0,inplace=True)

    patient_df = patient_df.dropna(subset=['time_since_last_event'])

    patient_df["same_day_event_rank"] = patient_df.groupby('time_since_start')['idx_event'].cumcount()

    patient_df['time_since_start_adj'] = patient_df.apply(lambda x: x['time_since_start']+(0.0001*x['same_day_event_rank']),axis=1)

    patient_df['time_since_last_event_adj'] = patient_df.apply(lambda x: x['time_since_last_event']+(0.0001*x['same_day_event_rank']),axis=1)
    
    patient_df['time_since_start']=patient_df['time_since_start_adj']
    
    patient_df['time_since_last_event']=patient_df['time_since_last_event_adj']
    
#     dropped additional columns created after copying time
    patient_df.drop(["same_day_event_rank","time_since_start_adj","time_since_last_event_adj"],axis='columns', inplace=True)

    patient_df.reset_index(drop=True , inplace=True)
    
   
#     patient_df['time_since_last_event'].fillna(method='ffill',inplace=True)
#     patient_df['time_since_last_event'].fillna(0,inplace=True)
    l = list(patient_df.T.to_dict().values())
    return l

In [7]:
#     patient_id="abcd"
    
#     patient_df = df[df['person_id']==patient_id]

#     patient_df['event_date'] = pd.to_datetime(patient_df['event_date'])

#     patient_df = patient_df.sort_values(by='event_date')

#     start_date = patient_df['event_date'].min()

#     patient_df['last_event_dt'] = patient_df['event_date'].shift(1).fillna(method='bfill')

#     patient_df[['event_date','last_event_dt']]

#     patient_df['time_since_last_event'] = patient_df.apply(lambda x: timediff(x['last_event_dt'],x['event_date']),axis=1)

#     patient_df[['event_date','last_event_dt','time_since_last_event']]

#     patient_df['time_since_start'] = patient_df.apply(lambda x: timediff(start_date,x['event_date']),axis=1)
    
#     patient_df['type_event'] = patient_df['event_label'].apply(lambda x: event_types[x])

#     patient_df.drop_duplicates(subset =["time_since_start","event_label"], inplace = True) 

#     patient_df['idx_event'] = [i for i in range(1,patient_df.shape[0]+1)]

#     patient_df = patient_df[['person_id','idx_event','type_event','time_since_start','time_since_last_event']].rename(columns={'person_id':'patient_id'})

#     patient_df['time_since_start'].fillna(0,inplace=True)

#     patient_df['time_since_last_event'].fillna(0,inplace=True)

#     patient_df = patient_df.dropna(subset=['time_since_last_event'])

#     patient_df["same_day_event_rank"] = patient_df.groupby('time_since_start')['idx_event'].cumcount()

#     patient_df['time_since_start_adj'] = patient_df.apply(lambda x: x['time_since_start']+(0.01*x['same_day_event_rank']),axis=1)

#     patient_df['time_since_last_event_adj'] = patient_df.apply(lambda x: x['time_since_last_event']+(0.01*x['same_day_event_rank']),axis=1)
    
#     patient_df['time_since_start']=patient_df['time_since_start_adj']
    
#     patient_df['time_since_last_event']=patient_df['time_since_last_event_adj']
    
# #     dropped additional columns created after copying time
#     patient_df.drop(["same_day_event_rank","time_since_start_adj","time_since_last_event_adj"],axis='columns', inplace=True)

#     patient_df.reset_index(drop=True , inplace=True)
    
#     display(patient_df)


In [8]:
def find_key(dict,val):
    for key,value in dict.items():
        if(value==val):
            return key

In [9]:
def remove_data(df,patient_list,impute_rate,impute_value):
    s="event_label=={}".format(impute_value)
    df=df.drop(df[df["event_label"]==impute_value].sample(frac=impute_rate,random_state=1).index).reset_index(drop=True)
    obs_list=[]
    for x in patient_list:
        obs_list.append(get_data(df,x))
    return obs_list

In [10]:
#driver
df_name = "data/test/sample_data.csv"
train_patients=[]
test_patients=[]
dev_patients=[]
patients=get_all_patients(df_name)
patients=list(patients)
train_patients_size=0.7*len(patients)
test_patients_size=0.2*len(patients)
dev_patients_size=0.1*len(patients)
count=0

while(count<train_patients_size):
    train_patients.append(patients[count])
    count+=1
while(count<train_patients_size+test_patients_size):
    test_patients.append(patients[count])
    count+=1
while(count<len(patients)):
      dev_patients.append(patients[count])
      count+=1
#train data is ready
df=pd.read_csv(df_name)

train_seqs=[]
test_seqs=[]
dev_seqs=[]
      
for x in tqdm_notebook(train_patients):
    temp = get_data(df,x)
    if temp:
        train_seqs.append(temp)
for x in tqdm_notebook(test_patients):
    temp = get_data(df,x)
    if temp:
        test_seqs.append(temp)
for x in tqdm_notebook(dev_patients):
    temp = get_data(df,x)
    if temp:
        dev_seqs.append(temp)
        
# print(train_seqs)

key=find_key(event_types,8)

train_obs=remove_data(df,train_patients,1,key)
test_obs=remove_data(df,test_patients,1,key)
dev_obs=remove_data(df,dev_patients,1,key)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=70.0), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """





Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [11]:
def store_as_pickle(seqs,seqs_obs,total_num,file_name):
    if len(seqs) == len(seqs_obs):
        final_data = {
            'seqs': seqs,
            'seqs_obs': seqs_obs,
            'total_num':total_num
        }
        f = open("data/test/{}".format(file_name),'wb') 
        # source, destination 
        pickle.dump(final_data, f)                      
        f.close() 
    else:
        print("shape does not match")

In [12]:
store_as_pickle(train_seqs,train_obs,20,"train.pkl")
store_as_pickle(test_seqs,test_obs,20,"test.pkl")
store_as_pickle(dev_seqs,dev_obs,20,"dev.pkl")

## QC Steps

In [13]:
print(set(train_patients).intersection(set(test_patients)))
print(set(train_patients).intersection(set(dev_patients)))
print(set(test_patients).intersection(set(dev_patients)))

set()
set()
set()


In [14]:
## function to convert pkl to df
def convert_pkl_to_df(pkl):
    df = pd.DataFrame()
    for i in tqdm_notebook(range(0,len(pkl))):
        temp_df = (pd.DataFrame(pkl[i]))
        temp_df['seq_id'] = i
        df = pd.concat([df,temp_df])
    return df.reset_index(drop=True)

In [15]:
print(len(train_seqs))
print(len(test_seqs))
print(len(dev_seqs))

70
20
10


Further QC checks can be found in Exploring_2.ipynb