In [37]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt


# hide warnings
# NOTE(review): a blanket 'ignore' filter suppresses every warning category, so any
# pandas FutureWarning / SettingWithCopyWarning raised by later cells will not be shown
import warnings
warnings.filterwarnings('ignore')

# update view options: show up to 100 columns and 100 rows when a frame is displayed
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [38]:
# load the table of medication doses taken over the 24 week period
m = pd.read_csv(
    filepath_or_buffer='/Users/danherman/Desktop/oud_treatment_outcome/data/raw_data_files/T_FRDOS.csv'
)

# sanity check: dimensions first, then the leading five rows
display(m.shape)
m.head(5)

(160908, 19)

Unnamed: 0,PATIENTNUMBER,SITE,VISIT,PATIENTID,VISITID,DOS002,DOS002_UNIT,DOS002_NORM,DOS005,DOS005_UNIT,DOS005_NORM,DOS006,DOS006_UNIT,DOS006_NORM,VISITDT,DOS001,DOS001_DT,VISITDT_Dt,patdeid
0,,,WK0,,15034,2.0,,2.0,8.0,,8.0,1.0,,1.0,,,.,0.0,1
1,,,WK1,,15037,2.0,,2.0,16.0,,16.0,1.0,,1.0,,,.,6.0,1
2,,,WK1,,15037,2.0,,2.0,24.0,,24.0,1.0,,1.0,,,.,6.0,1
3,,,WK1,,15037,2.0,,2.0,24.0,,24.0,1.0,,1.0,,,.,6.0,1
4,,,WK1,,15037,2.0,,2.0,32.0,,32.0,1.0,,1.0,,,.,6.0,1


In [39]:
# retrieve column names (displays the column Index of m)
m.columns

Index(['PATIENTNUMBER', 'SITE', 'VISIT', 'PATIENTID', 'VISITID', 'DOS002',
       'DOS002_UNIT', 'DOS002_NORM', 'DOS005', 'DOS005_UNIT', 'DOS005_NORM',
       'DOS006', 'DOS006_UNIT', 'DOS006_NORM', 'VISITDT', 'DOS001',
       'DOS001_DT', 'VISITDT_Dt', 'patdeid'],
      dtype='object')

In [40]:
# discard every column that is not used in this analysis;
# what remains is VISIT, DOS002, DOS005, DOS006 and patdeid
m = m.drop(
    columns=[
        'PATIENTNUMBER', 'DOS001', 'SITE', 'PATIENTID', 'VISITID',
        'DOS002_UNIT', 'DOS002_NORM',
        'DOS005_UNIT', 'DOS005_NORM',
        'DOS006_UNIT', 'DOS006_NORM',
        'VISITDT', 'DOS001_DT', 'VISITDT_Dt',
    ]
)

# check column names
m.head(1)

Unnamed: 0,VISIT,DOS002,DOS005,DOS006,patdeid
0,WK0,2.0,8.0,1.0,1


In [41]:
# mapping from the coded column names to readable names (per the documentation)
new_columns = dict(
    DOS002='medication',
    DOS005='total_dose',
    DOS006='admin_location',
)


In [42]:
# apply the readable column names defined in new_columns
m = m.rename(new_columns, axis='columns')

# check column names
m.head(1)

Unnamed: 0,VISIT,medication,total_dose,admin_location,patdeid
0,WK0,2.0,8.0,1.0,1


In [43]:
# put the patient id and visit first so each row reads left to right
m = m.reindex(
    ['patdeid', 'VISIT', 'medication', 'total_dose', 'admin_location'],
    axis='columns',
)

# check column names
m.head(1)

Unnamed: 0,patdeid,VISIT,medication,total_dose,admin_location
0,1,WK0,2.0,8.0,1.0


In [44]:
# turn the VISIT labels into ordinal integers, in three steps:
#   1. strip the 'WK' prefix
#   2. map 'BASELINE' to 0
#   3. cast the result to int64
m['VISIT'] = (
    m['VISIT']
    .str.replace('WK', '')
    .replace('BASELINE', 0)
    .astype(np.int64)
)

In [45]:
# choose random number from patdeid column



In [46]:
# restrict the view to a single patient ID, then index by VISIT and
# keep the first row of each visit to see the values recorded per visit

m[m['patdeid'] == 1220].groupby('VISIT').first()

Unnamed: 0_level_0,patdeid,medication,total_dose,admin_location
VISIT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1220,1.0,40.0,1.0
1,1220,1.0,50.0,1.0
2,1220,1.0,50.0,1.0
3,1220,1.0,60.0,1.0
4,1220,1.0,60.0,1.0
5,1220,1.0,60.0,1.0
6,1220,1.0,60.0,1.0
7,1220,1.0,65.0,1.0
8,1220,1.0,65.0,1.0
9,1220,1.0,65.0,1.0


### There were issues with data collection for the medication dose; notice the 0.0 values. We will forward-fill those values to maintain accuracy.

In [47]:
# check value counts for medication
m['medication'].value_counts()

medication
2.0    79571
1.0    79054
Name: count, dtype: int64

In [48]:
# convert 0.0 values in total_dose to NaN so they are treated as missing
m['total_dose'] = m['total_dose'].replace(0.0, np.nan)

# forward fill the missing doses within each patient.
# Grouping by patdeid keeps one patient's last recorded dose from being carried
# into the leading rows of the next patient; a patient's leading gaps therefore
# stay NaN (and are skipped by later sums) instead of borrowing another patient's dose.
# ffill() replaces fillna(method='ffill'), which is deprecated in recent pandas.
m['total_dose'] = m.groupby('patdeid')['total_dose'].ffill()

In [49]:
# list the distinct dose values that remain after filling
m['total_dose'].unique()

array([  8.,  16.,  24.,  32.,  30.,  26.,  28.,  22.,  12.,  40.,  50.,
        60.,  65.,  70.,  75.,  80.,  90.,  95., 100.,  85.,  14.,   2.,
         4.,  20., 110., 135., 120., 130., 140.,  18.,  10.,   6.,  55.,
        98.,  96.,  94.,  92.,  88.,  86.,  35.,  45., 150., 155., 160.,
       144.,  63.,  59.,  61.,  57.,  53.,  51.,  47.,  49., 138., 136.,
       134., 132., 128., 126., 124.,  56., 105., 115.,  72.,  48.,  58.,
        68.,  78., 108., 113., 170., 190., 215., 240., 270., 300., 330.,
       360., 390.,  99.,  93.,  91.,  89.,  87.,  84.,  83.,  82.,  81.,
        79.,  77.,  76.,  74.,  73.,  71.,  69.,  67.,  66.,  64.,  62.,
        54.,  52.,  46.,  44.,  43.,  42.,  41., 125., 145., 133., 131.,
       129., 127.,  37.,  36., 102.,  39., 175., 200.,   3.,  15.,  38.,
        25.,  23.,  34., 180.,  31.,  19.,  13.,   7.,  17.,  27.,  21.,
         9., 117., 114., 111.,  11.,   5., 123.,  33.,  29., 109., 104.,
       195., 112., 121., 119., 107., 103.,  97., 16

In [50]:
# first row of every visit for patient 1
m[m['patdeid'] == 1].groupby('VISIT').first()

Unnamed: 0_level_0,patdeid,medication,total_dose,admin_location
VISIT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,2.0,8.0,1.0
1,1,2.0,16.0,1.0
2,1,2.0,32.0,1.0
3,1,2.0,32.0,1.0
4,1,2.0,32.0,1.0
5,1,2.0,32.0,1.0
6,1,2.0,32.0,1.0
7,1,2.0,32.0,1.0
8,1,2.0,32.0,2.0
9,1,2.0,32.0,1.0


### Dive into each column and map values to binary (1 else 0)

In [51]:
# administration location adds no significant predictive value, so remove it
m = m.drop(columns='admin_location')

There appears to be an issue with duplicates; this requires further analysis.

In [52]:
# sample random patient IDs to see pattern for duplicates

# helper function that pulls one random patient ID from the patdeid column
def random_patient(df, random_state=None):
    """Return the ``patdeid`` value of one randomly sampled row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``patdeid`` column.
    random_state : optional
        Forwarded to ``DataFrame.sample``; pass a fixed value to make the
        draw reproducible. The default (None) keeps the draw unseeded.

    Returns
    -------
    The ``patdeid`` value of the sampled row.
    """
    sampled_row = df.sample(n=1, random_state=random_state)
    return sampled_row['patdeid'].iloc[0]


In [53]:
# display rows 25-49 (positional) of one randomly chosen patient
m[m['patdeid'] == random_patient(m)].iloc[25:50]

Unnamed: 0,patdeid,VISIT,medication,total_dose
103427,1240,4,1.0,40.0
103428,1240,4,1.0,40.0
103429,1240,4,1.0,40.0
103430,1240,4,1.0,40.0
103431,1240,4,1.0,40.0
103432,1240,4,1.0,40.0
103433,1240,4,1.0,40.0
103434,1240,4,1.0,40.0
103435,1240,4,1.0,40.0
103436,1240,4,1.0,40.0


### We will take a few steps to reshape the dataframe from approximately 161,000 rows to 1 row per patient.

- First, we will create a column that counts how many weeks of treatment each patient completed<br>
- Then we will aggregate the total amount of medication each patient consumed within 24 weeks<br>
- The two data points above will allow us to calculate the average daily and weekly dose per patient<br>

Reshaping the data in this way protects data integrity, reduces the number of erroneous features, and improves accuracy for machine learning.


In [54]:
# preview of the weekly aggregation: for every (patdeid, VISIT) pair keep the
# first medication value and the sum of total_dose consumed in that week

m.groupby(['patdeid', 'VISIT']).agg(
    medication=('medication', 'first'),
    total_dose=('total_dose', 'sum'),
).head(5)


Unnamed: 0_level_0,Unnamed: 1_level_0,medication,total_dose
patdeid,VISIT,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,2.0,8.0
1,1,2.0,160.0
1,2,2.0,320.0
1,3,2.0,192.0
1,4,2.0,384.0


In [55]:
# keep the weekly aggregation (first medication, summed dose) as med_agg
med_agg = m.groupby(['patdeid', 'VISIT']).agg(
    medication=('medication', 'first'),
    total_dose=('total_dose', 'sum'),
)

In [56]:
# move the patdeid / VISIT group keys out of the index and back into regular columns
med_agg = med_agg.reset_index()

In [57]:
# check for null values
med_agg['medication'].isna().sum()

1445

In [58]:
# fill null values in the medication column from the same patient's other visits:
# back fill first, then forward fill whatever is still missing at the end of a
# patient's record. Grouping by patdeid prevents a patient's trailing gaps from being
# filled with the next patient's medication code; a patient with no recorded
# medication at all stays NaN.
# NOTE(review): assumes medication does not change within a patient - confirm.
# bfill()/ffill() replace fillna(method=...), which is deprecated in recent pandas.
med_agg['medication'] = (
    med_agg.groupby('patdeid')['medication']
    .transform(lambda codes: codes.bfill().ffill())
)

In [59]:
# confirm how many null medication values remain after filling
med_agg['medication'].isna().sum()

0

In [60]:
# check value counts for medication
med_agg['medication'].value_counts()

medication
2.0    12063
1.0    11465
Name: count, dtype: int64

In [61]:
# create a new dataframe for every distinct value of the VISIT column
# the name of each dataframe will be 'VISIT' + visit number (VISIT0, VISIT1, ...)
for i in med_agg['VISIT'].unique():
    # .copy() gives each per-visit frame its own data, so later modifications
    # do not operate on a filtered slice of med_agg (SettingWithCopyWarning)
    globals()['VISIT%s' % i] = med_agg[med_agg['VISIT']==i].copy()

In [62]:
# for each dataframe between VISIT0 and VISIT24
# append "_" + the visit number to the name of every column
# do not change the patdeid column
for i in range(0,25):
    suffix = '_' + str(i)
    # rename() returns a new frame in a single step, so nothing is assigned into a
    # filtered slice and the column order (patdeid, VISIT_i, medication_i,
    # total_dose_i) is unchanged. Columns that already carry the suffix are
    # skipped, so re-running this cell does not stack suffixes.
    globals()['VISIT%s' % i] = globals()['VISIT%s' % i].rename(
        columns=lambda col: col if col == 'patdeid' or col.endswith(suffix) else col + suffix
    )

In [63]:
# print the shape of each per-visit dataframe followed by its name
for week in range(25):
    frame_name = 'VISIT%s' % week
    print(globals()[frame_name].shape, frame_name)

(1314, 4) VISIT0
(1239, 4) VISIT1
(1212, 4) VISIT2
(1151, 4) VISIT3
(1127, 4) VISIT4
(1061, 4) VISIT5
(1028, 4) VISIT6
(995, 4) VISIT7
(983, 4) VISIT8
(947, 4) VISIT9
(920, 4) VISIT10
(896, 4) VISIT11
(899, 4) VISIT12
(862, 4) VISIT13
(847, 4) VISIT14
(835, 4) VISIT15
(836, 4) VISIT16
(807, 4) VISIT17
(797, 4) VISIT18
(786, 4) VISIT19
(790, 4) VISIT20
(772, 4) VISIT21
(761, 4) VISIT22
(753, 4) VISIT23
(910, 4) VISIT24


In [64]:
# build the wide table: start from the week-0 frame and left-merge the frames
# for weeks 1 through 24 onto it by patdeid, one after another
# (a left merge keeps exactly the patients that appear in VISIT0)
df = globals()['VISIT%s' % 0]
for week in range(1, 25):
    df = pd.merge(df, globals()['VISIT%s' % week], on=['patdeid'], how='left')

In [65]:
# first five rows of the merged wide table
df.head(5)

Unnamed: 0,patdeid,VISIT_0,medication_0,total_dose_0,VISIT_1,medication_1,total_dose_1,VISIT_2,medication_2,total_dose_2,VISIT_3,medication_3,total_dose_3,VISIT_4,medication_4,total_dose_4,VISIT_5,medication_5,total_dose_5,VISIT_6,medication_6,total_dose_6,VISIT_7,medication_7,total_dose_7,VISIT_8,medication_8,total_dose_8,VISIT_9,medication_9,total_dose_9,VISIT_10,medication_10,total_dose_10,VISIT_11,medication_11,total_dose_11,VISIT_12,medication_12,total_dose_12,VISIT_13,medication_13,total_dose_13,VISIT_14,medication_14,total_dose_14,VISIT_15,medication_15,total_dose_15,VISIT_16,medication_16,total_dose_16,VISIT_17,medication_17,total_dose_17,VISIT_18,medication_18,total_dose_18,VISIT_19,medication_19,total_dose_19,VISIT_20,medication_20,total_dose_20,VISIT_21,medication_21,total_dose_21,VISIT_22,medication_22,total_dose_22,VISIT_23,medication_23,total_dose_23,VISIT_24,medication_24,total_dose_24
0,1,0,2.0,8.0,1.0,2.0,160.0,2.0,2.0,320.0,3.0,2.0,192.0,4.0,2.0,384.0,5.0,2.0,96.0,6.0,2.0,96.0,7.0,2.0,352.0,8.0,2.0,128.0,9.0,2.0,256.0,10.0,2.0,256.0,11.0,2.0,224.0,12.0,2.0,448.0,13.0,2.0,32.0,14.0,2.0,224.0,15.0,2.0,224.0,16.0,2.0,240.0,17.0,2.0,182.0,18.0,2.0,182.0,19.0,2.0,240.0,20.0,2.0,210.0,21.0,2.0,180.0,22.0,2.0,246.0,23.0,2.0,128.0,24.0,2.0,188.0
1,2,0,2.0,8.0,1.0,2.0,64.0,2.0,2.0,68.0,3.0,2.0,84.0,4.0,2.0,60.0,5.0,2.0,108.0,6.0,2.0,84.0,7.0,2.0,96.0,8.0,2.0,36.0,9.0,2.0,96.0,10.0,2.0,88.0,11.0,2.0,112.0,12.0,2.0,104.0,13.0,2.0,56.0,14.0,2.0,88.0,15.0,2.0,160.0,16.0,2.0,80.0,17.0,2.0,72.0,18.0,2.0,56.0,19.0,2.0,56.0,20.0,2.0,56.0,21.0,2.0,80.0,22.0,2.0,84.0,23.0,2.0,84.0,24.0,2.0,68.0
2,3,0,1.0,30.0,1.0,1.0,170.0,2.0,1.0,350.0,3.0,1.0,420.0,4.0,1.0,420.0,5.0,1.0,540.0,6.0,1.0,310.0,7.0,1.0,455.0,8.0,1.0,455.0,9.0,1.0,480.0,10.0,1.0,600.0,11.0,1.0,455.0,12.0,1.0,560.0,13.0,1.0,800.0,14.0,1.0,600.0,15.0,1.0,360.0,16.0,1.0,640.0,17.0,1.0,700.0,18.0,1.0,700.0,19.0,1.0,800.0,20.0,1.0,600.0,21.0,1.0,765.0,22.0,1.0,630.0,23.0,1.0,510.0,24.0,1.0,715.0
3,4,0,2.0,16.0,1.0,2.0,248.0,2.0,2.0,256.0,3.0,2.0,160.0,4.0,2.0,96.0,5.0,2.0,416.0,6.0,2.0,256.0,7.0,2.0,224.0,8.0,2.0,224.0,9.0,2.0,224.0,10.0,2.0,224.0,11.0,2.0,224.0,12.0,2.0,320.0,13.0,2.0,160.0,14.0,2.0,256.0,15.0,2.0,160.0,16.0,2.0,320.0,17.0,2.0,128.0,18.0,2.0,256.0,19.0,2.0,192.0,20.0,2.0,448.0,21.0,2.0,64.0,22.0,2.0,160.0,23.0,2.0,192.0,24.0,2.0,96.0
4,6,0,2.0,16.0,1.0,2.0,16.0,2.0,2.0,16.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [66]:
# show every column name of the merged table as a plain list
df.columns.tolist()

['patdeid',
 'VISIT_0',
 'medication_0',
 'total_dose_0',
 'VISIT_1',
 'medication_1',
 'total_dose_1',
 'VISIT_2',
 'medication_2',
 'total_dose_2',
 'VISIT_3',
 'medication_3',
 'total_dose_3',
 'VISIT_4',
 'medication_4',
 'total_dose_4',
 'VISIT_5',
 'medication_5',
 'total_dose_5',
 'VISIT_6',
 'medication_6',
 'total_dose_6',
 'VISIT_7',
 'medication_7',
 'total_dose_7',
 'VISIT_8',
 'medication_8',
 'total_dose_8',
 'VISIT_9',
 'medication_9',
 'total_dose_9',
 'VISIT_10',
 'medication_10',
 'total_dose_10',
 'VISIT_11',
 'medication_11',
 'total_dose_11',
 'VISIT_12',
 'medication_12',
 'total_dose_12',
 'VISIT_13',
 'medication_13',
 'total_dose_13',
 'VISIT_14',
 'medication_14',
 'total_dose_14',
 'VISIT_15',
 'medication_15',
 'total_dose_15',
 'VISIT_16',
 'medication_16',
 'total_dose_16',
 'VISIT_17',
 'medication_17',
 'total_dose_17',
 'VISIT_18',
 'medication_18',
 'total_dose_18',
 'VISIT_19',
 'medication_19',
 'total_dose_19',
 'VISIT_20',
 'medication_20',
 'total

In [71]:
# collect the names of all per-week medication columns
med_del = df.columns[df.columns.str.startswith('medication_')].tolist()

In [73]:
# keep medication_0 out of the deletion list so that column survives the drop;
# filtering (rather than list.remove) does not raise ValueError if this cell is run twice
med_del = [name for name in med_del if name != 'medication_0']

In [74]:
# remove the medication columns listed in med_del
df = df.drop(columns=med_del)

In [75]:
# first five rows after dropping the per-week medication columns
df.head(5)

Unnamed: 0,patdeid,VISIT_0,medication_0,total_dose_0,VISIT_1,total_dose_1,VISIT_2,total_dose_2,VISIT_3,total_dose_3,VISIT_4,total_dose_4,VISIT_5,total_dose_5,VISIT_6,total_dose_6,VISIT_7,total_dose_7,VISIT_8,total_dose_8,VISIT_9,total_dose_9,VISIT_10,total_dose_10,VISIT_11,total_dose_11,VISIT_12,total_dose_12,VISIT_13,total_dose_13,VISIT_14,total_dose_14,VISIT_15,total_dose_15,VISIT_16,total_dose_16,VISIT_17,total_dose_17,VISIT_18,total_dose_18,VISIT_19,total_dose_19,VISIT_20,total_dose_20,VISIT_21,total_dose_21,VISIT_22,total_dose_22,VISIT_23,total_dose_23,VISIT_24,total_dose_24
0,1,0,2.0,8.0,1.0,160.0,2.0,320.0,3.0,192.0,4.0,384.0,5.0,96.0,6.0,96.0,7.0,352.0,8.0,128.0,9.0,256.0,10.0,256.0,11.0,224.0,12.0,448.0,13.0,32.0,14.0,224.0,15.0,224.0,16.0,240.0,17.0,182.0,18.0,182.0,19.0,240.0,20.0,210.0,21.0,180.0,22.0,246.0,23.0,128.0,24.0,188.0
1,2,0,2.0,8.0,1.0,64.0,2.0,68.0,3.0,84.0,4.0,60.0,5.0,108.0,6.0,84.0,7.0,96.0,8.0,36.0,9.0,96.0,10.0,88.0,11.0,112.0,12.0,104.0,13.0,56.0,14.0,88.0,15.0,160.0,16.0,80.0,17.0,72.0,18.0,56.0,19.0,56.0,20.0,56.0,21.0,80.0,22.0,84.0,23.0,84.0,24.0,68.0
2,3,0,1.0,30.0,1.0,170.0,2.0,350.0,3.0,420.0,4.0,420.0,5.0,540.0,6.0,310.0,7.0,455.0,8.0,455.0,9.0,480.0,10.0,600.0,11.0,455.0,12.0,560.0,13.0,800.0,14.0,600.0,15.0,360.0,16.0,640.0,17.0,700.0,18.0,700.0,19.0,800.0,20.0,600.0,21.0,765.0,22.0,630.0,23.0,510.0,24.0,715.0
3,4,0,2.0,16.0,1.0,248.0,2.0,256.0,3.0,160.0,4.0,96.0,5.0,416.0,6.0,256.0,7.0,224.0,8.0,224.0,9.0,224.0,10.0,224.0,11.0,224.0,12.0,320.0,13.0,160.0,14.0,256.0,15.0,160.0,16.0,320.0,17.0,128.0,18.0,256.0,19.0,192.0,20.0,448.0,21.0,64.0,22.0,160.0,23.0,192.0,24.0,96.0
4,6,0,2.0,16.0,1.0,16.0,2.0,16.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [76]:
# first five rows of df (same view as the previous cell)
df.head(5)

Unnamed: 0,patdeid,VISIT_0,medication_0,total_dose_0,VISIT_1,total_dose_1,VISIT_2,total_dose_2,VISIT_3,total_dose_3,VISIT_4,total_dose_4,VISIT_5,total_dose_5,VISIT_6,total_dose_6,VISIT_7,total_dose_7,VISIT_8,total_dose_8,VISIT_9,total_dose_9,VISIT_10,total_dose_10,VISIT_11,total_dose_11,VISIT_12,total_dose_12,VISIT_13,total_dose_13,VISIT_14,total_dose_14,VISIT_15,total_dose_15,VISIT_16,total_dose_16,VISIT_17,total_dose_17,VISIT_18,total_dose_18,VISIT_19,total_dose_19,VISIT_20,total_dose_20,VISIT_21,total_dose_21,VISIT_22,total_dose_22,VISIT_23,total_dose_23,VISIT_24,total_dose_24
0,1,0,2.0,8.0,1.0,160.0,2.0,320.0,3.0,192.0,4.0,384.0,5.0,96.0,6.0,96.0,7.0,352.0,8.0,128.0,9.0,256.0,10.0,256.0,11.0,224.0,12.0,448.0,13.0,32.0,14.0,224.0,15.0,224.0,16.0,240.0,17.0,182.0,18.0,182.0,19.0,240.0,20.0,210.0,21.0,180.0,22.0,246.0,23.0,128.0,24.0,188.0
1,2,0,2.0,8.0,1.0,64.0,2.0,68.0,3.0,84.0,4.0,60.0,5.0,108.0,6.0,84.0,7.0,96.0,8.0,36.0,9.0,96.0,10.0,88.0,11.0,112.0,12.0,104.0,13.0,56.0,14.0,88.0,15.0,160.0,16.0,80.0,17.0,72.0,18.0,56.0,19.0,56.0,20.0,56.0,21.0,80.0,22.0,84.0,23.0,84.0,24.0,68.0
2,3,0,1.0,30.0,1.0,170.0,2.0,350.0,3.0,420.0,4.0,420.0,5.0,540.0,6.0,310.0,7.0,455.0,8.0,455.0,9.0,480.0,10.0,600.0,11.0,455.0,12.0,560.0,13.0,800.0,14.0,600.0,15.0,360.0,16.0,640.0,17.0,700.0,18.0,700.0,19.0,800.0,20.0,600.0,21.0,765.0,22.0,630.0,23.0,510.0,24.0,715.0
3,4,0,2.0,16.0,1.0,248.0,2.0,256.0,3.0,160.0,4.0,96.0,5.0,416.0,6.0,256.0,7.0,224.0,8.0,224.0,9.0,224.0,10.0,224.0,11.0,224.0,12.0,320.0,13.0,160.0,14.0,256.0,15.0,160.0,16.0,320.0,17.0,128.0,18.0,256.0,19.0,192.0,20.0,448.0,21.0,64.0,22.0,160.0,23.0,192.0,24.0,96.0
4,6,0,2.0,16.0,1.0,16.0,2.0,16.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [80]:
# keep only the columns whose name does not start with 'VISIT'
df = df.loc[:, ~df.columns.str.startswith('VISIT')]

In [81]:
# first five rows after dropping the VISIT_* columns
df.head(5)

Unnamed: 0,patdeid,medication_0,total_dose_0,total_dose_1,total_dose_2,total_dose_3,total_dose_4,total_dose_5,total_dose_6,total_dose_7,total_dose_8,total_dose_9,total_dose_10,total_dose_11,total_dose_12,total_dose_13,total_dose_14,total_dose_15,total_dose_16,total_dose_17,total_dose_18,total_dose_19,total_dose_20,total_dose_21,total_dose_22,total_dose_23,total_dose_24
0,1,2.0,8.0,160.0,320.0,192.0,384.0,96.0,96.0,352.0,128.0,256.0,256.0,224.0,448.0,32.0,224.0,224.0,240.0,182.0,182.0,240.0,210.0,180.0,246.0,128.0,188.0
1,2,2.0,8.0,64.0,68.0,84.0,60.0,108.0,84.0,96.0,36.0,96.0,88.0,112.0,104.0,56.0,88.0,160.0,80.0,72.0,56.0,56.0,56.0,80.0,84.0,84.0,68.0
2,3,1.0,30.0,170.0,350.0,420.0,420.0,540.0,310.0,455.0,455.0,480.0,600.0,455.0,560.0,800.0,600.0,360.0,640.0,700.0,700.0,800.0,600.0,765.0,630.0,510.0,715.0
3,4,2.0,16.0,248.0,256.0,160.0,96.0,416.0,256.0,224.0,224.0,224.0,224.0,224.0,320.0,160.0,256.0,160.0,320.0,128.0,256.0,192.0,448.0,64.0,160.0,192.0,96.0
4,6,2.0,16.0,16.0,16.0,,,,,,,,,,,,,,,,,,,,,,


In [82]:
# the single remaining medication column no longer needs its week suffix
df = df.rename({'medication_0': 'medication'}, axis='columns')

In [83]:
# replace every remaining NaN in the table with 0
df = df.fillna(value=0)

In [84]:
# first five rows of the final table
df.head(5)

Unnamed: 0,patdeid,medication,total_dose_0,total_dose_1,total_dose_2,total_dose_3,total_dose_4,total_dose_5,total_dose_6,total_dose_7,total_dose_8,total_dose_9,total_dose_10,total_dose_11,total_dose_12,total_dose_13,total_dose_14,total_dose_15,total_dose_16,total_dose_17,total_dose_18,total_dose_19,total_dose_20,total_dose_21,total_dose_22,total_dose_23,total_dose_24
0,1,2.0,8.0,160.0,320.0,192.0,384.0,96.0,96.0,352.0,128.0,256.0,256.0,224.0,448.0,32.0,224.0,224.0,240.0,182.0,182.0,240.0,210.0,180.0,246.0,128.0,188.0
1,2,2.0,8.0,64.0,68.0,84.0,60.0,108.0,84.0,96.0,36.0,96.0,88.0,112.0,104.0,56.0,88.0,160.0,80.0,72.0,56.0,56.0,56.0,80.0,84.0,84.0,68.0
2,3,1.0,30.0,170.0,350.0,420.0,420.0,540.0,310.0,455.0,455.0,480.0,600.0,455.0,560.0,800.0,600.0,360.0,640.0,700.0,700.0,800.0,600.0,765.0,630.0,510.0,715.0
3,4,2.0,16.0,248.0,256.0,160.0,96.0,416.0,256.0,224.0,224.0,224.0,224.0,224.0,320.0,160.0,256.0,160.0,320.0,128.0,256.0,192.0,448.0,64.0,160.0,192.0,96.0
4,6,2.0,16.0,16.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [85]:
# write the final per-patient table to disk without the row index
df.to_csv(
    path_or_buf='/Users/danherman/Desktop/oud_treatment_outcome/data/clean_data/feature_engineering_data/medication.csv',
    index=False,
)