In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt


# hide warnings
import warnings
warnings.filterwarnings('ignore')  # NOTE: this silences every warning category, pandas deprecation notices included

# widen the display limits so wide / long frames are not truncated in cell output
display_limits = {'display.max_columns': 100, 'display.max_rows': 100}
for option_name, option_value in display_limits.items():
    pd.set_option(option_name, option_value)

In [3]:
# read table for medication doses taken over 24 week period
raw_doses_path = '/Users/danherman/Desktop/oud_treatment_outcome/data/raw_data_files/T_FRDOS.csv'
m = pd.read_csv(raw_doses_path)

# show (rows, columns), then preview the first five records
display(m.shape)
m.head()

(160908, 19)

Unnamed: 0,PATIENTNUMBER,SITE,VISIT,PATIENTID,VISITID,DOS002,DOS002_UNIT,DOS002_NORM,DOS005,DOS005_UNIT,DOS005_NORM,DOS006,DOS006_UNIT,DOS006_NORM,VISITDT,DOS001,DOS001_DT,VISITDT_Dt,patdeid
0,,,WK0,,15034,2.0,,2.0,8.0,,8.0,1.0,,1.0,,,.,0.0,1
1,,,WK1,,15037,2.0,,2.0,16.0,,16.0,1.0,,1.0,,,.,6.0,1
2,,,WK1,,15037,2.0,,2.0,24.0,,24.0,1.0,,1.0,,,.,6.0,1
3,,,WK1,,15037,2.0,,2.0,24.0,,24.0,1.0,,1.0,,,.,6.0,1
4,,,WK1,,15037,2.0,,2.0,32.0,,32.0,1.0,,1.0,,,.,6.0,1


In [4]:
# retrieve column names (the Index of column labels of the frame loaded from the CSV)
m.columns

Index(['PATIENTNUMBER', 'SITE', 'VISIT', 'PATIENTID', 'VISITID', 'DOS002',
       'DOS002_UNIT', 'DOS002_NORM', 'DOS005', 'DOS005_UNIT', 'DOS005_NORM',
       'DOS006', 'DOS006_UNIT', 'DOS006_NORM', 'VISITDT', 'DOS001',
       'DOS001_DT', 'VISITDT_Dt', 'patdeid'],
      dtype='object')

In [5]:
# columns whose data we are not using for this analysis
unused_columns = [
    'PATIENTNUMBER', 'DOS001', 'SITE', 'PATIENTID', 'VISITID',
    'DOS002_UNIT', 'DOS002_NORM', 'DOS005_UNIT', 'DOS005_NORM',
    'DOS006_UNIT', 'DOS006_NORM', 'VISITDT', 'DOS001_DT', 'VISITDT_Dt',
]
m = m.drop(columns=unused_columns)

m.head(1)  # check column names

Unnamed: 0,VISIT,DOS002,DOS005,DOS006,patdeid
0,WK0,2.0,8.0,1.0,1


In [6]:
# map the raw DOS* field codes to readable column names (per the dataset documentation)
new_columns = dict(
    DOS002='medication',
    DOS005='total_dose',
    DOS006='admin_location',
)


In [7]:
# apply the readable names from `new_columns`
m = m.rename(new_columns, axis='columns')

m.head(1)  # check column names

Unnamed: 0,VISIT,medication,total_dose,admin_location,patdeid
0,WK0,2.0,8.0,1.0,1


In [8]:
# put the patient id first, followed by the visit and dose fields
column_order = ['patdeid', 'VISIT', 'medication', 'total_dose', 'admin_location']
m = m.reindex(columns=column_order)

m.head(1)  # check column names

Unnamed: 0,patdeid,VISIT,medication,total_dose,admin_location
0,1,WK0,2.0,8.0,1.0


In [9]:
# turn the VISIT labels into ordinal integers in one pass:
#   1. strip the 'WK' prefix
#   2. map the 'BASELINE' label to 0
#   3. cast the result to int64
# NOTE: not re-runnable on its own — once VISIT is numeric the .str accessor is no longer available
m['VISIT'] = (
    m['VISIT']
    .str.replace('WK', '')
    .replace('BASELINE', 0)
    .astype(np.int64)
)

In [10]:
# function to choose random patient id

def random_patient_id(random_state=None):
    """Return the ``patdeid`` of one randomly sampled row of the notebook-level frame ``m``.

    Parameters
    ----------
    random_state : int, numpy Generator / RandomState, optional
        Forwarded to ``Series.sample``. Leave as ``None`` (the default) for a
        different draw on every call; pass a seed to make the draw repeatable
        across kernel restarts.

    Returns
    -------
    The ``patdeid`` value of the sampled row.

    Notes
    -----
    Sampling is over rows of ``m``, so an id that appears on more rows is
    proportionally more likely to be drawn.
    """
    return m.patdeid.sample(n=1, random_state=random_state).values[0]

In [11]:
# pick one patient at random and keep only that patient's rows
patient_rows = m.loc[m.patdeid == random_patient_id()]

# index by VISIT and show the first recorded row of each visit
patient_rows.groupby('VISIT').first()

Unnamed: 0_level_0,patdeid,medication,total_dose,admin_location
VISIT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,351,2.0,8.0,1.0
1,351,2.0,16.0,1.0
2,351,2.0,32.0,1.0
3,351,2.0,32.0,1.0
4,351,2.0,32.0,1.0
5,351,2.0,32.0,1.0
6,351,2.0,32.0,1.0
7,351,2.0,32.0,1.0
8,351,2.0,32.0,1.0
9,351,2.0,32.0,1.0


### There were issues with data collection for medication dose: some `total_dose` values are recorded as 0.0. We will forward-fill those values to maintain accuracy.

In [12]:
# check how many rows carry each medication code
m['medication'].value_counts()

medication
2.0    79571
1.0    79054
Name: count, dtype: int64

In [13]:
# treat a recorded dose of 0.0 as missing
m['total_dose'] = m['total_dose'].replace(0.0, np.nan)

# forward fill missing doses within each patient only, so a gap at the start of
# one patient's records never inherits the previous patient's last dose;
# a patient's leading gaps (nothing earlier to copy from) stay NaN.
# GroupBy.ffill also replaces the deprecated fillna(method='ffill') spelling.
m['total_dose'] = m.groupby('patdeid')['total_dose'].ffill()

In [14]:
# smallest and largest total_dose after cleaning
m['total_dose'].agg(['min', 'max'])

min      1.0
max    397.0
Name: total_dose, dtype: float64

In [15]:
# first recorded row of every visit for patient 1
m[m['patdeid'] == 1].groupby('VISIT').first()

Unnamed: 0_level_0,patdeid,medication,total_dose,admin_location
VISIT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,2.0,8.0,1.0
1,1,2.0,16.0,1.0
2,1,2.0,32.0,1.0
3,1,2.0,32.0,1.0
4,1,2.0,32.0,1.0
5,1,2.0,32.0,1.0
6,1,2.0,32.0,1.0
7,1,2.0,32.0,1.0
8,1,2.0,32.0,2.0
9,1,2.0,32.0,1.0


In [16]:
# back fill missing medication codes within each patient only, so a gap at the
# end of one patient's records is never filled from the next patient's rows;
# a patient's trailing gaps (nothing later to copy from) stay NaN.
# GroupBy.bfill also replaces the deprecated fillna(method='bfill') spelling.
m['medication'] = m.groupby('patdeid')['medication'].bfill()

In [17]:
# administration location is not used as a predictor, so remove the column
m = m.drop(columns='admin_location')

In [18]:
# preview the weekly aggregation: for each patient and visit keep the first
# medication code and the sum of the doses recorded for that visit
weekly_keys = ['patdeid', 'VISIT']
m.groupby(weekly_keys).agg(
    medication=('medication', 'first'),
    total_dose=('total_dose', 'sum'),
).head()


Unnamed: 0_level_0,Unnamed: 1_level_0,medication,total_dose
patdeid,VISIT,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,2.0,8.0
1,1,2.0,160.0
1,2,2.0,320.0
1,3,2.0,192.0
1,4,2.0,384.0


In [19]:
# replace `m` with the weekly aggregate (indexed by patdeid, VISIT)
m = m.groupby(['patdeid', 'VISIT']).agg(
    medication=('medication', 'first'),
    total_dose=('total_dose', 'sum'),
)

In [20]:
# move patdeid / VISIT out of the index and back into regular columns
m = m.reset_index()

# keep a named snapshot of the per-patient, per-visit table: `m` is reduced to
# one row per patient further down, while the wide-format cells near the end of
# the notebook read `med_agg` and expect a VISIT column (week 0 included)
# NOTE(review): no other visible cell defines `med_agg` — confirm this is the intended frame
med_agg = m.copy()

In [21]:
# first five rows of the weekly table
m.head()

Unnamed: 0,patdeid,VISIT,medication,total_dose
0,1,0,2.0,8.0
1,1,1,2.0,160.0
2,1,2,2.0,320.0
3,1,3,2.0,192.0
4,1,4,2.0,384.0


In [22]:
# drop the week-0 rows; take an explicit copy so the column assigned onto `m`
# in the following step is written to an independent frame, not a slice
m = m.loc[m.VISIT != 0].copy()

In [23]:
# number of distinct VISIT values per patient, repeated on every row of that patient
visits_per_patient = m.groupby('patdeid')['VISIT'].transform('nunique')
m['total_visits'] = visits_per_patient

In [24]:
# collapse to one row per patient: first medication code, summed dose, visit count
m = (
    m.groupby('patdeid')
    .agg(
        medication=('medication', 'first'),
        total_dose=('total_dose', 'sum'),
        total_visits=('total_visits', 'first'),
    )
    .reset_index()
)

In [25]:
# average dose per attended visit
m['total_dose_visit'] = m['total_dose'].div(m['total_visits'])

In [26]:
# keep two decimal places
m['total_dose_visit'] = m['total_dose_visit'].round(decimals=2)

In [27]:
# first five rows of the per-patient table
m.head()

Unnamed: 0,patdeid,medication,total_dose,total_visits,total_dose_visit
0,1,2.0,5188.0,24,216.17
1,2,2.0,1940.0,24,80.83
2,3,1.0,13035.0,24,543.12
3,4,2.0,5304.0,24,221.0
4,6,2.0,32.0,2,16.0


In [28]:
# split the per-visit average into one column per medication code
# (code 1.0 -> metha_dose_visit, code 2.0 -> bupe_dose_visit); the column that
# does not match a patient's code is 0
has_code_1 = m['medication'] == 1.0
has_code_2 = m['medication'] == 2.0
m['metha_dose_visit'] = m['total_dose_visit'].where(has_code_1, 0.0)
m['bupe_dose_visit'] = m['total_dose_visit'].where(has_code_2, 0.0)

# any value still missing anywhere in the frame becomes 0
m = m.fillna(0)

In [29]:
# first five rows, now with the per-medication columns
m.head()

Unnamed: 0,patdeid,medication,total_dose,total_visits,total_dose_visit,metha_dose_visit,bupe_dose_visit
0,1,2.0,5188.0,24,216.17,0.0,216.17
1,2,2.0,1940.0,24,80.83,0.0,80.83
2,3,1.0,13035.0,24,543.12,543.12,0.0
3,4,2.0,5304.0,24,221.0,0.0,221.0
4,6,2.0,32.0,2,16.0,0.0,16.0


In [30]:
# write the per-patient table for the EDA stage (row index omitted)
eda_output_path = '/Users/danherman/Desktop/oud_treatment_outcome/data/clean_data/eda_data/medication.csv'
m.to_csv(eda_output_path, index=False)

In [None]:
# create one dataframe per distinct value of the VISIT column
# each is stored as a notebook-level variable named VISIT<number> (VISIT0, VISIT1, ...)
# assumes med_agg holds one row per patient per visit — TODO confirm
visit_column = med_agg['VISIT']
for visit_number in visit_column.unique():
    frame_name = 'VISIT%s' % visit_number
    globals()[frame_name] = med_agg[visit_column == visit_number]

In [None]:
# for each dataframe between VISIT0 and VISIT24,
# append "_<visit number>" to every column name except patdeid
# (e.g. total_dose -> total_dose_3 in VISIT3)
# a single rename returns a new frame, so nothing is assigned onto the filtered
# slice and the columns are not added and dropped one at a time
# NOTE: raises KeyError if one of VISIT0..VISIT24 was never created
for visit_number in range(0, 25):
    frame_name = 'VISIT%s' % visit_number
    visit_frame = globals()[frame_name]
    suffixed_names = {
        column: column + '_' + str(visit_number)
        for column in visit_frame.columns
        if column != 'patdeid'
    }
    globals()[frame_name] = visit_frame.rename(columns=suffixed_names)

In [None]:
# print each per-visit dataframe's shape next to its name
for visit_number in range(0, 25):
    frame_name = 'VISIT%s' % visit_number
    print(globals()[frame_name].shape, frame_name)

In [None]:
# build the wide table: start from VISIT0 and left-merge VISIT1..VISIT24 onto it by patdeid
# NOTE(review): with how='left' only patdeid values present in VISIT0 are kept — confirm that is intended
df = globals()['VISIT0']
for next_visit in range(1, 25):
    df = pd.merge(df, globals()['VISIT%s' % next_visit], on=['patdeid'], how='left')

In [None]:
# first five rows of the merged wide table
df.head()

In [None]:
# column names of the merged table as a plain list
df.columns.tolist()

In [None]:
# remove every column whose name starts with 'VISIT'
visit_columns = [name for name in df.columns if name.startswith('VISIT')]
df = df.drop(columns=visit_columns)

In [None]:
# confirm which columns remain after the drop
df.columns.tolist()

In [None]:
# replace every missing value in the wide table with 0
df = df.fillna(value=0)

In [None]:
# write the wide table for the feature-engineering stage (row index omitted)
feature_output_path = '/Users/danherman/Desktop/oud_treatment_outcome/data/clean_data/feature_engineering_data/medication.csv'
df.to_csv(feature_output_path, index=False)