In [1]:
import pandas as pd # only pandas required for this notebook

In [2]:
# load the table of medication doses taken over the 24 week period
medication_dose = pd.read_csv(filepath_or_buffer='../data/T_FRDOS.csv')

# preview the first five rows
medication_dose.head(n=5)

Unnamed: 0,PATIENTNUMBER,SITE,VISIT,PATIENTID,VISITID,DOS002,DOS002_UNIT,DOS002_NORM,DOS005,DOS005_UNIT,DOS005_NORM,DOS006,DOS006_UNIT,DOS006_NORM,VISITDT,DOS001,DOS001_DT,VISITDT_Dt,patdeid
0,,,WK0,,15034,2.0,,2.0,8.0,,8.0,1.0,,1.0,,,.,0.0,1
1,,,WK1,,15037,2.0,,2.0,16.0,,16.0,1.0,,1.0,,,.,6.0,1
2,,,WK1,,15037,2.0,,2.0,24.0,,24.0,1.0,,1.0,,,.,6.0,1
3,,,WK1,,15037,2.0,,2.0,24.0,,24.0,1.0,,1.0,,,.,6.0,1
4,,,WK1,,15037,2.0,,2.0,32.0,,32.0,1.0,,1.0,,,.,6.0,1


In [3]:
# inspect the column labels present in the loaded table
medication_dose.columns

Index(['PATIENTNUMBER', 'SITE', 'VISIT', 'PATIENTID', 'VISITID', 'DOS002',
       'DOS002_UNIT', 'DOS002_NORM', 'DOS005', 'DOS005_UNIT', 'DOS005_NORM',
       'DOS006', 'DOS006_UNIT', 'DOS006_NORM', 'VISITDT', 'DOS001',
       'DOS001_DT', 'VISITDT_Dt', 'patdeid'],
      dtype='object')

In [4]:
# Drop the columns whose data we are not using for this analysis.
# `columns=` already identifies the axis, so no separate `axis` argument is
# passed. The remaining columns are VISIT, DOS002, DOS005, DOS006 and patdeid.
medication_dose = medication_dose.drop(
    columns=[
        'PATIENTNUMBER', 'DOS001', 'SITE', 'PATIENTID', 'VISITID',
        'DOS002_UNIT', 'DOS002_NORM',
        'DOS005_UNIT', 'DOS005_NORM',
        'DOS006_UNIT', 'DOS006_NORM',
        'VISITDT', 'DOS001_DT', 'VISITDT_Dt',
    ]
)

medication_dose.head()

Unnamed: 0,VISIT,DOS002,DOS005,DOS006,patdeid
0,WK0,2.0,8.0,1.0,1
1,WK1,2.0,16.0,1.0,1
2,WK1,2.0,24.0,1.0,1
3,WK1,2.0,24.0,1.0,1
4,WK1,2.0,32.0,1.0,1


In [5]:
# mapping from the raw DOS00x codes to interpretable names (per the documentation)
new_columns = {
    'DOS002': 'medication',
    'DOS005': 'total_dose',
    'DOS006': 'admin_location',
}


In [6]:
# apply the new_columns mapping to the column labels
medication_dose = medication_dose.rename(mapper=new_columns, axis='columns')

# confirm the new labels
medication_dose.head()

Unnamed: 0,VISIT,medication,total_dose,admin_location,patdeid
0,WK0,2.0,8.0,1.0,1
1,WK1,2.0,16.0,1.0,1
2,WK1,2.0,24.0,1.0,1
3,WK1,2.0,24.0,1.0,1
4,WK1,2.0,32.0,1.0,1


In [7]:
# Keep these four columns, with the patient id first, for easier interpretation.
# Note: VISIT is not in this list, so it is removed from the frame here.
# Plain label selection is used so that a misspelled or missing column raises
# KeyError instead of silently appearing as an all-NaN column.
medication_dose = medication_dose[['patdeid', 'medication', 'total_dose', 'admin_location']]

medication_dose.head()

Unnamed: 0,patdeid,medication,total_dose,admin_location
0,1,2.0,8.0,1.0
1,1,2.0,16.0,1.0
2,1,2.0,24.0,1.0
3,1,2.0,24.0,1.0
4,1,2.0,32.0,1.0


In [8]:
# check for class imbalance of medication prescribed
# (1 = methadone, 2 = buprenorphine)
medication_dose['medication'].value_counts()

medication
2.0    79571
1.0    79054
Name: count, dtype: int64

In [12]:
# number of distinct values in the total_dose column (NaN counted as one value)
medication_dose['total_dose'].nunique(dropna=False)

176

In [10]:
# list the distinct values found in the total_dose column
medication_dose['total_dose'].unique()

array([  8.,  16.,  24.,  32.,   0.,  nan,  30.,  26.,  28.,  22.,  12.,
        40.,  50.,  60.,  65.,  70.,  75.,  80.,  90.,  95., 100.,  85.,
        14.,   2.,   4.,  20., 110., 135., 120., 130., 140.,  18.,  10.,
         6.,  55.,  98.,  96.,  94.,  92.,  88.,  86.,  35.,  45., 150.,
       155., 160., 144.,  63.,  59.,  61.,  57.,  53.,  51.,  47.,  49.,
       138., 136., 134., 132., 128., 126., 124.,  56., 105., 115.,  72.,
        48.,  58.,  68.,  78., 108., 113., 170., 190., 215., 240., 270.,
       300., 330., 360., 390.,  99.,  93.,  91.,  89.,  87.,  84.,  83.,
        82.,  81.,  79.,  77.,  76.,  74.,  73.,  71.,  69.,  67.,  66.,
        64.,  62.,  54.,  52.,  46.,  44.,  43.,  42.,  41., 125., 145.,
       133., 131., 129., 127.,  37.,  36., 102.,  39., 175., 200.,   3.,
        15.,  38.,  25.,  23.,  34., 180.,  31.,  19.,  13.,   7.,  17.,
        27.,  21.,   9., 117., 114., 111.,  11.,   5., 123.,  33.,  29.,
       109., 104., 195., 112., 121., 119., 107., 10

In [13]:
# show the (min, max) range of values in the total_dose column
(
    medication_dose['total_dose'].min(),
    medication_dose['total_dose'].max(),
)

(0.0, 397.0)

In [11]:
# count the distinct patient ids in patdeid; number of patients is consistent
# with CTN protocol
medication_dose['patdeid'].nunique()

1315

In [9]:
# distribution of admin location values (1 = Clinic, 2 = Take Home)
medication_dose['admin_location'].value_counts()

admin_location
1.0    137214
2.0     21333
Name: count, dtype: int64

In [14]:
# write the cleaned table to CSV (without the index); it is then passed to
# layer 2 cleaning for merge
medication_dose.to_csv(path_or_buf='../data/medication_dose.csv', index=False)