In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import functools as ft # 

pd.set_option('display.max_columns', None)

In [2]:
# read in 4 csv files from '../data/clean_data/' folder

# t1 == Medication Doses
t1 = pd.read_csv('/Users/danherman/Desktop/oud_treatment_outcome/data/clean_data/feature_engineering_data/medication.csv')

# t2 == Self Reported Drug Use
t2 = pd.read_csv('/Users/danherman/Desktop/oud_treatment_outcome/data/clean_data/feature_engineering_data/self_reported_use.csv')

# t3 == Urine Drug Screen
t3 = pd.read_csv('/Users/danherman/Desktop/oud_treatment_outcome/data/clean_data/feature_engineering_data/urine_drug_screen.csv')

# target == Treatment Outcome
target = pd.read_csv('/Users/danherman/Desktop/oud_treatment_outcome/data/clean_data/feature_engineering_data/target.csv')

In [3]:
# merge dfs, use .reduce() to treat all 4 dfs as one iterable

dfs = [t1, t2, t3, target]

t4 = ft.reduce(lambda left, right: pd.merge(left, right, on='patdeid'), dfs)

In [4]:
t4.shape

(1345, 156)

# Feature Engineering
We must remove the last 4 weeks of opiate tests to prevent data leakage into the model

In [5]:
# drop columns that show opiate tests for the final 4 weeks of treatment
t4.drop(columns=[col for col in t4.columns if 't_Opiate300_21' in col or 't_Opiate300_22' in col or 't_Opiate300_23' in col or 't_Opiate300_24' in col], inplace=True)

In [6]:
t4.drop(columns='sru_opiates_24', inplace=True)

In [7]:
list(t4.columns)

['patdeid',
 'medication',
 'total_visits',
 'metha_dose_visit_1',
 'bupe_dose_visit_1',
 'metha_dose_visit_2',
 'bupe_dose_visit_2',
 'metha_dose_visit_3',
 'bupe_dose_visit_3',
 'metha_dose_visit_4',
 'bupe_dose_visit_4',
 'metha_dose_visit_5',
 'bupe_dose_visit_5',
 'metha_dose_visit_6',
 'bupe_dose_visit_6',
 'metha_dose_visit_7',
 'bupe_dose_visit_7',
 'metha_dose_visit_8',
 'bupe_dose_visit_8',
 'metha_dose_visit_9',
 'bupe_dose_visit_9',
 'metha_dose_visit_10',
 'bupe_dose_visit_10',
 'metha_dose_visit_11',
 'bupe_dose_visit_11',
 'metha_dose_visit_12',
 'bupe_dose_visit_12',
 'metha_dose_visit_13',
 'bupe_dose_visit_13',
 'metha_dose_visit_14',
 'bupe_dose_visit_14',
 'metha_dose_visit_15',
 'bupe_dose_visit_15',
 'metha_dose_visit_16',
 'bupe_dose_visit_16',
 'metha_dose_visit_17',
 'bupe_dose_visit_17',
 'metha_dose_visit_18',
 'bupe_dose_visit_18',
 'metha_dose_visit_19',
 'bupe_dose_visit_19',
 'metha_dose_visit_20',
 'bupe_dose_visit_20',
 'metha_dose_visit_21',
 'bupe_dos

In [8]:
t4 = t4.drop(columns=['sru_other','Unnamed: 0',])

In [8]:
# drop patdeid column
t4.drop('patdeid', axis=1, inplace=True)

In [9]:
# check value counts for dtypes, make sure there are no strings
t4.dtypes.value_counts()

float64    147
int64        3
Name: count, dtype: int64

In [10]:
t4.isna().sum().sum()

9414

### Data is cleaned properly and ready for machine learning

In [11]:
# check shape and head 
display(t4.shape)
display(t4.head())


(1345, 150)

Unnamed: 0,medication,total_visits,metha_dose_visit_1,bupe_dose_visit_1,metha_dose_visit_2,bupe_dose_visit_2,metha_dose_visit_3,bupe_dose_visit_3,metha_dose_visit_4,bupe_dose_visit_4,metha_dose_visit_5,bupe_dose_visit_5,metha_dose_visit_6,bupe_dose_visit_6,metha_dose_visit_7,bupe_dose_visit_7,metha_dose_visit_8,bupe_dose_visit_8,metha_dose_visit_9,bupe_dose_visit_9,metha_dose_visit_10,bupe_dose_visit_10,metha_dose_visit_11,bupe_dose_visit_11,metha_dose_visit_12,bupe_dose_visit_12,metha_dose_visit_13,bupe_dose_visit_13,metha_dose_visit_14,bupe_dose_visit_14,metha_dose_visit_15,bupe_dose_visit_15,metha_dose_visit_16,bupe_dose_visit_16,metha_dose_visit_17,bupe_dose_visit_17,metha_dose_visit_18,bupe_dose_visit_18,metha_dose_visit_19,bupe_dose_visit_19,metha_dose_visit_20,bupe_dose_visit_20,metha_dose_visit_21,bupe_dose_visit_21,metha_dose_visit_22,bupe_dose_visit_22,metha_dose_visit_23,bupe_dose_visit_23,metha_dose_visit_24,bupe_dose_visit_24,Unnamed: 0_x,sru_alcohol_0,sru_cannabis_0,sru_cocaine_0,sru_amphetamine_0,sru_methamphetamine_0,sru_opiates_0,sru_benzodiazepines_0,sru_propoxyphene_0,sru_methadone_0,sru_oxycodone_0,sru_other_0,sru_alcohol_4,sru_cannabis_4,sru_cocaine_4,sru_amphetamine_4,sru_methamphetamine_4,sru_opiates_4,sru_benzodiazepines_4,sru_propoxyphene_4,sru_methadone_4,sru_oxycodone_4,sru_other_4,sru_alcohol_8,sru_cannabis_8,sru_cocaine_8,sru_amphetamine_8,sru_methamphetamine_8,sru_opiates_8,sru_benzodiazepines_8,sru_propoxyphene_8,sru_methadone_8,sru_oxycodone_8,sru_other_8,sru_alcohol_12,sru_cannabis_12,sru_cocaine_12,sru_amphetamine_12,sru_methamphetamine_12,sru_opiates_12,sru_benzodiazepines_12,sru_propoxyphene_12,sru_methadone_12,sru_oxycodone_12,sru_other_12,sru_alcohol_16,sru_cannabis_16,sru_cocaine_16,sru_amphetamine_16,sru_methamphetamine_16,sru_opiates_16,sru_benzodiazepines_16,sru_propoxyphene_16,sru_methadone_16,sru_oxycodone_16,sru_other_16,sru_alcohol_20,sru_cannabis_20,sru_cocaine_20,sru_amphetamine_20,sru_methamphetamine_20,sru_opiates_20,sru_benzodiazepines_20,sru_propoxyphene_20,sru_methadone_20,sru_oxycodone_20,sru_other_20,sru_alcohol_24,sru_cannabis_24,sru_cocaine_24,sru_amphetamine_24,sru_methamphetamine_24,sru_benzodiazepines_24,sru_propoxyphene_24,sru_methadone_24,sru_oxycodone_24,sru_other_24,Unnamed: 0_y,t_Opiate300_0,t_Opiate300_1,t_Opiate300_2,t_Opiate300_3,t_Opiate300_4,t_Opiate300_5,t_Opiate300_6,t_Opiate300_7,t_Opiate300_8,t_Opiate300_9,t_Opiate300_10,t_Opiate300_11,t_Opiate300_12,t_Opiate300_13,t_Opiate300_14,t_Opiate300_15,t_Opiate300_16,t_Opiate300_17,t_Opiate300_18,t_Opiate300_19,t_Opiate300_20,outcome
0,2.0,24,0.0,160.0,0.0,320.0,0.0,192.0,0.0,384.0,0.0,96.0,0.0,96.0,0.0,352.0,0.0,128.0,0.0,256.0,0.0,256.0,0.0,224.0,0.0,448.0,0.0,32.0,0.0,224.0,0.0,224.0,0.0,240.0,0.0,182.0,0.0,182.0,0.0,240.0,0.0,210.0,0.0,180.0,0.0,246.0,0.0,128.0,0.0,188.0,0,0.0,0.0,0.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,1.0
1,2.0,24,0.0,64.0,0.0,68.0,0.0,84.0,0.0,60.0,0.0,108.0,0.0,84.0,0.0,96.0,0.0,36.0,0.0,96.0,0.0,88.0,0.0,112.0,0.0,104.0,0.0,56.0,0.0,88.0,0.0,160.0,0.0,80.0,0.0,72.0,0.0,56.0,0.0,56.0,0.0,56.0,0.0,80.0,0.0,84.0,0.0,84.0,0.0,68.0,1,0.0,0.0,0.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
2,1.0,24,170.0,0.0,350.0,0.0,420.0,0.0,420.0,0.0,540.0,0.0,310.0,0.0,455.0,0.0,455.0,0.0,480.0,0.0,600.0,0.0,455.0,0.0,560.0,0.0,800.0,0.0,600.0,0.0,360.0,0.0,640.0,0.0,700.0,0.0,700.0,0.0,800.0,0.0,600.0,0.0,765.0,0.0,630.0,0.0,510.0,0.0,715.0,0.0,2,0.0,0.0,23.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,9.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,8.0,0.0,0.0,28.0,1.0,0.0,0.0,0.0,0.0,4.0,0.0,5.0,0.0,1.0,26.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,2.0,24,0.0,248.0,0.0,256.0,0.0,160.0,0.0,96.0,0.0,416.0,0.0,256.0,0.0,224.0,0.0,224.0,0.0,224.0,0.0,224.0,0.0,224.0,0.0,320.0,0.0,160.0,0.0,256.0,0.0,160.0,0.0,320.0,0.0,128.0,0.0,256.0,0.0,192.0,0.0,448.0,0.0,64.0,0.0,160.0,0.0,192.0,0.0,96.0,3,0.0,1.0,2.0,0.0,0.0,30.0,1.0,0.0,0.0,1.0,0.0,1.0,2.0,1.0,0.0,0.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,8.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3,1.0,1.0,0.0,,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,2.0,2,0.0,16.0,0.0,16.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,4,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,25.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,5,1.0,,,,,,,,,,,,,,,,,,,,,0.0


In [13]:
# create list of columns that contain Propoxyphene
p = [col for col in t4.columns if 'Propoxyphene' in col]

In [14]:
# create list of columns that contain sru_other
s = [col for col in t4.columns if 'sru_other' in col]

In [15]:
# merge t and s
to_drop = [*p, *s]

In [16]:
# drop columns
t4.drop(to_drop, axis=1, inplace=True)

In [17]:
t4.shape

(1306, 257)

In [18]:
list(t4.columns)

['metha_dose_visit_0',
 'bupe_dose_visit_0',
 'metha_dose_visit_1',
 'bupe_dose_visit_1',
 'metha_dose_visit_2',
 'bupe_dose_visit_2',
 'metha_dose_visit_3',
 'bupe_dose_visit_3',
 'metha_dose_visit_4',
 'bupe_dose_visit_4',
 'metha_dose_visit_5',
 'bupe_dose_visit_5',
 'metha_dose_visit_6',
 'bupe_dose_visit_6',
 'metha_dose_visit_7',
 'bupe_dose_visit_7',
 'metha_dose_visit_8',
 'bupe_dose_visit_8',
 'metha_dose_visit_9',
 'bupe_dose_visit_9',
 'metha_dose_visit_10',
 'bupe_dose_visit_10',
 'metha_dose_visit_11',
 'bupe_dose_visit_11',
 'metha_dose_visit_12',
 'bupe_dose_visit_12',
 'metha_dose_visit_13',
 'bupe_dose_visit_13',
 'metha_dose_visit_14',
 'bupe_dose_visit_14',
 'metha_dose_visit_15',
 'bupe_dose_visit_15',
 'metha_dose_visit_16',
 'bupe_dose_visit_16',
 'metha_dose_visit_17',
 'bupe_dose_visit_17',
 'metha_dose_visit_18',
 'bupe_dose_visit_18',
 'metha_dose_visit_19',
 'bupe_dose_visit_19',
 'metha_dose_visit_20',
 'bupe_dose_visit_20',
 'metha_dose_visit_21',
 'bupe_do

In [22]:
t4.metha_dose_visit_1.mean(),t4.metha_dose_visit_2.mean()

(134.67457886676877, 171.34073506891272)

In [19]:
# data is ready for machine learning

t4.to_csv('/Users/danherman/Desktop/oud_treatment_outcome/data/clean_data/machine_learning_data/final.csv', index=False)