In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


# Show every column and every row when a DataFrame is displayed (None = no limit).
# NOTE(review): with max_rows unlimited, displaying an unsliced frame prints all of it,
# so slice or use .head() before displaying.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Load the merged CSV into the working DataFrame `f`.
# NOTE(review): this is an absolute path on one machine; adjust it before running elsewhere.
MERGED_CSV_PATH = '/Users/danherman/Desktop/oud_treatment_outcome/data/clean_data/merged.csv'
f = pd.read_csv(MERGED_CSV_PATH)

In [37]:
# Preview the first five rows of the DataFrame.
f.head(5)

Unnamed: 0,patdeid,VISIT,medication,total_dose,admin_location,sru_alcohol,alc_qty,sru_cannabis,sru_cocaine,sru_amphetamine,sru_methamphetamine,sru_opiates,sru_benzodiazepines,sru_propoxyphene,sru_methadone,sru_oxycodone,sru_other,t_alcohol,alc_result,urine_test,refuse_reason,other_reason,urine_temp,supervised,t_Amphetamines,t_Benzodiazapines,t_Methadone,t_Oxycodone,t_Cocaine,t_Methamphetamine,t_Opiate300,t_Cannabinoids,t_Propoxyphene,w_21,w_22,w_23,w_24,c_tests,outcome
0,1,0,2.0,8.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0,0,0,0,0.0
1,1,1,2.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0,0,0,0,0.0
2,1,2,2.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0
3,1,3,2.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0
4,1,4,2.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,0,0,0,0.0


Before we do anything else, we must create the target variable.  A patient must meet two conditions to have reached the desired outcome:<br>

1.  Complete 24 weeks of treatment (this is validated by looking at the final 4 tests)<br>
2.  Show 4 consecutive clean urine tests for opiates at visits 21-24

In [4]:
# Per-row flags for a clean opiate test at each of the final four visits:
# w_<visit> is 1 only on rows where VISIT equals that visit and t_Opiate300 is 0,
# and 0 on every other row.
opiate_negative = f['t_Opiate300'] == 0
for final_visit in (21, 22, 23, 24):
    at_visit = f['VISIT'] == final_visit
    f[f'w_{final_visit}'] = np.where(at_visit & opiate_negative, 1, 0)

In [5]:
# create function that provides random sample from patdeid column, that takes df as input

def random_sample(df, random_state=None):
    """Return one patient id drawn at random from ``df['patdeid']``.

    The draw is made over rows, so an id that appears on more rows is
    proportionally more likely to be picked.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``patdeid`` column.
    random_state : int, numpy Generator/RandomState or None, default None
        Passed through to ``Series.sample``. Supply a value to get the same
        patient on every run; ``None`` keeps the unseeded behaviour.

    Returns
    -------
    scalar
        A single value taken from the ``patdeid`` column.

    Raises
    ------
    ValueError
        Raised by ``Series.sample`` when ``df`` has no rows.
    """
    return df['patdeid'].sample(n=1, random_state=random_state).iloc[0]

In [6]:
# Snapshot of one randomly chosen patient (a different patient on each run):
# medication, dose, opiate test result and the final-week flags, first 25 rows.
flag_snapshot_cols = ['patdeid', 'VISIT', 'medication', 'total_dose', 't_Opiate300',
                      'w_21', 'w_22', 'w_23', 'w_24']
flag_snapshot_id = random_sample(f)
f.loc[f['patdeid'] == flag_snapshot_id, flag_snapshot_cols].head(25)

Unnamed: 0,patdeid,VISIT,medication,total_dose,t_Opiate300,w_21,w_22,w_23,w_24
2770,227,0,1.0,30.0,1.0,0,0,0,0
2771,227,1,1.0,50.0,1.0,0,0,0,0
2772,227,2,1.0,70.0,0.0,0,0,0,0
2773,227,3,1.0,90.0,0.0,0,0,0,0
2774,227,4,1.0,90.0,1.0,0,0,0,0
2775,227,5,1.0,90.0,1.0,0,0,0,0
2776,227,6,1.0,90.0,1.0,0,0,0,0
2777,227,7,1.0,90.0,1.0,0,0,0,0
2778,227,8,1.0,90.0,1.0,0,0,0,0
2779,227,9,1.0,90.0,0.0,0,0,0,0


In [7]:
# Create column 'c_tests': the row-wise sum of the w_21, w_22, w_23 and w_24 flags.
# Each flag is tied to a different VISIT value, so at most one of them is 1 on any row.
f['c_tests'] = f[['w_21', 'w_22', 'w_23', 'w_24']].sum(axis=1)

In [8]:
# Random patient snapshot, now including the row-level 'c_tests' count (first 25 rows).
c_tests_snapshot_cols = ['patdeid', 'VISIT', 'medication', 'total_dose', 't_Opiate300',
                         'w_21', 'w_22', 'w_23', 'w_24', 'c_tests']
c_tests_snapshot_id = random_sample(f)
f.loc[f['patdeid'] == c_tests_snapshot_id, c_tests_snapshot_cols].head(25)

Unnamed: 0,patdeid,VISIT,medication,total_dose,t_Opiate300,w_21,w_22,w_23,w_24,c_tests
12230,1015,0,1.0,30.0,1.0,0,0,0,0,0
12231,1015,1,1.0,40.0,1.0,0,0,0,0,0
12232,1015,2,1.0,70.0,0.0,0,0,0,0,0
12233,1015,3,1.0,80.0,0.0,0,0,0,0,0
12234,1015,4,1.0,80.0,0.0,0,0,0,0,0
12235,1015,5,1.0,70.0,0.0,0,0,0,0,0
12236,1015,6,1.0,70.0,1.0,0,0,0,0,0
12237,1015,7,1.0,70.0,1.0,0,0,0,0,0
12238,1015,8,1.0,70.0,1.0,0,0,0,0,0
12239,1015,9,1.0,70.0,0.0,0,0,0,0,0


In [9]:
# Mark the treatment outcome on each patient's VISIT == 24 row:
# 1 when the patient's 'c_tests' values sum to 4 (a clean flag at each of
# visits 21, 22, 23 and 24), otherwise 0.
# A grouped transform gives every row its patient's total in one pass; the
# earlier per-patient loop re-filtered the whole frame once for every id.
# Rows for other visits are not assigned here.
clean_tests_per_patient = f.groupby('patdeid')['c_tests'].transform('sum')
final_visit_rows = f['VISIT'] == 24
f.loc[final_visit_rows, 'outcome'] = (
    clean_tests_per_patient[final_visit_rows].eq(4).astype(int)
)

In [12]:
# Rows that have no outcome value are NaN; treat them as 0.
# Assign the result back to the column: fillna(inplace=True) called on `f.outcome`
# acts on an intermediate Series and, under pandas Copy-on-Write, leaves `f` unchanged.
f['outcome'] = f['outcome'].fillna(0)

In [36]:
# Random patient snapshot, now including the 'outcome' column (first 25 rows).
outcome_snapshot_cols = ['patdeid', 'VISIT', 'medication', 'total_dose', 't_Opiate300',
                         'w_21', 'w_22', 'w_23', 'w_24', 'c_tests', 'outcome']
outcome_snapshot_id = random_sample(f)
f.loc[f['patdeid'] == outcome_snapshot_id, outcome_snapshot_cols].head(25)

Unnamed: 0,patdeid,VISIT,medication,total_dose,t_Opiate300,w_21,w_22,w_23,w_24,c_tests,outcome
16092,1328,0,2.0,8.0,1.0,0,0,0,0,0,0.0
16093,1328,1,2.0,16.0,0.0,0,0,0,0,0,0.0
16094,1328,2,2.0,16.0,0.0,0,0,0,0,0,0.0
16095,1328,3,2.0,12.0,0.0,0,0,0,0,0,0.0
16096,1328,4,2.0,12.0,0.0,0,0,0,0,0,0.0
16097,1328,5,2.0,12.0,0.0,0,0,0,0,0,0.0
16098,1328,6,2.0,12.0,0.0,0,0,0,0,0,0.0
16099,1328,7,2.0,12.0,0.0,0,0,0,0,0,0.0
16100,1328,8,2.0,12.0,0.0,0,0,0,0,0,0.0
16101,1328,9,2.0,12.0,0.0,0,0,0,0,0,0.0


In [None]:
# Create the per-patient column 'target', repeated on every row of that patient:
# 1 if any of the patient's rows has outcome == 1, otherwise 0.
# 'outcome' is a 0/1 flag, so its per-patient sum counts successful rows and is
# not a value to compare with 4. Testing the sum against 0 keeps the target
# binary even if a patient has more than one row carrying outcome == 1.
f['target'] = f.groupby('patdeid')['outcome'].transform('sum').gt(0).astype(int)

In [None]:
# Random patient snapshot including the per-patient 'target' (first 25 rows).
# The final-week flag columns are named w_21..w_24; the w_21_test..w_24_test names
# selected here before are not created by any cell shown in this notebook, and
# .loc raises KeyError for missing column labels.
target_snapshot_cols = ['patdeid', 'VISIT', 'medication', 'total_dose', 't_Opiate300',
                        'w_21', 'w_22', 'w_23', 'w_24', 'target']
f.loc[f['patdeid'] == random_sample(f), target_snapshot_cols].head(25)

In [None]:
# Bar chart of the summed t_Opiate300 values at each visit.
opiate_positives_by_visit = f.groupby('VISIT').agg({'t_Opiate300': 'sum'})
opiate_positives_by_visit.plot(
    kind='bar',
    figsize=(15, 5),
    legend=False,
    title='Positive Opiate Tests',
    xlabel='Week in Treatment',
    ylabel='Number of Positive Tests',
);

In [None]:
# List every column currently on the DataFrame.
f.columns

In [None]:
# Line chart of the summed alcohol, opiate and cannabinoid test columns at each visit.
# The title names all three plotted series; it previously mentioned only opiate
# and alcohol although cannabinoids are drawn as well.
substance_positives_by_visit = f.groupby('VISIT').agg({'t_alcohol': 'sum',
                                                       't_Opiate300': 'sum',
                                                       't_Cannabinoids': 'sum'})
substance_positives_by_visit.plot(
    kind='line',
    figsize=(15, 5),
    legend=True,
    title='Positive Alcohol, Opiate and Cannabinoid Tests',
    xlabel='Week in Treatment',
    ylabel='Number of Positive Tests',
);

In [None]:
# Stacked horizontal bars for patient 717: summed total_dose and t_Opiate300 per visit.
patient_717_rows = f.loc[f['patdeid'] == 717]
patient_717_by_visit = patient_717_rows.groupby('VISIT').agg({'total_dose': 'sum',
                                                              't_Opiate300': 'sum'})
patient_717_by_visit.plot(
    kind='barh',
    stacked=True,
    figsize=(6, 5),
    title='Opiate Dose vs. Positive Tests',
    legend=False,
    xlabel='Medication Dose & Opioid Tests',
    ylabel='Week in Treatment',
);

In [None]:
# Two panels side by side on one figure:
#   left  - patient 1875, summed total_dose and t_Opiate300 per visit (stacked horizontal bars)
#   right - whole dataset, summed t_Opiate300 per visit (vertical bars)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,5))

patient_1875_by_visit = (
    f.loc[f['patdeid'] == 1875]
    .groupby('VISIT')
    .agg({'total_dose': 'sum', 't_Opiate300': 'sum'})
)
patient_1875_by_visit.plot(
    kind='barh',
    stacked=True,
    title='Patient 1875 - Med Dose and Positive Opioid Tests',
    ylabel='Week in Treatment',
    xlabel='Medication Dose',
    legend=False,
    ax=ax1,
);

population_by_visit = f.groupby('VISIT').agg({'t_Opiate300': 'sum'})
population_by_visit.plot(
    kind='bar',
    figsize=(15, 5),
    legend=False,
    title='Total Population - Positive Opioid Tests',
    xlabel='Week in Treatment',
    ylabel='Number of Positive Tests',
    ax=ax2,
);


In [None]:
# Snapshot of one randomly chosen patient (a different patient on each run):
# medication, dose and opiate test result for the first 25 rows.
basic_snapshot_cols = ['patdeid', 'VISIT', 'medication', 'total_dose', 't_Opiate300']
basic_snapshot_id = random_sample(f)
f.loc[f['patdeid'] == basic_snapshot_id, basic_snapshot_cols].head(25)

In [None]:
# Create columns week_21_clean .. week_24_clean.
# week_<visit>_clean is 1 on rows where VISIT equals that visit and t_Opiate300 is 0,
# and 0 on every other row.
no_opiate_detected = f['t_Opiate300'] == 0
for visit_number in (21, 22, 23, 24):
    on_this_visit = f['VISIT'] == visit_number
    f[f'week_{visit_number}_clean'] = np.where(on_this_visit & no_opiate_detected, 1, 0)

In [None]:
# List the columns again to confirm the week_*_clean columns are present.
f.columns

In [None]:
# Narrow view of `f`: patient id, visit, three t_* test columns and the four
# week_*_clean flags.
test_view_cols = [
    'patdeid', 'VISIT',
    't_Opiate300', 't_Cannabinoids', 't_Propoxyphene',
    'week_21_clean', 'week_22_clean', 'week_23_clean', 'week_24_clean',
]
tests = f.loc[:, test_view_cols]

In [None]:
# First 25 rows of the narrow view that belong to visit 24.
tests[tests['VISIT'] == 24].head(25)

In [None]:
# Patient-level treatment outcome, written to every row of that patient:
# 1 when the patient has a clean flag at each of visits 21, 22, 23 and 24, else 0.
# Each week_*_clean flag is set only on its own visit's row, so the four flags can
# never all be 1 on the same row; AND-ing them row by row always gave 0.
# Collapse each flag to its per-patient maximum first, then require all four.
final_week_flags = ['week_21_clean', 'week_22_clean', 'week_23_clean', 'week_24_clean']
clean_by_patient = f.groupby('patdeid')[final_week_flags].transform('max')
f['treatment_outcome'] = clean_by_patient.eq(1).all(axis=1).astype(int)

In [None]:
# List the columns again to confirm 'treatment_outcome' is present.
f.columns

In [None]:
# Column list used for filtering in the next cells: 'patdeid' first, then every
# column whose name starts with 't_'.
# Match on the prefix: the substring check ('t_' in col) also matched
# 'treatment_outcome', which is not one of the t_* test columns.
tests = ['patdeid'] + [col for col in f.columns if col.startswith('t_')]

# Boolean row masks, one for each of the final four visits of treatment.
week_21, week_22, week_23, week_24 = (f['VISIT'] == visit for visit in (21, 22, 23, 24))

In [None]:
# Display the list of column names that will be used for filtering.
tests

In [None]:
# Review the final weeks of treatment (21, 22, 23, 24) one visit at a time.
# Shown here: the rows selected by the week_21 mask, restricted to the columns
# in `tests`, first 25 rows. Swap in week_22, week_23 or week_24 for the other visits.
f.loc[week_21, tests].head(25)

In [None]:
# How many rows carry each value (0 or 1) of the week_21_clean flag.
f['week_21_clean'].value_counts()