In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


# lift pandas' display truncation so every column and row is rendered in cell output
for _display_opt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_opt, None)

In [2]:
# load the merged csv into the working DataFrame `f`

merged_csv_path = '/Users/danherman/Desktop/oud_treatment_outcome/data/clean_data/merged.csv'
f = pd.read_csv(merged_csv_path)

In [3]:
f.head(5)

Unnamed: 0,patdeid,VISIT,medication,total_dose,admin_location,sru_alcohol,alc_qty,sru_cannabis,sru_cocaine,sru_amphetamine,sru_methamphetamine,sru_opiates,sru_benzodiazepines,sru_propoxyphene,sru_methadone,sru_oxycodone,sru_other,t_alcohol,alc_result,urine_test,refuse_reason,other_reason,urine_temp,supervised,t_Amphetamines,t_Benzodiazapines,t_Methadone,t_Oxycodone,t_Cocaine,t_Methamphetamine,t_Opiate300,t_Cannabinoids,t_Propoxyphene
0,1,0,2.0,8.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1,1,2.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,2,2.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,3,2.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,4,2.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


Before we do anything else, we must create the target variable.  A patient must meet 2 conditions to have reached the desired outcomes, listed as follows:<br>

1.  Complete 24 weeks of treatment (this is validated by looking at the final 4 tests)<br>
2.  Show 4 consecutive opiate-negative urine tests at visits 21–24

In [4]:
# Build the target variable 'outcome'.
# A patient is labelled 1.0 when exactly four of their rows satisfy
# VISIT in {21, 22, 23, 24} AND t_Opiate300 == 0; otherwise 0.0.
# The label is written only on the patient's VISIT == 0 row; every other row is 0.0.
FINAL_VISITS = [21, 22, 23, 24]

# rows that are an opiate-negative test inside the final-visit window
clean_final_test = f['VISIT'].isin(FINAL_VISITS) & (f['t_Opiate300'] == 0)

# per-patient count of those rows, broadcast back onto every row of that patient
# (replaces a Python loop that rescanned the whole frame once per patient)
n_clean_final = clean_final_test.groupby(f['patdeid']).transform('sum')

# float dtype keeps the 0.0 / 1.0 values the rest of the notebook expects
f['outcome'] = ((n_clean_final == len(FINAL_VISITS)) & (f['VISIT'] == 0)).astype(float)

In [5]:
f['outcome'].value_counts()

outcome
0.0    23139
1.0      389
Name: count, dtype: int64

In [6]:
f.loc[f['VISIT'] == 0, 'outcome'].value_counts()

outcome
0.0    925
1.0    389
Name: count, dtype: int64

In [7]:
# create function to call random patient ID from patdeid column
def random_sample(df, random_state=None):
    """Return one patient ID drawn at random from ``df['patdeid']``.

    The draw is made over rows of the column, so an ID that appears on more
    rows is proportionally more likely to be returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``patdeid`` column.
    random_state : optional
        Forwarded to ``Series.sample``; pass a fixed value to make the draw
        reproducible. The default (``None``) keeps the original unseeded behaviour.

    Returns
    -------
    A single value taken from ``df['patdeid']``.
    """
    return df['patdeid'].sample(n=1, random_state=random_state).values[0]

In [11]:
# spot-check the target variable 'outcome' on one randomly drawn patient
outcome_check_cols = ['patdeid','VISIT','medication','total_dose','t_Opiate300','outcome']
f.loc[f['patdeid'] == random_sample(f), outcome_check_cols].head(25)

Unnamed: 0,patdeid,VISIT,medication,total_dose,t_Opiate300,outcome
1718,147,0,1.0,30.0,1.0,1.0
1719,147,1,1.0,40.0,1.0,0.0
1720,147,2,1.0,60.0,1.0,0.0
1721,147,3,1.0,65.0,1.0,0.0
1722,147,4,1.0,75.0,1.0,0.0
1723,147,5,1.0,85.0,0.0,0.0
1724,147,6,1.0,85.0,1.0,0.0
1725,147,7,1.0,95.0,1.0,0.0
1726,147,8,1.0,105.0,1.0,0.0
1727,147,9,1.0,105.0,0.0,0.0


In [18]:
# write the current state of `f` to disk (row index is not written)
intermediate_csv_path = '/Users/danherman/Desktop/oud_treatment_outcome/data/clean_data/13sep23.csv'
f.to_csv(intermediate_csv_path, index=False)

In [None]:
# histogram of total dose with dashed reference lines at the mean (black) and median (orange)

fig, ax = plt.subplots(figsize=(6,4))
dose = f['total_dose']
dose.plot(kind='hist', bins=25, ax=ax)
ax.set_title('Distribution of Total Dose')
ax.set_xlabel('Total Dose')
ax.set_ylabel('Frequency')
for stat_value, line_color in ((dose.mean(), 'black'), (dose.median(), 'orange')):
    ax.axvline(stat_value, color=line_color, linestyle='dashed', linewidth=2)
plt.show()





Target variable 'outcome' successfully created<br>
<br>
Next, we need to differentiate between patients who completed 24 weeks of treatment and those who dropped out before week 24<br>

In [None]:
# create new column 'c_t' for completed treatment
# a patient is marked complete (1.0) when they have exactly 25 non-null VISIT rows,
# otherwise 0.0 (presumably one row per visit 0 through 24 -- confirm against the data)
EXPECTED_VISIT_ROWS = 25

# per-patient count of VISIT rows, broadcast back onto every row of that patient
# (replaces a Python loop that rescanned the whole frame once per patient)
visit_rows_per_patient = f.groupby('patdeid')['VISIT'].transform('count')

# float dtype so later filters such as `f.c_t == 1.0` keep working unchanged
f['c_t'] = (visit_rows_per_patient == EXPECTED_VISIT_ROWS).astype(float)

In [None]:
# spot-check the 'c_t' feature on one randomly drawn patient
c_t_check_cols = ['patdeid','VISIT','medication','total_dose','t_Opiate300','outcome','c_t']
f.loc[f['patdeid'] == random_sample(f), c_t_check_cols].head(25)

In [None]:
f['c_t'].value_counts(normalize=True)

In [None]:
# create pie chart for treatment outcomes among patients who completed treatment

# 'outcome' is only set to 1.0 on a patient's VISIT == 0 row, so restrict to those rows:
# this counts patients rather than visit rows (otherwise each success is diluted by
# the patient's remaining rows, which all carry 0.0)
completer_outcomes = (
    f.loc[(f['c_t'] == 1.0) & (f['VISIT'] == 0), 'outcome']
     .value_counts()
     # fix the slice order so the labels below always match the values they describe,
     # regardless of which outcome is more frequent
     .reindex([0.0, 1.0], fill_value=0)
)

fig, ax = plt.subplots(figsize=(6,4))
completer_outcomes.plot(kind='pie',
                        ax=ax,
                        title='Treatment Outcome',
                        autopct='%1.1f%%',
                        labels=['Negative Outcome','Positive Outcome'],
                        colors=['#ff9999','#66b3ff'],
                        textprops={'fontsize': 14})
# drop the axis label pandas adds from the series name
ax.set_ylabel('');
                                                

In [None]:
# bar chart: sum of t_Opiate300 per visit across the whole patient population
opiate_positives_by_visit = f.groupby('VISIT').agg({'t_Opiate300':'sum'})

opiate_positives_by_visit.plot(
    kind='bar',
    figsize=(20,5),
    legend=False,
    title='Positive Opiate Tests',
    xlabel='Week in Treatment',
    ylabel='Number of Positive Tests',
);

In [None]:
# lineplot comparing positive alcohol and cannabinoid tests to positive opiate tests

positives_by_visit = f.groupby('VISIT').agg({'t_alcohol':'sum',
                                             't_Opiate300':'sum',
                                             't_Cannabinoids':'sum'})

# the title names all three plotted series (it previously mentioned only opiates and alcohol)
positives_by_visit.plot(kind='line',
                        figsize=(15,5),
                        legend=True,
                        title='Positive Opiate vs. Alcohol vs. Cannabinoid Tests',
                        xlabel='Week in Treatment',
                        ylabel='Number of Positive Tests');

In [None]:
# every time you run this cell, you will get a random patient sample
# this is a snapshot of that patient's dose and opiate test results by visit

# draw the patient ONCE and reuse the ID for both the filter and the title;
# calling random_sample() twice would title the chart with a different patient
# than the one actually plotted
patient_id = random_sample(f)

patient_by_visit = (
    f.loc[f['patdeid'] == patient_id]
     .groupby('VISIT')
     .agg({'total_dose':'sum', 't_Opiate300':'sum'})
)

patient_by_visit.plot(
    kind='barh', stacked=True, figsize=(8,6),
    title=f'Patient {patient_id} - Medication Dose vs. Positive Tests',
    legend=False, xlabel='Medication Dose & Opioid Tests',
    ylabel='Week in Treatment');

In [None]:
# create 2 plots next to each other, one random patient per panel

def _plot_patient_snapshot(patient_id, ax):
    """Draw a stacked horizontal bar chart of one patient's per-visit sums of
    total_dose and t_Opiate300 onto ``ax``, titled with that patient's ID."""
    (f.loc[f['patdeid'] == patient_id]
      .groupby('VISIT')
      .agg({'total_dose':'sum', 't_Opiate300':'sum'})
      .plot(kind='barh', stacked=True,
            title=f'Patient {patient_id} - Medication Dose vs. Positive Tests',
            legend=False, xlabel='Medication Dose & Opioid Tests',
            ylabel='Week in Treatment', ax=ax))


fig, (ax1, ax2) = plt.subplots(1,2, figsize=(20,5))

# each patient is drawn exactly once, so the ID in the title is always the ID
# that was plotted (sampling again for the title would name a different patient)
for panel in (ax1, ax2):
    _plot_patient_snapshot(random_sample(f), panel)



In [None]:
# plot 2 plots next to each other: outcome pie (completers) and weekly positive opiate tests

fig, (ax1, ax2) = plt.subplots(1,2, figsize=(24,6))

# left panel: treatment outcome among patients who completed treatment
# 'outcome' is only set to 1.0 on a patient's VISIT == 0 row, so restrict to those rows
# to count patients rather than visit rows
completer_outcomes = (
    f.loc[(f['c_t'] == 1.0) & (f['VISIT'] == 0), 'outcome']
     .value_counts()
     # fixed order so the labels below always match the values they describe
     .reindex([0.0, 1.0], fill_value=0)
)

completer_outcomes.plot(kind='pie',
                        title='Treatment Outcome',
                        autopct='%1.1f%%',
                        ylabel='',
                        labels=['Negative Outcome','Positive Outcome'],
                        colors=['#ff9999','#66b3ff'],
                        textprops={'fontsize': 14},
                        ax=ax1);

# right panel: bar chart of positive opiate tests per visit for the total population
f.groupby('VISIT').agg({'t_Opiate300':'sum'}).plot(kind='bar',
                                                   legend=False,
                                                   title='Total Population - Positive Opioid Tests',
                                                   xlabel='Week in Treatment',
                                                   ylabel='Number of Positive Tests',
                                                   ax=ax2);


In [21]:
# save to csv for next step, one hot encoding (row index is not written)

final_csv_path = '/Users/danherman/Desktop/oud_treatment_outcome/data/clean_data/12SEP23.csv'
f.to_csv(final_csv_path, index=False)