In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


# Lift pandas' display truncation so every column and row is rendered.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [2]:
# Load the merged dataset from disk into the working frame `f`.
merged_csv_path = '/Users/danherman/Desktop/oud_treatment_outcome/data/clean_data/merged.csv'

f = pd.read_csv(merged_csv_path)

In [4]:
# Show the (rows, columns) shape, then preview the first five rows.
display(f.shape)
f.head(5)

(23528, 23)

Unnamed: 0,patdeid,VISIT,medication,total_dose,sru_alcohol,sru_cannabis,sru_cocaine,sru_amphetamine,sru_methamphetamine,sru_opiates,sru_benzodiazepines,sru_methadone,sru_oxycodone,sru_other,t_Amphetamines,t_Benzodiazapines,t_Methadone,t_Oxycodone,t_Cocaine,t_Methamphetamine,t_Opiate300,t_Cannabinoids,t_Propoxyphene
0,1,0,2.0,8.0,0.0,0.0,0.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1,1,2.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,2,2.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,3,2.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,4,2.0,32.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


Before doing anything else, we must create the target variable.  A patient must meet two conditions to count as having reached the desired outcome:<br>

1.  Complete 24 weeks of treatment (validated by checking the final 4 urine tests)<br>
2.  Show 4 consecutive opiate-negative urine tests at visits 21-24

In [5]:
# Build the binary target column `outcome`.
#
# A patient is labelled a success (1.0) when exactly four of their rows fall
# on the final visits (21-24) with a negative opiate urine test
# (t_Opiate300 == 0). The label is written only on that patient's VISIT == 0
# row; every other row is 0.0, so filter on VISIT == 0 to read the
# per-patient outcome.
FINAL_VISITS = [21, 22, 23, 24]

# Row-level flag: a final-window visit with a clean opiate test.
clean_final_visit = f['VISIT'].isin(FINAL_VISITS) & f['t_Opiate300'].eq(0)

# Per-patient count of flagged rows, broadcast back onto each of the patient's
# rows. One grouped pass replaces rescanning the whole frame once per patient.
clean_final_count = (
    clean_final_visit.astype(int).groupby(f['patdeid']).transform('sum')
)

# All four final visits must be clean; the flag lives on the baseline row only.
# Casting the boolean mask to float yields 0.0 / 1.0 directly, with no
# None/NaN placeholders to patch up afterwards.
met_criteria = clean_final_count.eq(len(FINAL_VISITS))
f['outcome'] = (met_criteria & f['VISIT'].eq(0)).astype(float)

In [6]:
# Distribution of the outcome label across every row of the frame.
f['outcome'].value_counts()

outcome
0.0    23129
1.0      399
Name: count, dtype: int64

In [7]:
# Restrict to VISIT == 0 rows, where the label is recorded, to see the
# true outcome distribution.
is_first_visit = f['VISIT'].eq(0)
f.loc[is_first_visit, 'outcome'].value_counts()

outcome
0.0    915
1.0    399
Name: count, dtype: int64

In [8]:
# helper to draw a random patient ID from the patdeid column
def random_sample(df, random_state=None):
    """Return one randomly drawn value from the ``patdeid`` column of ``df``.

    The draw is made over rows, not over distinct IDs, so an ID that appears
    on more rows is proportionally more likely to be returned.

    Parameters
    ----------
    df : DataFrame
        Frame containing a ``patdeid`` column.
    random_state : optional
        Forwarded to ``Series.sample``. Pass an int (or other seed accepted
        by pandas) to make the draw reproducible; the default ``None`` keeps
        the original behaviour of a fresh random draw on every call.

    Returns
    -------
    The sampled ``patdeid`` value (a single scalar).
    """
    return df['patdeid'].sample(n=1, random_state=random_state).values[0]

In [16]:
# Spot-check the `outcome` target against the raw visit data.
# Each run of this cell displays a different, randomly chosen patient;
# re-run it several times to check accuracy and look for patterns.
profile_columns = ['patdeid', 'VISIT', 'medication', 'total_dose', 't_Opiate300', 'outcome']
belongs_to_patient = f['patdeid'] == random_sample(f)

f.loc[belongs_to_patient, profile_columns].head(25)

Unnamed: 0,patdeid,VISIT,medication,total_dose,t_Opiate300,outcome
2911,237,0,2.0,8.0,1.0,0.0
2912,237,1,2.0,8.0,1.0,0.0
2913,237,2,2.0,12.0,1.0,0.0
2914,237,3,2.0,12.0,1.0,0.0
2915,237,4,2.0,12.0,1.0,0.0
2916,237,5,2.0,12.0,1.0,0.0
2917,237,6,2.0,12.0,1.0,0.0
2918,237,7,2.0,12.0,1.0,0.0
2919,237,8,2.0,12.0,1.0,0.0
2920,237,9,2.0,12.0,1.0,0.0


The target variable `outcome` has been created. Note that it is recorded only on each patient's `VISIT == 0` row; every other row is set to 0.<br>
<br>
Next, we need to differentiate between patients who completed 24 weeks of treatment and those who dropped out before week 24.<br>

In [17]:
# Persist the labelled frame (without the index) for the next step,
# one-hot encoding.
fe_clean_csv_path = '/Users/danherman/Desktop/oud_treatment_outcome/data/clean_data/fe_clean.csv'

f.to_csv(fe_clean_csv_path, index=False)