In [2]:
import numpy as np  
import matplotlib.pyplot as plt  
import pandas as pd 
from apyori import apriori

In [4]:
# Load the raw table from dataset.csv and discard its first column
# (presumably a row-number column left over from an earlier export -- confirm).
main_dataset = pd.read_csv("dataset.csv")
main_dataset = main_dataset.drop(columns=main_dataset.columns[0])
main_dataset

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,citoglipton,insulin,glyburide.metformin,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,readmitted
0,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,Pediatrics-Endocrinology,...,No,Up,No,No,No,No,No,Ch,Yes,>30
1,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,Cardiology,...,No,No,No,No,No,No,No,No,Yes,NO
2,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,InternalMedicine,...,No,Up,No,No,No,No,No,Ch,Yes,NO
3,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,InternalMedicine,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
4,35754,82637451,Caucasian,Male,[50-60),2,1,2,3,Radiology,...,No,Steady,No,No,No,No,No,No,Yes,>30
5,55842,84259809,Caucasian,Male,[60-70),3,1,2,4,Surgery-Cardiovascular/Thoracic,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
6,63768,114882984,Caucasian,Male,[70-80),1,1,7,5,InternalMedicine,...,No,No,No,No,No,No,No,No,Yes,>30
7,12522,48330783,Caucasian,Female,[80-90),2,1,4,13,Cardiology,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
8,15738,63555939,Caucasian,Female,[90-100),3,3,4,12,InternalMedicine,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
9,28236,89869032,AfricanAmerican,Female,[40-50),1,1,7,9,Surgery-General,...,No,Steady,No,No,No,No,No,No,Yes,>30


## Split the dataset by race

In [5]:
# Partition the encounters by the value of the `race` column.
asian_data = main_dataset.loc[main_dataset['race'] == 'Asian']
white_data = main_dataset.loc[main_dataset['race'] == 'Caucasian']
# Labels must match the data exactly: the dataset spells this group
# 'AfricanAmerican'. A misspelled label matches no rows and would silently
# leave the whole group out of `minor_data`.
minorities = ['AfricanAmerican', 'Hispanic', 'Other']
minor_data = main_dataset.loc[main_dataset['race'].isin(minorities)]

## Percentage of patients who were prescribed diabetes medication

In [6]:
# Relative frequency of each `diabetesMed` value across all encounters
# (normalize=True returns fractions rather than raw counts).
prescribed_med = (
    main_dataset['diabetesMed']
    .value_counts(normalize=True)
)
prescribed_med

Yes    0.768472
No     0.231528
Name: diabetesMed, dtype: float64

## Combinations of drugs associated with a lower chance of readmission

In [7]:
# Keep only the encounters whose `readmitted` value is exactly 'NO'.
no_readmin = main_dataset.loc[main_dataset['readmitted'].eq('NO')]
no_readmin

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,citoglipton,insulin,glyburide.metformin,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,readmitted
1,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,Cardiology,...,No,No,No,No,No,No,No,No,Yes,NO
2,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,InternalMedicine,...,No,Up,No,No,No,No,No,Ch,Yes,NO
3,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,InternalMedicine,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,55842,84259809,Caucasian,Male,[60-70),3,1,2,4,Surgery-Cardiovascular/Thoracic,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
7,12522,48330783,Caucasian,Female,[80-90),2,1,4,13,Cardiology,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
8,15738,63555939,Caucasian,Female,[90-100),3,3,4,12,InternalMedicine,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
12,42570,77586282,Caucasian,Male,[80-90),1,6,7,10,Family/GeneralPractice,...,No,Steady,No,No,No,No,No,No,Yes,NO
14,73578,86328819,AfricanAmerican,Male,[60-70),1,3,7,12,Pulmonology,...,No,Up,No,No,No,No,No,Ch,Yes,NO
16,84222,108662661,Caucasian,Female,[50-60),1,1,7,3,Cardiology,...,No,No,No,No,No,No,No,No,Yes,NO
18,182796,63000108,AfricanAmerican,Female,[70-80),2,1,4,2,Family/GeneralPractice,...,No,No,No,No,No,No,No,No,No,NO


In [8]:
# Relative frequency of each `diabetesMed` value among the non-readmitted
# encounters (fractions, not counts).
no_readmin_prescribed = (
    no_readmin['diabetesMed']
    .value_counts(normalize=True)
)
no_readmin_prescribed

Yes    0.743934
No     0.256066
Name: diabetesMed, dtype: float64

In [9]:
# Restrict the non-readmitted encounters to the per-drug columns only.
# (Line continuations are unnecessary inside a bracketed list literal.)
med_cols = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
    'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
    'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide.metformin',
    'glipizide.metformin', 'glimepiride.pioglitazone', 'metformin.rosiglitazone',
    'metformin.pioglitazone',
]
no_readmin_cleaned = no_readmin[med_cols]
no_readmin_cleaned

Unnamed: 0,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,...,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide.metformin,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone
1,No,No,No,No,No,No,Steady,No,No,No,...,No,No,No,No,No,No,No,No,No,No
2,No,No,No,No,No,No,No,No,No,No,...,No,No,No,No,Up,No,No,No,No,No
3,No,No,No,No,No,No,Steady,No,No,No,...,No,No,No,No,Steady,No,No,No,No,No
5,Steady,No,No,No,Steady,No,No,No,No,No,...,No,No,No,No,Steady,No,No,No,No,No
7,No,No,No,No,No,No,Steady,No,No,No,...,No,No,No,No,Steady,No,No,No,No,No
8,No,No,No,No,No,No,No,No,No,No,...,No,No,No,No,Steady,No,No,No,No,No
12,No,No,No,No,No,No,No,No,No,No,...,No,No,No,No,Steady,No,No,No,No,No
14,No,No,No,No,No,No,No,No,No,No,...,No,No,No,No,Up,No,No,No,No,No
16,No,No,No,No,No,No,No,Steady,No,No,...,No,No,No,No,No,No,No,No,No,No
18,No,No,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No


In [10]:
# Replace every 'Steady' / 'Up' / 'Down' cell with the name of its own column,
# so that each remaining non-'No' cell carries the drug's name.
#
# Rows yielded by DataFrame.iterrows() may be copies, so assigning to them is
# not guaranteed to modify the frame, and looping rows x columns in Python is
# very slow. A boolean mask per column makes the same substitution reliably.
taken_levels = ['Steady', 'Up', 'Down']
# Work on an explicit copy so the .loc assignment below never targets a slice
# of `no_readmin`.
no_readmin_cleaned = no_readmin_cleaned.copy()
for col in no_readmin_cleaned.columns:
    print(col)
    was_taken = no_readmin_cleaned[col].isin(taken_levels)
    no_readmin_cleaned.loc[was_taken, col] = col
    print('iterated through whole col')

print('new dataset created')

metformin
iterated through whole col
repaglinide
iterated through whole col
nateglinide
iterated through whole col
chlorpropamide
iterated through whole col
glimepiride
iterated through whole col
acetohexamide
iterated through whole col
glipizide
iterated through whole col
glyburide
iterated through whole col
tolbutamide
iterated through whole col
pioglitazone
iterated through whole col
rosiglitazone
iterated through whole col
acarbose
iterated through whole col
miglitol
iterated through whole col
troglitazone
iterated through whole col
tolazamide
iterated through whole col
examide
iterated through whole col
citoglipton
iterated through whole col
insulin
iterated through whole col
glyburide.metformin
iterated through whole col
glipizide.metformin
iterated through whole col
glimepiride.pioglitazone
iterated through whole col
metformin.rosiglitazone
iterated through whole col
metformin.pioglitazone
iterated through whole col
new dataset created


In [11]:
# Blank out every cell equal to 'No' by substituting NaN.
no_readmin_cleaned = no_readmin_cleaned.replace(to_replace='No', value=np.nan)
no_readmin_cleaned

Unnamed: 0,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,...,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide.metformin,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone
1,,,,,,,glipizide,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,insulin,,,,,
3,,,,,,,glipizide,,,,...,,,,,insulin,,,,,
5,metformin,,,,glimepiride,,,,,,...,,,,,insulin,,,,,
7,,,,,,,glipizide,,,,...,,,,,insulin,,,,,
8,,,,,,,,,,,...,,,,,insulin,,,,,
12,,,,,,,,,,,...,,,,,insulin,,,,,
14,,,,,,,,,,,...,,,,,insulin,,,,,
16,,,,,,,,glyburide,,,...,,,,,,,,,,
18,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Create the records list for the ARM algo: one list of strings per row of
# `no_readmin_cleaned`.
#
# The frame's own contents drive the iteration, so the result is correct for
# any number of rows and columns (fixed bounds would raise IndexError or
# silently truncate when the shape differs). The underlying array is
# materialised once rather than once per cell, so no progress log is needed.
#
# NOTE(review): NaN cells are stringified to 'nan' (unchanged from before), so
# 'nan' appears as an item in the records -- confirm the downstream step
# expects or filters this.
cell_values = no_readmin_cleaned.to_numpy()
records = [[str(cell) for cell in row] for row in cell_values]

0 rows completed


In [None]:
# Report how many records were built.
record_count = len(records)
print(record_count)