[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Clinical-Informatics-Interest-Group/CLiC.notebooks/blob/main/notebooks/notebook2.ipynb)

# 2. Machine Learning with fast.ai

A Random Forest Classifier is an ensemble of Decision Trees, which predict the outcome by a majority vote. A forest is made up of individual trees. 

In [1]:
import pandas as pd
import numpy as np
import zipfile, requests, io
from pathlib import Path

In [2]:
def get_data():
    """Download and unpack the diabetes dataset unless it is already on disk.

    Nothing is fetched when ``./dataset_diabetes/diabetic_data.csv`` already
    exists. Otherwise the zip archive is downloaded into memory and extracted
    into the current working directory.

    Returns:
        None.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        zipfile.BadZipFile: if the downloaded payload is not a valid zip file.
    """
    if Path('./dataset_diabetes/diabetic_data.csv').is_file():
        # Already extracted by an earlier run; skip the network round trip.
        return

    r = requests.get(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip',
        timeout=60,  # without a timeout a stalled connection blocks the notebook forever
    )
    # Fail loudly on an error response instead of handing an HTML error page
    # to ZipFile, which would surface as a confusing BadZipFile.
    r.raise_for_status()
    # The context manager closes the archive handle once extraction is done.
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall()
        
get_data()  # does nothing when ./dataset_diabetes/diabetic_data.csv already exists

In [3]:
# Load the encounter table, reading '?' entries as missing values (NaN).
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. Chunked inference can leave a single column
# holding a mix of floats and strings, which would later be one-hot encoded
# as different categories.
df = pd.read_csv(
    './dataset_diabetes/diabetic_data.csv',
    na_values=['?'],
    low_memory=False,
)
df.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


# Getting the data into shape.
Before we can train the model, we want to make sure the dataset is free of missing values and the features we include are relevant to our research question.

### These columns contain unique identifiers we don't want the model to train on.
* encounter_id  
* patient_nbr

### These feature columns contain only 'No' values, so should be dropped.
* citoglipton  
* examide  

### These feature columns contain too many null values and will be dropped.
* weight  

### These feature columns are missing in about half the data. We will drop these, but come back to them later.
* payer_code  
* medical_specialty  

In [4]:
# Remove every unwanted column in a single call instead of one pop() per column.
columns_to_drop = [
    'citoglipton', 'examide',            # contain only 'No' values
    'weight',                            # too many null values
    'payer_code', 'medical_specialty',   # missing in about half the data
    'encounter_id', 'patient_nbr',       # unique identifiers
]
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone, and pop() would have raised a KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')
df.head()


Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),6,25,1,1,41,0,1,...,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),1,1,7,2,11,5,13,...,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),1,1,7,2,44,1,16,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),1,1,7,1,51,0,8,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [5]:
# Count the missing entries in each remaining column.
df.isna().sum()

race                        2273
gender                         0
age                            0
admission_type_id              0
discharge_disposition_id       0
admission_source_id            0
time_in_hospital               0
num_lab_procedures             0
num_procedures                 0
num_medications                0
number_outpatient              0
number_emergency               0
number_inpatient               0
diag_1                        21
diag_2                       358
diag_3                      1423
number_diagnoses               0
max_glu_serum                  0
A1Cresult                      0
metformin                      0
repaglinide                    0
nateglinide                    0
chlorpropamide                 0
glimepiride                    0
acetohexamide                  0
glipizide                      0
glyburide                      0
tolbutamide                    0
pioglitazone                   0
rosiglitazone                  0
acarbose  

In [6]:
# Keep only the rows (samples) that have no missing value in any column,
# then re-count the nulls per column.
df = df.dropna(axis='index', how='any')
df.isna().sum()

race                        0
gender                      0
age                         0
admission_type_id           0
discharge_disposition_id    0
admission_source_id         0
time_in_hospital            0
num_lab_procedures          0
num_procedures              0
num_medications             0
number_outpatient           0
number_emergency            0
number_inpatient            0
diag_1                      0
diag_2                      0
diag_3                      0
number_diagnoses            0
max_glu_serum               0
A1Cresult                   0
metformin                   0
repaglinide                 0
nateglinide                 0
chlorpropamide              0
glimepiride                 0
acetohexamide               0
glipizide                   0
glyburide                   0
tolbutamide                 0
pioglitazone                0
rosiglitazone               0
acarbose                    0
miglitol                    0
troglitazone                0
tolazamide

In [7]:
# Now that we have removed all samples and features with missing data,
# let's verify each data type matches our assessment of which variables
# are categorical (objects or strings) versus numerical (integers).
# NOTE(review): the *_id columns are stored as integers but look like
# category codes rather than quantities -- confirm against the data dictionary.
df.dtypes

race                        object
gender                      object
age                         object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride                 object
acetohexamide               object
glipizide                   object
glyburide                   object
tolbutamide                 object
pioglitazone        

In [8]:
# Age is considered nominal and the authors list it by decade (e.g. '[40-50)'),
# although I think the ordered bins make it closer to numerical.
# We'll just leave it for now...
#np.unique(df["age"])

In [9]:
# Simplify the task to a binary outcome: the model will predict whether a
# patient discharged from the hospital is readmitted within 30 days.
# '<30' becomes the positive class (1); '>30' and 'NO' both become 0.
df = df.replace(
    to_replace={
        'readmitted': {
            '>30': 0,
            'NO': 0,
            '<30': 1,
        },
    },
)
np.unique(df["readmitted"])


array([0, 1])

In [10]:
### Our first machine learning model will be a Random Forest classifier ###
# Python best practices would have us import all the libraries we need at the
# beginning. For clarity I am importing libraries as we use them so it's
# easier to see where they'll be used.

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

# The model is defined here. We'll use the "fit" method to train it with
# a training "X" subset of the overall data. Then we'll "predict" outcomes
# and verify its performance against a "test" subset.
rforest = RandomForestClassifier(criterion='gini',
                                n_estimators=25,
                                random_state=1,
                                n_jobs=6)

# "X" DataFrame for all the features of each sample.
# The target is excluded by name rather than by column position: a positional
# slice only works while the frame has exactly the expected number of columns
# with the target last, and would otherwise silently leak "readmitted" into
# the features (or drop real features).
X = pd.get_dummies(df.drop(columns=['readmitted']), drop_first=True)
# "y" Series for the target outcome "readmitted"
y = df['readmitted']
# Split "X" and "y" equivalently so that 80% of the samples
# are used to train the model, and 20% are reserved for testing
# its performance. stratify=y keeps the class proportions the same
# in both subsets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=1)
# Train the model
rforest.fit(X_train, y_train)
# Test it
y_pred = rforest.predict(X_test)
print('Test Accuracy: %.3f' % rforest.score(X_test, y_test))

Test Accuracy: 0.888


## Progress Summary
Up to this point we have implemented a Machine Learning model to predict hospital readmissions. We have left out a few Machine Learning best practices to demonstrate how easy this is to do. As we make progress in the Hackathon we hope to gain an appreciation for how best practices can help us to make better and more representative models. A machine learning model can perform well against an arbitrary metric, but not be representative of the data or the people who make up that data in the real world.