# Determine whether a patient admitted to an ICU has been diagnosed with Diabetes Mellitus in the first 24 hours of intensive care.

In [1]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the training CSV into a DataFrame and display it
DATA_PATH = "Resources/TrainingWiDS2021.csv"
diabetes_df = pd.read_csv(DATA_PATH)
diabetes_df

Unnamed: 0.1,Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,...,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus
0,1,214826,118,68.0,22.732803,0,Caucasian,M,180.3,Floor,...,,,0,0,0,0,0,0,0,1
1,2,246060,81,77.0,27.421875,0,Caucasian,F,160.0,Floor,...,51.0,51.0,0,0,0,0,0,0,0,1
2,3,276985,118,25.0,31.952749,0,Caucasian,F,172.7,Emergency Department,...,,,0,0,0,0,0,0,0,0
3,4,262220,118,81.0,22.635548,1,Caucasian,F,165.1,Operating Room,...,337.0,337.0,0,0,0,0,0,0,0,0
4,5,201746,33,19.0,,0,Caucasian,M,188.0,,...,,,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130152,130153,164758,7,50.0,29.287256,0,Caucasian,M,175.3,Emergency Department,...,,,0,0,0,0,0,0,0,0
130153,130154,197653,7,79.0,29.653433,0,Caucasian,F,162.6,Direct Admit,...,,,0,0,0,0,0,0,0,0
130154,130155,219719,7,73.0,32.265371,0,African American,M,177.8,Emergency Department,...,163.0,163.0,0,0,0,0,0,0,0,1
130155,130156,222562,170,81.0,24.408579,0,Caucasian,M,185.4,Emergency Department,...,,,0,0,0,0,0,0,0,0


## Preprocessing the dataset

In [3]:
# Keep only the columns used to predict diabetes mellitus
# ('diabetes_mellitus' itself is the target column)
selected_columns = [
    'age', 'bmi', 'ethnicity', 'gender', 'height', 'weight',
    'd1_glucose_max', 'd1_mbp_max',
    'aids', 'cirrhosis', 'hepatic_failure', 'immunosuppression',
    'leukemia', 'lymphoma', 'solid_tumor_with_metastasis',
    'diabetes_mellitus',
]
# .copy() makes train_df an independent frame, so modifying it later does not
# trigger SettingWithCopyWarning and never depends on diabetes_df
train_df = diabetes_df[selected_columns].copy()
train_df

Unnamed: 0,age,bmi,ethnicity,gender,height,weight,d1_glucose_max,d1_mbp_max,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus
0,68.0,22.732803,Caucasian,M,180.3,73.9,168.0,89.0,0,0,0,0,0,0,0,1
1,77.0,27.421875,Caucasian,F,160.0,70.2,145.0,120.0,0,0,0,0,0,0,0,1
2,25.0,31.952749,Caucasian,F,172.7,95.3,,102.0,0,0,0,0,0,0,0,0
3,81.0,22.635548,Caucasian,F,165.1,61.7,185.0,84.0,0,0,0,0,0,0,0,0
4,19.0,,Caucasian,M,188.0,,,104.0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130152,50.0,29.287256,Caucasian,M,175.3,90.0,,120.0,0,0,0,0,0,0,0,0
130153,79.0,29.653433,Caucasian,F,162.6,78.4,139.0,125.0,0,0,0,0,0,0,0,0
130154,73.0,32.265371,African American,M,177.8,102.0,346.0,110.0,0,0,0,0,0,0,0,1
130155,81.0,24.408579,Caucasian,M,185.4,83.9,156.0,97.0,0,0,0,0,0,0,0,0


In [4]:
# count the missing values in each selected column
train_df.isna().sum()

age                            4988
bmi                            4490
ethnicity                      1587
gender                           66
height                         2077
weight                         3463
d1_glucose_max                 8243
d1_mbp_max                      327
aids                              0
cirrhosis                         0
hepatic_failure                   0
immunosuppression                 0
leukemia                          0
lymphoma                          0
solid_tumor_with_metastasis       0
diabetes_mellitus                 0
dtype: int64

In [5]:
# how many rows fall into each 'gender' category (NaN is not counted)
train_df["gender"].value_counts()

M    70518
F    59573
Name: gender, dtype: int64

In [6]:
# how many rows fall into each 'ethnicity' category (NaN is not counted)
train_df["ethnicity"].value_counts()

Caucasian           100236
African American     13911
Other/Unknown         6261
Hispanic              5049
Asian                 2198
Native American        915
Name: ethnicity, dtype: int64

In [7]:
# Replace missing 'ethnicity' values with the existing 'Other/Unknown' category.
# Building a new frame (instead of calling an inplace fillna on a selected
# column) avoids SettingWithCopyWarning and guarantees train_df itself is updated.
train_df = train_df.assign(ethnicity=train_df['ethnicity'].fillna('Other/Unknown'))


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [8]:
# re-check the 'ethnicity' categories now that missing values were filled
train_df["ethnicity"].value_counts()

Caucasian           100236
African American     13911
Other/Unknown         7848
Hispanic              5049
Asian                 2198
Native American        915
Name: ethnicity, dtype: int64

In [10]:
# discard every row that still has a missing value in any column
train_df = train_df.dropna(axis="index", how="any")

In [11]:
# (rows, columns) remaining after the rows with missing values were dropped
train_df.shape

(113363, 16)

In [12]:
# Check pairwise correlation of the numeric features.
# 'ethnicity' and 'gender' are still text columns at this point, so restrict the
# frame to numeric columns explicitly: recent pandas versions raise an error
# when corr() is called on a frame that contains non-numeric data.
train_df.select_dtypes(include='number').corr()

Unnamed: 0,age,bmi,height,weight,d1_glucose_max,d1_mbp_max,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus
age,1.0,-0.080928,-0.115533,-0.124069,0.013701,-0.002479,-0.02932,-0.02793,-0.02186,0.020193,0.027893,0.023003,0.025195,0.07521
bmi,-0.080928,1.0,-0.06863,0.878453,0.095824,0.062048,-0.02221,-0.0066,-0.003491,-0.035153,-0.014149,-0.010712,-0.044231,0.168091
height,-0.115533,-0.06863,1.0,0.379432,-0.015834,0.01989,0.008104,0.013455,0.009804,-0.001304,0.003618,0.003516,0.003007,-0.008533
weight,-0.124069,0.878453,0.379432,1.0,0.084191,0.069052,-0.019408,-0.000499,0.001514,-0.033847,-0.0126,-0.008496,-0.041014,0.153661
d1_glucose_max,0.013701,0.095824,-0.015834,0.084191,1.0,0.025267,-0.010185,-0.008046,-0.010001,-0.006905,-0.004431,-0.002438,-0.016167,0.401457
d1_mbp_max,-0.002479,0.062048,0.01989,0.069052,0.025267,1.0,0.006705,-0.02417,-0.030079,-0.014419,-0.010468,-0.01093,-0.021097,0.018093
aids,-0.02932,-0.02221,0.008104,-0.019408,-0.010185,0.006705,1.0,0.007725,0.002571,0.015766,-0.002914,0.009809,-0.00314,-0.011425
cirrhosis,-0.02793,-0.0066,0.013455,-0.000499,-0.008046,-0.02417,0.007725,1.0,0.544511,1.7e-05,-0.003672,-0.002569,-0.002767,0.011815
hepatic_failure,-0.02186,-0.003491,0.009804,0.001514,-0.010001,-0.030079,0.002571,0.544511,1.0,0.002937,-0.000272,-0.00239,0.005009,0.006611
immunosuppression,0.020193,-0.035153,-0.001304,-0.033847,-0.006905,-0.014419,0.015766,1.7e-05,0.002937,1.0,0.137938,0.1007,0.280764,-0.005516


In [14]:
# One-hot encode the categorical columns (one indicator column per category)
categorical_columns = ["ethnicity", "gender"]
train_encoded = pd.get_dummies(train_df, columns=categorical_columns)
train_encoded

Unnamed: 0,age,bmi,height,weight,d1_glucose_max,d1_mbp_max,aids,cirrhosis,hepatic_failure,immunosuppression,...,solid_tumor_with_metastasis,diabetes_mellitus,ethnicity_African American,ethnicity_Asian,ethnicity_Caucasian,ethnicity_Hispanic,ethnicity_Native American,ethnicity_Other/Unknown,gender_F,gender_M
0,68.0,22.732803,180.3,73.9,168.0,89.0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,1
1,77.0,27.421875,160.0,70.2,145.0,120.0,0,0,0,0,...,0,1,0,0,1,0,0,0,1,0
3,81.0,22.635548,165.1,61.7,185.0,84.0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
5,67.0,27.555611,190.5,100.0,156.0,127.0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,1
6,59.0,57.451002,165.1,156.6,197.0,117.0,0,0,0,0,...,0,1,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130151,50.0,20.324301,165.1,55.4,182.0,99.0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
130153,79.0,29.653433,162.6,78.4,139.0,125.0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
130154,73.0,32.265371,177.8,102.0,346.0,110.0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,1
130155,81.0,24.408579,185.4,83.9,156.0,97.0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1


## Split and Train data

In [15]:
# Separate the target column (y) from the predictor columns (X)
y = train_encoded["diabetes_mellitus"]
X = train_encoded.drop(columns=["diabetes_mellitus"])

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)
X_train.shape

(85022, 21)

### Logistic Regression model

In [16]:
# Build the logistic regression classifier (it is fitted in a later cell)
logreg_settings = {"solver": "lbfgs", "max_iter": 2000, "random_state": 1}
classifier = LogisticRegression(**logreg_settings)


In [18]:
# Train the classifier on the training split
classifier.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=2000, random_state=1)

In [19]:
# Predict the test split and line the predictions up against the true labels
y_pred = classifier.predict(X_test)
results = pd.DataFrame(
    {
        "Prediction": y_pred,
        # use the raw label values so the frame gets a fresh 0..n-1 index
        "Actual": y_test.to_numpy(),
    }
)
results.head(50)


Unnamed: 0,Prediction,Actual
0,1,1
1,0,0
2,1,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [20]:
# Fraction of test rows whose predicted label matches the true label
# NOTE(review): accuracy alone can be misleading if the classes are imbalanced —
# consider comparing against a majority-class baseline
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.7883984333650894
