In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split #splitting the dataset
from autogluon.tabular import TabularDataset, TabularPredictor #to handle tabular data and train models
df=pd.read_csv('healthcare-dataset-stroke-data.csv') # path to colab notebook #replace this with the path to your dataset
# df = df.dropna() # drop NaN columns

In [4]:
# split into train and test sets
df_train,df_test=train_test_split(df,test_size=0.2,random_state=1)
df_train.shape,df_test.shape

((4088, 12), (1022, 12))

In [5]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


We need to drop the outcome column from the newly created test set

In [6]:
test_data=df_test.drop(['stroke'],axis=1)

test_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
4673,49833,Female,42.0,0,0,Yes,Govt_job,Rural,112.98,37.2,formerly smoked
3232,20375,Female,78.0,0,0,Yes,Private,Urban,78.29,30.1,formerly smoked
3694,39834,Male,28.0,0,0,No,Private,Urban,73.27,25.4,smokes
1070,42550,Female,81.0,0,0,Yes,Self-employed,Rural,246.34,21.1,never smoked
4163,19907,Female,52.0,0,0,Yes,Private,Rural,97.05,28.0,Unknown


Now, we build a predictor to train for classifying whether an individual with a given set of conditions will probably be at risk of a stroke. For this, we specify the outcome column as ‘stroke’ and ask the predictor to fit the algorithms on the train dataset. Arguments (optional) ‘verbosity=2’ will display all the steps the predictor is taking to arrive at the best model while ‘presets= best quality’ will ensure that the best model is selected from the trained ones. 

In [None]:
predictor= TabularPredictor(label ='stroke',problem_type='regression').fit(train_data = df_train, verbosity = 2,presets='best_quality')

Though we have not specified but AutoGluon perfectly understands that this is a binary classification problem based on the two unique labels ‘0’ & ‘1’ in the outcome column

In [12]:
# print a summary of the models it has trained 
predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                     model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L3  -0.193571       2.005770  313.201953                0.000000           0.282416            3       True         22
1          CatBoost_BAG_L2  -0.195461       1.182480  221.682738                0.022591          54.628872            2       True         16
2      WeightedEnsemble_L2  -0.195483       0.540746  106.942197                0.000000           0.323457            2       True         12
3   NeuralNetFastAI_BAG_L2  -0.195564       1.451685  198.660334                0.291796          31.606468            2       True         18
4          CatBoost_BAG_L1  -0.196341       0.015623   45.795909                0.015623          45.795909            1       True          6
5        LightGBMXT_BAG_L2  -0.196654       1.299813  183.612579                

{'model_types': {'KNeighborsUnif_BAG_L1': 'StackerEnsembleModel_KNN',
  'KNeighborsDist_BAG_L1': 'StackerEnsembleModel_KNN',
  'LightGBMXT_BAG_L1': 'StackerEnsembleModel_LGB',
  'LightGBM_BAG_L1': 'StackerEnsembleModel_LGB',
  'RandomForestMSE_BAG_L1': 'StackerEnsembleModel_RF',
  'CatBoost_BAG_L1': 'StackerEnsembleModel_CatBoost',
  'ExtraTreesMSE_BAG_L1': 'StackerEnsembleModel_XT',
  'NeuralNetFastAI_BAG_L1': 'StackerEnsembleModel_NNFastAiTabular',
  'XGBoost_BAG_L1': 'StackerEnsembleModel_XGBoost',
  'NeuralNetTorch_BAG_L1': 'StackerEnsembleModel_TabularNeuralNetTorch',
  'LightGBMLarge_BAG_L1': 'StackerEnsembleModel_LGB',
  'WeightedEnsemble_L2': 'WeightedEnsembleModel',
  'LightGBMXT_BAG_L2': 'StackerEnsembleModel_LGB',
  'LightGBM_BAG_L2': 'StackerEnsembleModel_LGB',
  'RandomForestMSE_BAG_L2': 'StackerEnsembleModel_RF',
  'CatBoost_BAG_L2': 'StackerEnsembleModel_CatBoost',
  'ExtraTreesMSE_BAG_L2': 'StackerEnsembleModel_XT',
  'NeuralNetFastAI_BAG_L2': 'StackerEnsembleModel_NNFa

AutoGluon trained 24 models but we would be more interested to find out which is the best model as selected by AutoGluon. To display this, simply use the leaderboard() command which ranks the trained models in order. Use the leaderboard() command which ranks the trained models in order.

In [21]:
predictor.leaderboard(df_train, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,KNeighborsDist_BAG_L1,1.0,0.954393,0.020875,0.021773,0.0,0.020875,0.021773,0.0,1,True,2
1,RandomForestGini_BAG_L1,1.0,0.956826,0.140593,0.175899,0.751221,0.140593,0.175899,0.751221,1,True,5
2,RandomForestEntr_BAG_L1,1.0,0.956826,0.140593,0.171835,0.729398,0.140593,0.171835,0.729398,1,True,6
3,ExtraTreesGini_BAG_L1,1.0,0.956826,0.16496,0.187457,0.567576,0.16496,0.187457,0.567576,1,True,8
4,ExtraTreesEntr_BAG_L1,1.0,0.957434,0.166432,0.17184,0.613594,0.166432,0.17184,0.613594,1,True,9
5,XGBoost_BAG_L1,0.96169,0.956522,0.234319,0.124995,10.493594,0.234319,0.124995,10.493594,1,True,11
6,WeightedEnsemble_L2,0.95865,0.958954,0.588205,0.218714,32.022558,0.0,0.0,1.911051,2,True,14
7,NeuralNetFastAI_BAG_L1,0.95865,0.958954,0.588205,0.218714,30.111507,0.588205,0.218714,30.111507,1,True,10
8,ExtraTreesEntr_BAG_L2,0.958346,0.95713,2.687465,1.784087,138.106078,0.169153,0.187468,0.629945,2,True,21
9,KNeighborsUnif_BAG_L1,0.958042,0.956826,0.013996,0.019769,0.0,0.013996,0.019769,0.0,1,True,1


Additionally, we can also check for the feature importance using

In [22]:
predictor.feature_importance(data=df_train)

Computing feature importance via permutation shuffling for 11 features using 3289 rows with 5 shuffle sets...
	137.53s	= Expected runtime (27.51s per shuffle set)
	78.02s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
id,0.0,0.0,0.5,5,0.0,0.0
gender,0.0,0.0,0.5,5,0.0,0.0
age,0.0,0.0,0.5,5,0.0,0.0
hypertension,0.0,0.0,0.5,5,0.0,0.0
heart_disease,0.0,0.0,0.5,5,0.0,0.0
ever_married,0.0,0.0,0.5,5,0.0,0.0
work_type,0.0,0.0,0.5,5,0.0,0.0
Residence_type,0.0,0.0,0.5,5,0.0,0.0
avg_glucose_level,0.0,0.0,0.5,5,0.0,0.0
bmi,0.0,0.0,0.5,5,0.0,0.0


### Note: 
Here we can see that it has identified age and bmi to be the most important factors in the prediction of the outcome.

### Next, we feed the test data to the classifier for prediction and we can store it in a DataFrame

In [28]:
y_pred = predictor.predict(test_data)
y_pred=pd.DataFrame(y_pred,columns=['stroke'])
y_pred #print the DataFrame

Unnamed: 0,stroke
2648,0
3441,0
4935,0
1316,0
3271,0
...,...
1026,0
2436,0
4458,0
1354,0


#### To understand the evaluation metric ‘accuracy’, let us print the details for it

In [None]:
predictor.evaluate(df_test)

## Summary
Data preprocessing and Feature Engineering were carried out by AutoGluon. The trained model includes cross-validation as well. So, we got the trained classifier at 95% accuracy with just two lines of code (for the classifier to train and predict). Now, that’s impressive! If it were a traditional ML model, we would be spending a long time completing the entire process including EDA, data cleaning as well as coding to set up multiple models. AutoGluon made this quite simple for us.