Installing packages

In [3]:
!pip install tpot
!pip install mljar-supervised
!pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tpot
  Downloading TPOT-0.11.7-py3-none-any.whl (87 kB)
[K     |████████████████████████████████| 87 kB 4.1 MB/s 
Collecting update-checker>=0.16
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting stopit>=1.1.1
  Downloading stopit-1.1.2.tar.gz (18 kB)
Collecting xgboost>=1.1.0
  Downloading xgboost-1.6.2-py3-none-manylinux2014_x86_64.whl (255.9 MB)
[K     |████████████████████████████████| 255.9 MB 43 kB/s 
Collecting deap>=1.2
  Downloading deap-1.3.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (139 kB)
[K     |████████████████████████████████| 139 kB 46.8 MB/s 
Building wheels for collected packages: stopit
  Building wheel for stopit (setup.py) ... [?25l[?25hdone
  Created wheel for stopit: filename=stopit-1.1.2-py3-none-any.whl size=11954 sha256=8b85dfe6b33c58e7aa62879bfba64688a25bcb28061c55b7d

In [4]:
import pandas as pd
import tpot as tp
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML

Importing data

In [5]:
sparcs = pd.read_csv('https://raw.githubusercontent.com/hantswilliams/HHA-507-2022/main/autoML/datasets/data_sparcs.csv')
sparcs

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,...,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs
0,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,30 to 49,147,M,White,Not Span/Hispanic,...,Minor,Medical,Private Health Insurance,,,0,N,Y,4757.01,4747.83
1,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,18 to 29,148,F,White,Not Span/Hispanic,...,Minor,Medical,Blue Cross/Blue Shield,Self-Pay,Self-Pay,0,N,N,5090.25,2985.64
2,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,0 to 17,147,M,White,Not Span/Hispanic,...,Minor,Medical,Self-Pay,Self-Pay,Self-Pay,2900,N,N,4948.50,2129.67
3,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,70 or Older,148,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Medicare,Self-Pay,0,N,Y,4719.75,8454.41
4,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,50 to 69,148,M,White,Not Span/Hispanic,...,Major,Medical,Blue Cross/Blue Shield,Medicare,Self-Pay,0,N,Y,50384.75,34565.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23578,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Private Health Insurance,,0,N,Y,50833.00,8961.40
23579,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,0 to 17,117,F,Other Race,Spanish/Hispanic,...,Minor,Medical,Private Health Insurance,,,3200,N,N,10948.00,2214.06
23580,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,30 to 49,117,M,White,Not Span/Hispanic,...,Minor,Medical,Medicaid,,,0,N,N,46421.00,11083.24
23581,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,M,White,Not Span/Hispanic,...,Major,Medical,Medicare,Medicare,,0,N,Y,46122.00,7951.26


In [6]:
sparcs.columns

Index(['Health Service Area', 'Hospital County',
       'Operating Certificate Number', 'Facility Id', 'Facility Name',
       'Age Group', 'Zip Code - 3 digits', 'Gender', 'Race', 'Ethnicity',
       'Length of Stay', 'Type of Admission', 'Patient Disposition',
       'Discharge Year', 'CCS Diagnosis Code', 'CCS Diagnosis Description',
       'CCS Procedure Code', 'CCS Procedure Description', 'APR DRG Code',
       'APR DRG Description', 'APR MDC Code', 'APR MDC Description',
       'APR Severity of Illness Code', 'APR Severity of Illness Description',
       'APR Risk of Mortality', 'APR Medical Surgical Description',
       'Payment Typology 1', 'Payment Typology 2', 'Payment Typology 3',
       'Birth Weight', 'Abortion Edit Indicator',
       'Emergency Department Indicator', 'Total Charges', 'Total Costs'],
      dtype='object')

Determining possible important variables

In [7]:
sparcs['Length of Stay'].describe()

count     23583
unique       97
top           2
freq       5378
Name: Length of Stay, dtype: object

In [8]:
sparcs['Total Costs'].describe()

count    2.358300e+04
mean     1.472282e+04
std      2.718098e+04
min      6.700000e-01
25%      4.471700e+03
50%      8.320120e+03
75%      1.590874e+04
max      1.591541e+06
Name: Total Costs, dtype: float64

In [9]:
sparcs['APR Risk of Mortality'].value_counts()

Minor       13990
Moderate     4952
Major        3452
Extreme      1187
Name: APR Risk of Mortality, dtype: int64

# Developing a binary version of the sparcs data

In [10]:
sparcs['Length of Stay'] = pd.to_numeric(sparcs['Length of Stay'], errors='coerce')
sparcs['sparcs_los'] = sparcs['Length of Stay'].apply(lambda x: 'long' if x > 3 else 'short')
sparcs.drop('Length of Stay', axis=1, inplace=True)
sparcs['sparcs_los'].value_counts()


short    13008
long     10575
Name: sparcs_los, dtype: int64

Creating a binary classifier model 

In [11]:
X = sparcs.drop(columns=['sparcs_los'])
y = sparcs["sparcs_los"]

In [12]:
X

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,...,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs
0,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,30 to 49,147,M,White,Not Span/Hispanic,...,Minor,Medical,Private Health Insurance,,,0,N,Y,4757.01,4747.83
1,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,18 to 29,148,F,White,Not Span/Hispanic,...,Minor,Medical,Blue Cross/Blue Shield,Self-Pay,Self-Pay,0,N,N,5090.25,2985.64
2,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,0 to 17,147,M,White,Not Span/Hispanic,...,Minor,Medical,Self-Pay,Self-Pay,Self-Pay,2900,N,N,4948.50,2129.67
3,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,70 or Older,148,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Medicare,Self-Pay,0,N,Y,4719.75,8454.41
4,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,50 to 69,148,M,White,Not Span/Hispanic,...,Major,Medical,Blue Cross/Blue Shield,Medicare,Self-Pay,0,N,Y,50384.75,34565.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23578,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Private Health Insurance,,0,N,Y,50833.00,8961.40
23579,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,0 to 17,117,F,Other Race,Spanish/Hispanic,...,Minor,Medical,Private Health Insurance,,,3200,N,N,10948.00,2214.06
23580,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,30 to 49,117,M,White,Not Span/Hispanic,...,Minor,Medical,Medicaid,,,0,N,N,46421.00,11083.24
23581,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,M,White,Not Span/Hispanic,...,Major,Medical,Medicare,Medicare,,0,N,Y,46122.00,7951.26


In [13]:
y

0        short
1        short
2        short
3        short
4         long
         ...  
23578     long
23579    short
23580     long
23581     long
23582     long
Name: sparcs_los, Length: 23583, dtype: object

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25)


In [15]:
automl = AutoML(
    results_path="sparcs_binary_los",
    mode="Explain"
)

In [16]:
automl.fit(X_train, y_train)

Linear algorithm was disabled.
AutoML directory: sparcs_binary_los
The task is binary_classification with evaluation metric logloss
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 2 models
1_Baseline logloss 0.687821 trained in 0.65 seconds




2_DecisionTree logloss 0.436436 trained in 14.94 seconds
* Step default_algorithms will try to check up to 3 models
3_Default_Xgboost logloss 0.259062 trained in 24.71 seconds
4_Default_NeuralNetwork logloss 0.362407 trained in 8.78 seconds
5_Default_RandomForest logloss 0.380921 trained in 23.27 seconds
* Step ensemble will try to check up to 1 model
Ensemble logloss 0.259062 trained in 1.62 seconds
AutoML fit time: 86.72 seconds
AutoML best model: 3_Default_Xgboost


AutoML(results_path='sparcs_binary_los')

In [17]:
pred = automl.predict(X_test)
pred

array(['short', 'long', 'long', ..., 'long', 'long', 'short'],
      dtype=object)

In [18]:
automl.report()

Best model,name,model_type,metric_type,metric_value,train_time
,1_Baseline,Baseline,logloss,0.687821,1.67
,2_DecisionTree,Decision Tree,logloss,0.436436,16.45
the best,3_Default_Xgboost,Xgboost,logloss,0.259062,26.31
,4_Default_NeuralNetwork,Neural Network,logloss,0.362407,10.07
,5_Default_RandomForest,Random Forest,logloss,0.380921,24.85
,Ensemble,Ensemble,logloss,0.259062,1.62

Unnamed: 0,score,threshold
logloss,0.259062,
auc,0.958834,
f1,0.904165,0.452821
accuracy,0.893487,0.556852
precision,0.996491,0.994703
recall,1.0,0.000147622
mcc,0.785646,0.556852

Unnamed: 0,score,threshold
logloss,0.259062,
auc,0.958834,
f1,0.902181,0.556852
accuracy,0.893487,0.556852
precision,0.914141,0.556852
recall,0.890529,0.556852
mcc,0.785646,0.556852

Unnamed: 0,Predicted as long,Predicted as short
Labeled as long,1779,204
Labeled as short,267,2172

Unnamed: 0,score,threshold
logloss,0.436436,
auc,0.870434,
f1,0.825652,0.518668
accuracy,0.79896,0.518668
precision,0.791353,0.518668
recall,1.0,0.0630936
mcc,0.592415,0.518668

Unnamed: 0,score,threshold
logloss,0.436436,
auc,0.870434,
f1,0.825652,0.518668
accuracy,0.79896,0.518668
precision,0.791353,0.518668
recall,0.863059,0.518668
mcc,0.592415,0.518668

Unnamed: 0,Predicted as long,Predicted as short
Labeled as long,1428,555
Labeled as short,334,2105

Unnamed: 0,score,threshold
logloss,0.687821,
auc,0.5,
f1,0.710975,0.496442
accuracy,0.55156,0.496442
precision,0.55156,0.496442
recall,1.0,0.496442
mcc,0.0,0.496442

Unnamed: 0,score,threshold
logloss,0.687821,
auc,0.5,
f1,0.710975,0.496442
accuracy,0.55156,0.496442
precision,0.55156,0.496442
recall,1.0,0.496442
mcc,0.0,0.496442

Unnamed: 0,Predicted as long,Predicted as short
Labeled as long,0,1983
Labeled as short,0,2439

Unnamed: 0,score,threshold
logloss,0.380921,
auc,0.913052,
f1,0.860528,0.415301
accuracy,0.83763,0.424104
precision,0.996815,0.962424
recall,1.0,0.0459357
mcc,0.672817,0.415301

Unnamed: 0,score,threshold
logloss,0.380921,
auc,0.913052,
f1,0.859711,0.424104
accuracy,0.83763,0.424104
precision,0.821202,0.424104
recall,0.902009,0.424104
mcc,0.672165,0.424104

Unnamed: 0,Predicted as long,Predicted as short
Labeled as long,1504,479
Labeled as short,239,2200

Model,Weight
3_Default_Xgboost,5

Unnamed: 0,score,threshold
logloss,0.259062,
auc,0.958834,
f1,0.904165,0.452821
accuracy,0.893487,0.556852
precision,0.996491,0.994703
recall,1.0,0.000147622
mcc,0.785646,0.556852

Unnamed: 0,score,threshold
logloss,0.259062,
auc,0.958834,
f1,0.902181,0.556852
accuracy,0.893487,0.556852
precision,0.914141,0.556852
recall,0.890529,0.556852
mcc,0.785646,0.556852

Unnamed: 0,Predicted as long,Predicted as short
Labeled as long,1779,204
Labeled as short,267,2172

Unnamed: 0,score,threshold
logloss,0.362407,
auc,0.92855,
f1,0.870787,0.470751
accuracy,0.854817,0.49367
precision,0.982955,0.993535
recall,1.0,7.45466e-08
mcc,0.705898,0.500039

Unnamed: 0,score,threshold
logloss,0.362407,
auc,0.92855,
f1,0.870512,0.49367
accuracy,0.854817,0.49367
precision,0.856689,0.49367
recall,0.884789,0.49367
mcc,0.705878,0.49367

Unnamed: 0,Predicted as long,Predicted as short
Labeled as long,1622,361
Labeled as short,281,2158


Testing new data

In [19]:
automl_sparcs_los = AutoML(results_path="sparcs_binary_los")


In [20]:
X_withlos = sparcs.sample(5)
X_withoutlos = X_withlos.drop(columns=['sparcs_los'])

In [21]:
predict = automl.predict(X_withoutlos)
predict

array(['long', 'long', 'long', 'short', 'long'], dtype=object)

In [22]:
values_actual = X_withlos['sparcs_los'].values.tolist()
values_predicted = predict.tolist()
output = pd.DataFrame({'actual': values_actual, 'predicted': values_predicted})
output

Unnamed: 0,actual,predicted
0,long,long
1,long,long
2,long,long
3,short,short
4,long,long


# Binary Classifier Example

In [23]:
heart = pd.read_csv('https://raw.githubusercontent.com/csanicola74/AutoML-examples/main/data/heart.csv')
heart

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


etc

In [24]:
heart.columns

Index(['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh',
       'exng', 'oldpeak', 'slp', 'caa', 'thall', 'output'],
      dtype='object')

In [32]:
heart['age'].describe()

count    303.000000
mean      54.366337
std        9.082101
min       29.000000
25%       47.500000
50%       55.000000
75%       61.000000
max       77.000000
Name: age, dtype: float64

In [33]:
heart['thalachh'].describe()

count    303.000000
mean     149.646865
std       22.905161
min       71.000000
25%      133.500000
50%      153.000000
75%      166.000000
max      202.000000
Name: thalachh, dtype: float64

In [34]:
heart['trtbps'].describe()

count    303.000000
mean     131.623762
std       17.538143
min       94.000000
25%      120.000000
50%      130.000000
75%      140.000000
max      200.000000
Name: trtbps, dtype: float64

In [35]:
heart['chol'].describe()

count    303.000000
mean     246.264026
std       51.830751
min      126.000000
25%      211.000000
50%      240.000000
75%      274.500000
max      564.000000
Name: chol, dtype: float64

In [36]:
X_without_chol = heart.drop(columns=['chol'])
y_chol = heart['chol']

In [37]:
X_without_chol

Unnamed: 0,age,sex,cp,trtbps,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,0,1,115,1,1.2,1,1,3,0


In [38]:
y_chol

0      233
1      250
2      204
3      236
4      354
      ... 
298    241
299    264
300    193
301    131
302    236
Name: chol, Length: 303, dtype: int64

In [39]:
automl_2 = AutoML()
automl_2.fit(X_without_chol,y_chol)

AutoML directory: AutoML_2
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['Baseline', 'Linear', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 3 models
1_Baseline rmse 40.458838 trained in 0.66 seconds
2_DecisionTree rmse 46.646306 trained in 11.47 seconds
3_Linear rmse 40.094017 trained in 2.92 seconds
* Step default_algorithms will try to check up to 3 models
4_Default_Xgboost rmse 43.4703 trained in 4.7 seconds
5_Default_NeuralNetwork rmse 36.975542 trained in 0.88 seconds
6_Default_RandomForest rmse 37.908018 trained in 8.57 seconds
* Step ensemble will try to check up to 1 model
Ensemble rmse 36.759846 trained in 0.31 seconds
AutoML fit time: 37.93 seconds
AutoML best model: Ensemble


AutoML()

In [40]:
automl_2.report()

Best model,name,model_type,metric_type,metric_value,train_time
,1_Baseline,Baseline,rmse,40.4588,1.0
,2_DecisionTree,Decision Tree,rmse,46.6463,12.23
,3_Linear,Linear,rmse,40.094,3.68
,4_Default_Xgboost,Xgboost,rmse,43.4703,5.54
,5_Default_NeuralNetwork,Neural Network,rmse,36.9755,1.69
,6_Default_RandomForest,Random Forest,rmse,37.908,9.4
the best,Ensemble,Ensemble,rmse,36.7598,0.31

Metric,Score
MAE,35.4884
MSE,1889.67
RMSE,43.4703
R2,-0.180123
MAPE,0.157708

Metric,Score
MAE,31.3677
MSE,1607.53
RMSE,40.094
R2,-0.00392512
MAPE,0.136148

feature,Learner_1
thall,0.169459
age,0.162651
exng,0.161141
thalachh,0.114071
slp,0.0711428
fbs,0.0570083
trtbps,0.048487
caa,0.00736213
oldpeak,-0.0292982
cp,-0.0378626

Metric,Score
MAE,38.0484
MSE,2175.88
RMSE,46.6463
R2,-0.358866
MAPE,0.162539

Metric,Score
MAE,29.9102
MSE,1367.19
RMSE,36.9755
R2,0.14617
MAPE,0.13152

Metric,Score
MAE,33.7845
MSE,1636.92
RMSE,40.4588
R2,-0.0222779
MAPE,0.148408

Model,Weight
3_Linear,1
5_Default_NeuralNetwork,4
6_Default_RandomForest,1

Metric,Score
MAE,29.6805
MSE,1351.29
RMSE,36.7598
R2,0.156103
MAPE,0.130562

Metric,Score
MAE,31.2722
MSE,1437.02
RMSE,37.908
R2,0.102562
MAPE,0.137837


In [41]:
heart['reg_pred'] = automl_2.predict(X_without_chol)

In [42]:
print('reg_pred')
print(heart[['chol', 'reg_pred']].head())

reg_pred
   chol    reg_pred
0   233  242.030588
1   250  233.076180
2   204  256.958405
3   236  236.239116
4   354  253.012820


# Porting files to a .zip

In [43]:
!zip -r /content/download.zip /content/

  adding: content/ (stored 0%)
  adding: content/.config/ (stored 0%)
  adding: content/.config/.last_opt_in_prompt.yaml (stored 0%)
  adding: content/.config/.last_update_check.json (deflated 22%)
  adding: content/.config/gce (stored 0%)
  adding: content/.config/active_config (stored 0%)
  adding: content/.config/logs/ (stored 0%)
  adding: content/.config/logs/2022.11.09/ (stored 0%)
  adding: content/.config/logs/2022.11.09/14.28.33.298481.log (deflated 53%)
  adding: content/.config/logs/2022.11.09/14.29.37.145223.log (deflated 55%)
  adding: content/.config/logs/2022.11.09/14.29.12.473847.log (deflated 54%)
  adding: content/.config/logs/2022.11.09/14.28.03.059919.log (deflated 91%)
  adding: content/.config/logs/2022.11.09/14.29.37.960408.log (deflated 53%)
  adding: content/.config/logs/2022.11.09/14.28.57.244063.log (deflated 86%)
  adding: content/.config/configurations/ (stored 0%)
  adding: content/.config/configurations/config_default (deflated 15%)
  adding: content/.con

In [44]:
from google.colab import files
files.download('/content/download.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>