In [19]:
#import dependencies 
import evalml
from evalml import AutoMLSearch
from evalml.utils import infer_feature_types
import woodwork as ww
import pandas as pd

In [20]:
# Pull in data
df = pd.read_csv("https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt", delimiter="\t")
# initializing woodwork
df.ww.init()


In [21]:
#checking the woodwork background 
df.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AGE,int64,Integer,['numeric']
SEX,int64,Integer,['numeric']
BMI,float64,Double,['numeric']
BP,float64,Double,['numeric']
S1,int64,Integer,['numeric']
S2,float64,Double,['numeric']
S3,float64,Double,['numeric']
S4,float64,Double,['numeric']
S5,float64,Double,['numeric']
S6,int64,Integer,['numeric']


In [22]:
X = df[["AGE","SEX","BMI","BP","S1","S2","S3","S4","S5","S6"]]
y = df["Y"]

In [23]:
X_train, X_holdout, y_train, y_holdout = evalml.preprocessing.split_data(
    X,y, problem_type="regression", test_size=0.5
)

In [24]:
automl = AutoMLSearch(
    X_train=X_train,
    y_train=y_train,
    problem_type="regression",
    objective="R2",
    max_batches=3,
    verbose=False,
)

In [25]:
automl.search()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000070 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 212
[LightGBM] [Info] Number of data points in the train set: 147, number of used features: 5
[LightGBM] [Info] Start training from score 151.544218
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000046 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 218
[LightGBM] [Info] Number of data points in the train set: 147, number of used features: 5
[LightGBM] [Info] Start training from score 148.659864
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000040 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 213
[LightGBM] [Info] Number of data points in the train set: 148, number of used features: 5
[LightGBM] [Info] Start training fr

{1: {'Random Forest Regressor w/ Imputer + RF Regressor Select From Model': 1.1015126705169678,
  'Total time of batch': 1.420851230621338},
 2: {'Extra Trees Regressor w/ Imputer + Select Columns Transformer': 0.3909006118774414,
  'XGBoost Regressor w/ Imputer + Select Columns Transformer': 0.32942676544189453,
  'LightGBM Regressor w/ Imputer + Select Columns Transformer': 0.19947242736816406,
  'Elastic Net Regressor w/ Imputer + Standard Scaler + Select Columns Transformer': 0.1905670166015625,
  'Total time of batch': 2.572636842727661},
 3: {'Elastic Net Regressor w/ Imputer + Standard Scaler + Select Columns Transformer': 0.33223986625671387,
  'Extra Trees Regressor w/ Imputer + Select Columns Transformer': 1.9403011798858643,
  'LightGBM Regressor w/ Imputer + Select Columns Transformer': 0.39708781242370605,
  'Total time of batch': 116.04694509506226}}

In [26]:
#rankings
automl.rankings


Unnamed: 0,id,pipeline_name,search_order,ranking_score,mean_cv_score,standard_deviation_cv_score,percent_better_than_baseline,high_variance_cv,parameters
0,5,Elastic Net Regressor w/ Imputer + Standard Sc...,5,0.518365,0.518365,0.068912,21475.142728,False,{'Imputer': {'categorical_impute_strategy': 'm...
43,107,LightGBM Regressor w/ Imputer + Select Columns...,107,0.489227,0.489227,0.088714,20273.608548,False,{'Imputer': {'categorical_impute_strategy': 'm...
56,121,Extra Trees Regressor w/ Imputer + Select Colu...,121,0.455414,0.455414,0.054512,18879.317758,False,{'Imputer': {'categorical_impute_strategy': 'm...
112,1,Random Forest Regressor w/ Imputer + RF Regres...,1,0.40167,0.40167,0.113686,16663.143432,False,{'Imputer': {'categorical_impute_strategy': 'm...
127,3,XGBoost Regressor w/ Imputer + Select Columns ...,3,0.290017,0.290017,0.156608,12059.03473,False,{'Imputer': {'categorical_impute_strategy': 'm...
139,0,Mean Baseline Regression Pipeline,0,-0.002425,-0.002425,0.001869,0.0,False,{'Baseline Regressor': {'strategy': 'mean'}}


In [27]:
#See more detail
automl.describe_pipeline(automl.rankings.iloc[0]["id"])



***********************************************************************************
* Elastic Net Regressor w/ Imputer + Standard Scaler + Select Columns Transformer *
***********************************************************************************

Problem Type: regression
Model Family: Linear

Pipeline Steps
1. Imputer
	 * categorical_impute_strategy : most_frequent
	 * numeric_impute_strategy : mean
	 * boolean_impute_strategy : most_frequent
	 * categorical_fill_value : None
	 * numeric_fill_value : None
	 * boolean_fill_value : None
2. Standard Scaler
3. Select Columns Transformer
	 * columns : ['BMI', 'BP', 'S1', 'S2', 'S5']
4. Elastic Net Regressor
	 * alpha : 0.0001
	 * l1_ratio : 0.15
	 * max_iter : 1000

Training
Training for regression problems.
Total training time (including CV): 0.2 seconds

Cross Validation
----------------
               R2  ExpVariance  MaxError  MedianAE      MSE    MAE  Root Mean Squared Error # Training # Validation
0           0.521        0.523

In [28]:
# Select the best pipeline and score it on our holdout data
pipeline = automl.best_pipeline
pipeline.score(X_holdout, y_holdout,objectives=["R2"] )

OrderedDict([('R2', 0.4246045193098764)])

In [29]:
### This is code to try to improve the score from above.
### Import necessary libraries
import evalml
from evalml import AutoMLSearch
import woodwork as ww
import pandas as pd

# Pull in data
df = pd.read_csv("https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt", delimiter="\t")

# Initializing woodwork
df.ww.init()

# Checking the woodwork background
df.ww

# Define features (X) and target (y)
X = df[["AGE", "SEX", "BMI", "BP", "S1", "S2", "S3", "S4", "S5", "S6"]]
y = df["Y"]

# Split data into training and holdout sets
X_train, X_holdout, y_train, y_holdout = evalml.preprocessing.split_data(
    X, y, problem_type="regression", test_size=0.2  # Use a smaller holdout size for more training data
)

# Initialize AutoML search
MAEautoml = AutoMLSearch(
    X_train=X_train,
    y_train=y_train,
    problem_type="regression",
    objective="MAE",
    max_batches=3,
    #optimize_thresholds=True,  # You can optimize thresholds for classification tasks
    verbose=False,
    random_seed=42,  # Set a random seed for reproducibility
)


In [30]:
# Perform AutoML search to find the best pipeline
MAEautoml.search()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000091 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 262
[LightGBM] [Info] Number of data points in the train set: 235, number of used features: 5
[LightGBM] [Info] Start training from score 154.170213
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000052 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 258
[LightGBM] [Info] Number of data points in the train set: 235, number of used features: 5
[LightGBM] [Info] Start training from score 155.229787
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000124 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 263
[LightGBM] [Info] Number of data points in the train set: 236, number of used features: 5
[LightGBM] [Info] Start training fr

{1: {'Random Forest Regressor w/ Imputer + RF Regressor Select From Model': 1.0072526931762695,
  'Total time of batch': 1.3220946788787842},
 2: {'Extra Trees Regressor w/ Imputer + Select Columns Transformer': 0.3746781349182129,
  'XGBoost Regressor w/ Imputer + Select Columns Transformer': 0.3188920021057129,
  'LightGBM Regressor w/ Imputer + Select Columns Transformer': 0.18875837326049805,
  'Elastic Net Regressor w/ Imputer + Standard Scaler + Select Columns Transformer': 0.189713716506958,
  'Total time of batch': 2.5149013996124268},
 3: {'Elastic Net Regressor w/ Imputer + Standard Scaler + Select Columns Transformer': 0.34162425994873047,
  'Random Forest Regressor w/ Imputer + Select Columns Transformer': 2.845747947692871,
  'LightGBM Regressor w/ Imputer + Select Columns Transformer': 0.4089336395263672,
  'Total time of batch': 136.5198438167572}}

In [31]:
# Display the pipeline rankings
MAEautoml.rankings

Unnamed: 0,id,pipeline_name,search_order,ranking_score,mean_cv_score,standard_deviation_cv_score,percent_better_than_baseline,high_variance_cv,parameters
0,12,Elastic Net Regressor w/ Imputer + Standard Sc...,12,44.715489,44.715489,2.737351,34.018003,False,{'Imputer': {'categorical_impute_strategy': 'm...
31,50,LightGBM Regressor w/ Imputer + Select Columns...,50,45.591123,45.591123,2.654931,32.72592,False,{'Imputer': {'categorical_impute_strategy': 'm...
34,40,Random Forest Regressor w/ Imputer + Select Co...,40,45.670962,45.670962,2.418491,32.608111,False,{'Imputer': {'categorical_impute_strategy': 'm...
48,1,Random Forest Regressor w/ Imputer + RF Regres...,1,46.01936,46.01936,1.950507,32.094016,False,{'Imputer': {'categorical_impute_strategy': 'm...
110,2,Extra Trees Regressor w/ Imputer + Select Colu...,2,47.166381,47.166381,2.972472,30.401476,False,{'Imputer': {'categorical_impute_strategy': 'm...
124,3,XGBoost Regressor w/ Imputer + Select Columns ...,3,50.025294,50.025294,3.034669,26.182875,False,{'Imputer': {'categorical_impute_strategy': 'm...
146,0,Mean Baseline Regression Pipeline,0,67.769226,67.769226,3.190989,0.0,False,{'Baseline Regressor': {'strategy': 'mean'}}


In [32]:

# See more details of the best pipeline
best_pipeline_id2 = MAEautoml.rankings.iloc[0]["id"]
MAEautoml.describe_pipeline(best_pipeline_id2)


***********************************************************************************
* Elastic Net Regressor w/ Imputer + Standard Scaler + Select Columns Transformer *
***********************************************************************************

Problem Type: regression
Model Family: Linear

Pipeline Steps
1. Imputer
	 * categorical_impute_strategy : most_frequent
	 * numeric_impute_strategy : most_frequent
	 * boolean_impute_strategy : most_frequent
	 * categorical_fill_value : None
	 * numeric_fill_value : None
	 * boolean_fill_value : None
2. Standard Scaler
3. Select Columns Transformer
	 * columns : ['AGE', 'BMI', 'BP', 'S3', 'S5']
4. Elastic Net Regressor
	 * alpha : 0.0007787658410143285
	 * l1_ratio : 0.9922115592912177
	 * max_iter : 1000

Training
Training for regression problems.
Total training time (including CV): 0.2 seconds

Cross Validation


----------------
               MAE  ExpVariance  MaxError  MedianAE      MSE    R2  Root Mean Squared Error # Training # Validation
0           42.646        0.573   122.967    34.355 2759.083 0.572                   52.527        235          118
1           43.681        0.483   120.118    37.707 2892.894 0.483                   53.786        235          118
2           47.819        0.484   138.339    43.898 3166.604 0.482                   56.273        236          117
mean        44.715        0.513   127.141    38.653 2939.527 0.513                   54.195          -            -
std          2.737        0.052     9.802     4.841  207.724 0.051                    1.906          -            -
coef of var  0.061        0.101     0.077     0.125    0.071 0.100                    0.035          -            -


In [33]:

# Select the best pipeline and evaluate it using cross-validation
best_pipelineMAE = MAEautoml.get_pipeline(best_pipeline_id2)

In [34]:
# Fit the best pipeline on the full training data
best_pipelineMAE.fit(X_train, y_train)

pipeline = RegressionPipeline(component_graph={'Imputer': ['Imputer', 'X', 'y'], 'Standard Scaler': ['Standard Scaler', 'Imputer.x', 'y'], 'Select Columns Transformer': ['Select Columns Transformer', 'Standard Scaler.x', 'y'], 'Elastic Net Regressor': ['Elastic Net Regressor', 'Select Columns Transformer.x', 'y']}, parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'most_frequent', 'boolean_impute_strategy': 'most_frequent', 'categorical_fill_value': None, 'numeric_fill_value': None, 'boolean_fill_value': None}, 'Select Columns Transformer':{'columns': ['AGE', 'BMI', 'BP', 'S3', 'S5']}, 'Elastic Net Regressor':{'alpha': 0.0007787658410143285, 'l1_ratio': 0.9922115592912177, 'max_iter': 1000}}, random_seed=42)

In [35]:

# I had a worst score than when I started which mean I probably need more data than this.
# Will continue with other machine learning models with tensorflow or keras
# Score the best pipeline on the holdout data
holdout_score = best_pipelineMAE.score(X_holdout, y_holdout, objectives=["MAE"])
print("MAE score on holdout data:", holdout_score["MAE"])

MAE score on holdout data: 46.81820437079997
