In [1]:
#import dependencies 
import evalml
from evalml import AutoMLSearch
from evalml.utils import infer_feature_types
import woodwork as ww
import pandas as pd

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
# Load the tab-separated diabetes dataset straight from its hosting URL.
df = pd.read_csv(
    "https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt",
    sep="\t",
)
# Initialize the Woodwork schema on the frame (the inferred types are shown in the next cell).
df.ww.init()


In [3]:
# Display the Woodwork schema: physical type, logical type and semantic tags per column.
df.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AGE,int64,Integer,['numeric']
SEX,int64,Integer,['numeric']
BMI,float64,Double,['numeric']
BP,float64,Double,['numeric']
S1,int64,Integer,['numeric']
S2,float64,Double,['numeric']
S3,float64,Double,['numeric']
S4,float64,Double,['numeric']
S5,float64,Double,['numeric']
S6,int64,Integer,['numeric']


In [4]:
# Target is the "Y" column; the features are the ten remaining named predictor columns.
y = df["Y"]
X = df.loc[:, ["AGE", "SEX", "BMI", "BP", "S1", "S2", "S3", "S4", "S5", "S6"]]

In [5]:
# Hold out half of the rows for the final evaluation; the other half feeds the AutoML search.
X_train, X_holdout, y_train, y_holdout = evalml.preprocessing.split_data(
    X,
    y,
    problem_type="regression",
    test_size=0.5,
)

In [6]:
# Configure a regression AutoML search over the training split.
automl = AutoMLSearch(
    problem_type="regression",
    objective="R2",  # candidate pipelines are compared on R-squared
    X_train=X_train,
    y_train=y_train,
    max_batches=3,  # limit the search to three batches of pipelines
    verbose=False,
)

In [7]:
# Run the search. The return value (a per-batch dictionary of pipeline timings) is
# shown as the cell output; LightGBM still emits its own info logs despite verbose=False.
automl.search()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000097 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 212
[LightGBM] [Info] Number of data points in the train set: 147, number of used features: 5
[LightGBM] [Info] Start training from score 151.544218
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000074 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 218
[LightGBM] [Info] Number of data points in the train set: 147, number of used features: 5
[LightGBM] [Info] Start training from score 148.659864
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000058 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 213
[LightGBM] [Info] Number of data points in the train set: 148, number of used features: 5
[LightGBM] [Info] Start training fr

{1: {'Random Forest Regressor w/ Imputer + RF Regressor Select From Model': 1.1365916728973389,
  'Total time of batch': 1.4710361957550049},
 2: {'Extra Trees Regressor w/ Imputer + Select Columns Transformer': 0.3943450450897217,
  'XGBoost Regressor w/ Imputer + Select Columns Transformer': 0.32976388931274414,
  'LightGBM Regressor w/ Imputer + Select Columns Transformer': 0.1967637538909912,
  'Elastic Net Regressor w/ Imputer + Standard Scaler + Select Columns Transformer': 0.22441411018371582,
  'Total time of batch': 2.6641743183135986},
 3: {'Elastic Net Regressor w/ Imputer + Standard Scaler + Select Columns Transformer': 0.9331040382385254,
  'Extra Trees Regressor w/ Imputer + Select Columns Transformer': 4.855446100234985,
  'LightGBM Regressor w/ Imputer + Select Columns Transformer': 1.14670991897583,
  'Total time of batch': 115.2239511013031}}

In [8]:
# Leaderboard of the pipelines evaluated by the search, with the mean baseline for
# reference (as displayed below, the highest ranking_score comes first).
automl.rankings


Unnamed: 0,id,pipeline_name,search_order,ranking_score,mean_cv_score,standard_deviation_cv_score,percent_better_than_baseline,high_variance_cv,parameters
0,5,Elastic Net Regressor w/ Imputer + Standard Sc...,5,0.518365,0.518365,0.068912,21475.142728,False,{'Imputer': {'categorical_impute_strategy': 'm...
43,107,LightGBM Regressor w/ Imputer + Select Columns...,107,0.489227,0.489227,0.088714,20273.608548,False,{'Imputer': {'categorical_impute_strategy': 'm...
56,121,Extra Trees Regressor w/ Imputer + Select Colu...,121,0.455414,0.455414,0.054512,18879.317758,False,{'Imputer': {'categorical_impute_strategy': 'm...
112,1,Random Forest Regressor w/ Imputer + RF Regres...,1,0.40167,0.40167,0.113686,16663.143432,False,{'Imputer': {'categorical_impute_strategy': 'm...
127,3,XGBoost Regressor w/ Imputer + Select Columns ...,3,0.290017,0.290017,0.156608,12059.03473,False,{'Imputer': {'categorical_impute_strategy': 'm...
139,0,Mean Baseline Regression Pipeline,0,-0.002425,-0.002425,0.001869,0.0,False,{'Baseline Regressor': {'strategy': 'mean'}}


In [9]:
# Describe the top-ranked pipeline (row 0 of the rankings table): its steps,
# parameters, training time and cross-validation scores.
automl.describe_pipeline(automl.rankings.iloc[0]["id"])



***********************************************************************************
* Elastic Net Regressor w/ Imputer + Standard Scaler + Select Columns Transformer *
***********************************************************************************

Problem Type: regression
Model Family: Linear

Pipeline Steps
1. Imputer
	 * categorical_impute_strategy : most_frequent
	 * numeric_impute_strategy : mean
	 * boolean_impute_strategy : most_frequent
	 * categorical_fill_value : None
	 * numeric_fill_value : None
	 * boolean_fill_value : None
2. Standard Scaler
3. Select Columns Transformer
	 * columns : ['BMI', 'BP', 'S1', 'S2', 'S5']
4. Elastic Net Regressor
	 * alpha : 0.0001
	 * l1_ratio : 0.15
	 * max_iter : 1000

Training
Training for regression problems.
Total training time (including CV): 0.2 seconds

Cross Validation
----------------
               R2  ExpVariance  MaxError  MedianAE      MSE    MAE  Root Mean Squared Error # Training # Validation
0           0.521        0.523

In [10]:
# Take the best pipeline found by the search and report its R2 on the holdout split.
pipeline = automl.best_pipeline
pipeline.score(X_holdout, y_holdout, objectives=["R2"])

OrderedDict([('R2', 0.4246045193098764)])

In [11]:
### Second attempt: try to improve on the holdout score from the first search.
# Reuses `df`, `X` and `y` (and the imports) from the cells above instead of
# re-importing the libraries and downloading the dataset a second time.
#
# NOTE: this run differs from the first one in two ways at once -- a smaller
# holdout (20% instead of 50%) and an explicit AutoML random seed -- and it is
# scored on a different-size holdout set, so the two holdout scores are not
# directly comparable.

# Split data into training and holdout sets, keeping 80% of the rows for training
X_train, X_holdout, y_train, y_holdout = evalml.preprocessing.split_data(
    X, y, problem_type="regression", test_size=0.2
)

# Initialize AutoML search.
# `optimize_thresholds` is intentionally not passed: it only concerns binary
# classification thresholds and has no bearing on a regression search.
automl = AutoMLSearch(
    X_train=X_train,
    y_train=y_train,
    problem_type="regression",
    objective="R2",
    max_batches=3,
    verbose=False,
    random_seed=42,  # Set a random seed for reproducibility
)


In [12]:
# Run the second AutoML search to find the best pipeline. The returned per-batch
# dictionary of pipeline timings is shown as the cell output.
automl.search()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000092 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 262
[LightGBM] [Info] Number of data points in the train set: 235, number of used features: 5
[LightGBM] [Info] Start training from score 154.170213
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000074 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 258
[LightGBM] [Info] Number of data points in the train set: 235, number of used features: 5
[LightGBM] [Info] Start training from score 155.229787
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000104 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 263
[LightGBM] [Info] Number of data points in the train set: 236, number of used features: 5
[LightGBM] [Info] Start training fr

{1: {'Random Forest Regressor w/ Imputer + RF Regressor Select From Model': 1.1349549293518066,
  'Total time of batch': 1.467993974685669},
 2: {'Extra Trees Regressor w/ Imputer + Select Columns Transformer': 0.4243354797363281,
  'XGBoost Regressor w/ Imputer + Select Columns Transformer': 0.3467841148376465,
  'LightGBM Regressor w/ Imputer + Select Columns Transformer': 0.18872857093811035,
  'Elastic Net Regressor w/ Imputer + Standard Scaler + Select Columns Transformer': 0.2127666473388672,
  'Total time of batch': 2.7142741680145264},
 3: {'Elastic Net Regressor w/ Imputer + Standard Scaler + Select Columns Transformer': 0.4167017936706543,
  'Extra Trees Regressor w/ Imputer + Select Columns Transformer': 0.9178116321563721,
  'Random Forest Regressor w/ Imputer + Select Columns Transformer': 3.23726749420166,
  'Total time of batch': 201.09188413619995}}

In [13]:
# Display the pipeline rankings for the second search (mean baseline included for reference)
automl.rankings

Unnamed: 0,id,pipeline_name,search_order,ranking_score,mean_cv_score,standard_deviation_cv_score,percent_better_than_baseline,high_variance_cv,parameters
0,93,Elastic Net Regressor w/ Imputer + Standard Sc...,93,0.512648,0.512648,0.051043,1876.015718,False,{'Imputer': {'categorical_impute_strategy': 'm...
41,55,Extra Trees Regressor w/ Imputer + Select Colu...,55,0.49595,0.49595,0.05376,1818.166512,False,{'Imputer': {'categorical_impute_strategy': 'm...
86,71,Random Forest Regressor w/ Imputer + Select Co...,71,0.482107,0.482107,0.053982,1770.208792,False,{'Imputer': {'categorical_impute_strategy': 'm...
104,1,Random Forest Regressor w/ Imputer + RF Regres...,1,0.460708,0.460708,0.048699,1696.074867,False,{'Imputer': {'categorical_impute_strategy': 'm...
106,4,LightGBM Regressor w/ Imputer + Select Columns...,4,0.458727,0.458727,0.061707,1689.210129,False,{'Imputer': {'categorical_impute_strategy': 'm...
154,3,XGBoost Regressor w/ Imputer + Select Columns ...,3,0.337304,0.337304,0.096613,1268.555839,False,{'Imputer': {'categorical_impute_strategy': 'm...
155,0,Mean Baseline Regression Pipeline,0,-0.028865,-0.028865,0.024635,0.0,False,{'Baseline Regressor': {'strategy': 'mean'}}


In [14]:

# See more details of the best pipeline.
# The id of the top-ranked row is kept in `best_pipeline_id` because a later cell
# uses it to retrieve the pipeline object.
best_pipeline_id = automl.rankings.iloc[0]["id"]
automl.describe_pipeline(best_pipeline_id)


***********************************************************************************
* Elastic Net Regressor w/ Imputer + Standard Scaler + Select Columns Transformer *
***********************************************************************************

Problem Type: regression
Model Family: Linear

Pipeline Steps
1. Imputer
	 * categorical_impute_strategy : most_frequent
	 * numeric_impute_strategy : knn
	 * boolean_impute_strategy : most_frequent
	 * categorical_fill_value : None
	 * numeric_fill_value : None
	 * boolean_fill_value : None
2. Standard Scaler
3. Select Columns Transformer
	 * columns : ['AGE', 'BMI', 'BP', 'S3', 'S5']
4. Elastic Net Regressor
	 * alpha : 0.03305073290054839
	 * l1_ratio : 0.34507124802668304
	 * max_iter : 1000

Training
Training for regression problems.
Total training time (including CV): 0.2 seconds

Cross Validation
----------------
               R2  ExpVariance  MaxError  MedianAE      MSE    MAE  Root Mean Squared Error # Training # Validation
0 

In [15]:

# Retrieve the top-ranked pipeline object by its id. No evaluation happens here;
# the pipeline is fitted in the next cell and scored on the holdout set afterwards.
best_pipeline = automl.get_pipeline(best_pipeline_id)

In [16]:
# Fit the best pipeline on the whole training split (X_train / y_train).
# The fitted pipeline is returned, so its repr is shown as the cell output.
best_pipeline.fit(X_train, y_train)

pipeline = RegressionPipeline(component_graph={'Imputer': ['Imputer', 'X', 'y'], 'Standard Scaler': ['Standard Scaler', 'Imputer.x', 'y'], 'Select Columns Transformer': ['Select Columns Transformer', 'Standard Scaler.x', 'y'], 'Elastic Net Regressor': ['Elastic Net Regressor', 'Select Columns Transformer.x', 'y']}, parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'knn', 'boolean_impute_strategy': 'most_frequent', 'categorical_fill_value': None, 'numeric_fill_value': None, 'boolean_fill_value': None}, 'Select Columns Transformer':{'columns': ['AGE', 'BMI', 'BP', 'S3', 'S5']}, 'Elastic Net Regressor':{'alpha': 0.03305073290054839, 'l1_ratio': 0.34507124802668304, 'max_iter': 1000}}, random_seed=42)

In [17]:

# Score the best pipeline on the holdout data.
# Result: the score is worse than when I started, which probably means I need
# more data than this.
# NOTE(review): the first run was scored on a 50% holdout and this one on a 20%
# holdout, so the drop may partly reflect the different (smaller) holdout set
# rather than a worse model -- confirm with a like-for-like split.
# Next step: continue with other machine learning models using TensorFlow or Keras.
holdout_score = best_pipeline.score(X_holdout, y_holdout, objectives=["R2"])
print("R-squared score on holdout data:", holdout_score["R2"])

R-squared score on holdout data: 0.327779918327495
