In [1]:
#import dependencies 
import evalml
from evalml import AutoMLSearch
from evalml.utils import infer_feature_types
import woodwork as ww
import pandas as pd

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
# Load the tab-separated diabetes dataset straight from its public URL.
data_url = "https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt"
df = pd.read_csv(data_url, sep="\t")

# Attach a Woodwork schema to the frame so its column types are available via df.ww.
df.ww.init()


In [3]:
# Display the Woodwork typing information for df: the physical type, logical type
# and semantic tags recorded for each column.
df.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AGE,int64,Integer,['numeric']
SEX,int64,Integer,['numeric']
BMI,float64,Double,['numeric']
BP,float64,Double,['numeric']
S1,int64,Integer,['numeric']
S2,float64,Double,['numeric']
S3,float64,Double,['numeric']
S4,float64,Double,['numeric']
S5,float64,Double,['numeric']
S6,int64,Integer,['numeric']


In [4]:
# Feature matrix: the ten predictor columns, in their original order.
feature_columns = [
    "AGE",
    "SEX",
    "BMI",
    "BP",
    "S1",
    "S2",
    "S3",
    "S4",
    "S5",
    "S6",
]
X = df[feature_columns]

# Target vector: the "Y" column.
y = df["Y"]

In [5]:
# Split the data in half for a regression problem: one half is used for the
# AutoML search, the other is held out for the final evaluation.
holdout_split = evalml.preprocessing.split_data(
    X,
    y,
    problem_type="regression",
    test_size=0.5,
)
X_train, X_holdout, y_train, y_holdout = holdout_split

In [6]:
# Search settings: regression on the training half, optimising R2,
# limited to three batches, with verbose output turned off.
search_settings = {
    "X_train": X_train,
    "y_train": y_train,
    "problem_type": "regression",
    "objective": "R2",
    "max_batches": 3,
    "verbose": False,
}
automl = AutoMLSearch(**search_settings)

In [7]:
# Run the AutoML search configured above. The value displayed below the cell is a
# dict keyed by batch number, holding per-pipeline timings and a
# 'Total time of batch' entry for each batch.
automl.search()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000255 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 212
[LightGBM] [Info] Number of data points in the train set: 147, number of used features: 5
[LightGBM] [Info] Start training from score 151.544218
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000046 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 218
[LightGBM] [Info] Number of data points in the train set: 147, number of used features: 5
[LightGBM] [Info] Start training from score 148.659864
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000132 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 213
[LightGBM] [Info] Number of data points in the train set: 148, number of used features: 5
[LightGBM] [Info] Start training fr

{1: {'Random Forest Regressor w/ Imputer + RF Regressor Select From Model': 1.1830053329467773,
  'Total time of batch': 1.5119142532348633},
 2: {'Extra Trees Regressor w/ Imputer + Select Columns Transformer': 0.38246750831604004,
  'XGBoost Regressor w/ Imputer + Select Columns Transformer': 0.33624839782714844,
  'LightGBM Regressor w/ Imputer + Select Columns Transformer': 0.2045271396636963,
  'Elastic Net Regressor w/ Imputer + Standard Scaler + Select Columns Transformer': 0.1880030632019043,
  'Total time of batch': 2.6196019649505615},
 3: {'Elastic Net Regressor w/ Imputer + Standard Scaler + Select Columns Transformer': 0.3452754020690918,
  'Extra Trees Regressor w/ Imputer + Select Columns Transformer': 2.2104899883270264,
  'LightGBM Regressor w/ Imputer + Select Columns Transformer': 0.4421958923339844,
  'Total time of batch': 116.91970419883728}}

In [8]:
# Show the rankings table of the pipelines evaluated by the search, including each
# pipeline's id, name, cross-validation score and parameters.
automl.rankings


Unnamed: 0,id,pipeline_name,search_order,ranking_score,mean_cv_score,standard_deviation_cv_score,percent_better_than_baseline,high_variance_cv,parameters
0,5,Elastic Net Regressor w/ Imputer + Standard Sc...,5,0.518365,0.518365,0.068912,21475.142728,False,{'Imputer': {'categorical_impute_strategy': 'm...
43,107,LightGBM Regressor w/ Imputer + Select Columns...,107,0.489227,0.489227,0.088714,20273.608548,False,{'Imputer': {'categorical_impute_strategy': 'm...
56,121,Extra Trees Regressor w/ Imputer + Select Colu...,121,0.455414,0.455414,0.054512,18879.317758,False,{'Imputer': {'categorical_impute_strategy': 'm...
112,1,Random Forest Regressor w/ Imputer + RF Regres...,1,0.40167,0.40167,0.113686,16663.143432,False,{'Imputer': {'categorical_impute_strategy': 'm...
127,3,XGBoost Regressor w/ Imputer + Select Columns ...,3,0.290017,0.290017,0.156608,12059.03473,False,{'Imputer': {'categorical_impute_strategy': 'm...
139,0,Mean Baseline Regression Pipeline,0,-0.002425,-0.002425,0.001869,0.0,False,{'Baseline Regressor': {'strategy': 'mean'}}


In [None]:
# See more detail for the pipeline whose id is 1.
# NOTE(review): in the rankings output captured in this notebook, id 1 is the
# Random Forest pipeline, not the top-ranked one; pass a different id from the
# rankings table's `id` column to inspect another pipeline.
automl.describe_pipeline(1)


In [None]:
# Take the best pipeline found by the search and evaluate it on the held-out
# half of the data using the R2 objective.
holdout_objectives = ["R2"]
pipeline = automl.best_pipeline
pipeline.score(X_holdout, y_holdout, holdout_objectives)

In [None]:
# Build an X_new dataset to predict on.
# X_new must contain the same feature columns the pipeline was trained on
# (AGE, SEX, BMI, BP, S1, S2, S3, S4, S5, S6). As a runnable stand-in we reuse
# the first few holdout rows; replace this with your actual new observations.
X_new = X_holdout.head(3)

# Make predictions on the new data with the best pipeline found by the search.
# Reading it from `automl` directly means this cell does not depend on a
# separately assigned pipeline variable.
predictions = automl.best_pipeline.predict(X_new)

# Print the predictions
print(predictions)