# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
# Install a global 'ignore' filter. No category or module is given, so every
# warning raised after this point is suppressed, including deprecation notices.
warnings.filterwarnings('ignore')

In [3]:
import evalml



In [4]:
from evalml.data_checks import DefaultDataChecks
from evalml.automl import AutoMLSearch

# Read Datasets

In [5]:
# Prepare training data
# Load the training CSV and drop every row that contains a missing value.
df = pd.read_csv('train.csv').dropna()

# Every column except the last is treated as a feature; the last is the target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Per the original author's note, split_data is called for its DataFrame ->
# DataTable conversion (columns sharing a physical dtype can then be treated
# differently), not to create a real hold-out set. test_size is the fraction
# 100 / n_rows, so only a sliver of the rows is returned in temp1/temp2.
# NOTE(review): temp1/temp2 are not used by any visible cell - confirm that
# leaving those rows out of training is intended.
train_holdout_fraction = (1/df.shape[0])*100
X_train, temp1, y_train, temp2 = evalml.preprocessing.split_data(
    X, y,
    problem_type='regression',
    test_size=train_holdout_fraction,
)

# Display the feature columns of the converted training table.
train_feature_columns = list(df.columns)[:-1]
X_train[train_feature_columns]

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gender,category,Categorical,['category']
age,Int64,Integer,['numeric']
number_of_kids,Int64,Integer,['numeric']
day,Int64,Integer,['numeric']


In [7]:
# Prepare testing data
# Load the test CSV and drop every row that contains a missing value.
df2 = pd.read_csv('test.csv').dropna()

# Same layout as the training file: all columns but the last are features,
# the last column is the target. Note that X, y, temp1 and temp2 from the
# training cell are overwritten here.
X = df2.iloc[:, :-1]
y = df2.iloc[:, -1]

# Same split_data call as for the training data: test_size is the fraction
# 100 / n_rows, and the first returned pair is what gets used as the test set.
# NOTE(review): the rows returned in temp1/temp2 are never scored in the
# visible cells - confirm that this is intended.
test_holdout_fraction = (1/df2.shape[0])*100
X_test, temp1, y_test, temp2 = evalml.preprocessing.split_data(
    X, y,
    problem_type='regression',
    test_size=test_holdout_fraction,
)

# Display the feature columns of the converted test table.
test_feature_columns = list(df2.columns)[:-1]
X_test[test_feature_columns]

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gender,category,Categorical,['category']
age,Int64,Integer,['numeric']
number_of_kids,Int64,Integer,['numeric']
day,Int64,Integer,['numeric']


# Training

In [6]:
# Run evalml's default data checks on the training data before searching.
# The objective given here is the same one the AutoML search optimizes
# ("root mean squared error"), so the data is validated against the
# objective that is actually used rather than a different one (was "R2").
data_checks = DefaultDataChecks("regression", "root mean squared error")
# Last expression of the cell: the validation result is shown as cell output.
data_checks.validate(X_train, y_train)



In [7]:
# Configure the AutoML search for a regression model on the training data.
# optimize_thresholds is deliberately not passed: it controls tuning of the
# decision threshold for binary classification and does not apply to a
# regression search, so setting it here was only misleading.
automl = AutoMLSearch(
    X_train=X_train,
    y_train=y_train,
    problem_type='regression',
    objective="root mean squared error",  # lower score is better
    ensembling=True,
    max_batches=10,
)

# Run the search. This is the expensive step of the notebook: the recorded
# run below reports roughly 40 minutes for up to 10 batches of pipelines.
automl.search()

Generating pipelines to search over...
Ensembling will run every 9 batches.

*****************************
* Beginning pipeline search *
*****************************

Optimizing for Root Mean Squared Error. 
Lower score is better.

Using SequentialEngine to train and score pipelines.
Searching up to 10 batches for a total of 50 pipelines. 
Allowed model families: lightgbm, extra_trees, xgboost, linear_model, random_forest, decision_tree, catboost



FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Evaluating Baseline Pipeline: Mean Baseline Regression Pipeline
Mean Baseline Regression Pipeline:
	Starting cross validation
	Finished cross validation - mean Root Mean Squared Error: 2.820

*****************************
* Evaluating Batch Number 1 *
*****************************

Linear Regressor w/ Imputer + One Hot Encoder + Standard Scaler:
	Starting cross validation
	Finished cross validation - mean Root Mean Squared Error: 2.578
Decision Tree Regressor w/ Imputer + One Hot Encoder:
	Starting cross validation
	Finished cross validation - mean Root Mean Squared Error: 2.520
Random Forest Regressor w/ Imputer + One Hot Encoder:
	Starting cross validation
	Finished cross validation - mean Root Mean Squared Error: 2.506
LightGBM Regressor w/ Imputer + One Hot Encoder:
	Starting cross validation
	Finished cross validation - mean Root Mean Squared Error: 2.457
Elastic Net Regressor w/ Imputer + One Hot Encoder + Standard Scaler:
	Starting cross validation
	Finished cross validation - m


Search finished after 40:09            
Best pipeline: CatBoost Regressor w/ Imputer
Best pipeline Root Mean Squared Error: 2.420535


In [8]:
# Take the id from the first row of the rankings table and print the
# description (steps, parameters, cross-validation scores) of that pipeline.
top_ranked_id = automl.rankings.iloc[0]["id"]
automl.describe_pipeline(top_ranked_id)


*********************************
* CatBoost Regressor w/ Imputer *
*********************************

Problem Type: regression
Model Family: CatBoost

Pipeline Steps
1. Imputer
	 * categorical_impute_strategy : most_frequent
	 * numeric_impute_strategy : most_frequent
	 * categorical_fill_value : None
	 * numeric_fill_value : None
2. CatBoost Regressor
	 * n_estimators : 91
	 * eta : 0.47997769239788496
	 * max_depth : 5
	 * bootstrap_type : None
	 * silent : False
	 * allow_writing_files : False

Training
Training for regression problems.
Total training time (including CV): 1.4 seconds

Cross Validation
----------------
             Root Mean Squared Error  ExpVariance  MaxError  MedianAE   MSE   MAE    R2 # Training # Validation
0                              2.427        0.264     9.065     1.754 5.892 1.987 0.264     22,024       11,013
1                              2.409        0.264     9.207     1.743 5.805 1.964 0.263     22,025       11,012
2                              2.

# Testing

In [9]:
# Keep a handle on the best pipeline found by the search; it is scored on the
# test data in the next cell.
pipeline = automl.best_pipeline

In [10]:
# Score the best pipeline on the test table with the same objective the
# search optimized, so the result is comparable to the cross-validation score.
test_objectives = ["root mean squared error"]
pipeline.score(X_test, y_test, objectives=test_objectives)

OrderedDict([('Root Mean Squared Error', 4.525456587506269)])