# Before Building Model

In [1]:
from IPython.display import clear_output
!pip install evalml --user
clear_output()

import numpy as np
import pandas as pd
import os,random,warnings
warnings.simplefilter('ignore')

import evalml
from evalml.automl import AutoMLSearch

TRAIN_PATH = "../input/widsdatathon2022/train.csv"
TEST_PATH = "../input/widsdatathon2022/test.csv"
SAMPLE_SUBMISSION_PATH = "../input/widsdatathon2022/sample_solution.csv"
SUBMISSION_PATH = "submission.csv"

ID = "id"
TARGET = "site_eui"

SEED = 2022
def seed_everything(seed=SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything()

MODEL_PROBLEM_TYPE = "regression"
MODEL_MAX_BATCHES = 10
MODEL_OBJECTIVE='Root Mean Squared Error'
MODEL_OBJECTIVE_ADDITIONAL=['MSE', 'R2']
MODEL_SAVE_PATH = "house_price_evalml_model.pkl"

In [2]:
from evalml.objectives import get_core_objectives
from evalml.problem_types import ProblemTypes

for objective in get_core_objectives(ProblemTypes.REGRESSION):
    print(objective.name)

ExpVariance
MaxError
MedianAE
MSE
MAE
R2
Root Mean Squared Error


# Build Model

In [3]:
train = pd.read_csv(TRAIN_PATH)

X = train.drop([ID,TARGET],axis=1)
y= train[TARGET]

X_train, X_val, y_train, y_val = evalml.preprocessing.split_data(X, y, problem_type=MODEL_PROBLEM_TYPE)

automl = AutoMLSearch(
    X_train=X_train, 
    y_train=y_train, 
    problem_type=MODEL_PROBLEM_TYPE,
    objective=MODEL_OBJECTIVE,
    additional_objectives=MODEL_OBJECTIVE_ADDITIONAL,
    max_batches=MODEL_MAX_BATCHES,
    optimize_thresholds=True
)
automl.search()

display(automl.rankings)
automl.best_pipeline.save(MODEL_SAVE_PATH)
# check_model=automl.load('model.pkl')
automl.best_pipeline

Unnamed: 0,id,pipeline_name,search_order,mean_cv_score,standard_deviation_cv_score,validation_score,percent_better_than_baseline,high_variance_cv,parameters
0,39,CatBoost Regressor w/ Imputer,39,45.191334,2.344803,45.191334,21.7579,False,{'Imputer': {'categorical_impute_strategy': 'm...
2,9,XGBoost Regressor w/ Imputer + One Hot Encoder,9,45.519651,2.473636,45.519651,21.189468,False,{'Imputer': {'categorical_impute_strategy': 'm...
8,20,Random Forest Regressor w/ Imputer + One Hot E...,20,46.806593,2.068575,46.806593,18.961318,False,{'Imputer': {'categorical_impute_strategy': 'm...
14,50,LightGBM Regressor w/ Imputer + One Hot Encoder,50,47.848479,2.392774,47.848479,17.157448,False,{'Imputer': {'categorical_impute_strategy': 'm...
23,7,Extra Trees Regressor w/ Imputer + One Hot Enc...,7,50.130623,2.2995,50.130623,13.206253,False,{'Imputer': {'categorical_impute_strategy': 'm...
26,6,Decision Tree Regressor w/ Imputer + One Hot E...,6,50.724177,2.269236,50.724177,12.178601,False,{'Imputer': {'categorical_impute_strategy': 'm...
27,1,Elastic Net Regressor w/ Imputer + One Hot Enc...,1,50.935984,2.222189,50.935984,11.811889,False,{'Imputer': {'categorical_impute_strategy': 'm...
52,0,Mean Baseline Regression Pipeline,0,57.758335,2.149689,57.758335,0.0,False,{'Baseline Regressor': {'strategy': 'mean'}}


pipeline = RegressionPipeline(component_graph={'Imputer': ['Imputer', 'X', 'y'], 'CatBoost Regressor': ['CatBoost Regressor', 'Imputer.x', 'y']}, parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'median', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'CatBoost Regressor':{'n_estimators': 74, 'eta': 0.05671392060446587, 'max_depth': 8, 'bootstrap_type': None, 'silent': False, 'allow_writing_files': False, 'n_jobs': -1}}, random_seed=0)

In [4]:
automl.describe_pipeline(automl.rankings.iloc[0]["id"])


*********************************
* CatBoost Regressor w/ Imputer *
*********************************

Problem Type: regression
Model Family: CatBoost

Pipeline Steps
1. Imputer
	 * categorical_impute_strategy : most_frequent
	 * numeric_impute_strategy : median
	 * categorical_fill_value : None
	 * numeric_fill_value : None
2. CatBoost Regressor
	 * n_estimators : 74
	 * eta : 0.05671392060446587
	 * max_depth : 8
	 * bootstrap_type : None
	 * silent : False
	 * allow_writing_files : False
	 * n_jobs : -1

Training
Training for regression problems.
Total training time (including CV): 13.2 seconds

Cross Validation
----------------
             Root Mean Squared Error      MSE    R2 # Training # Validation
0                             47.586 2264.474 0.373     40,403       20,202
1                             42.900 1840.441 0.411     40,403       20,202
2                             45.087 2032.851 0.380     40,404       20,201
mean                          45.191 2045.922 0.388    

In [5]:
automl.best_pipeline.score(X_val, y_val, objectives=["Root Mean Squared Error",'MSE', 'R2'])

OrderedDict([('Root Mean Squared Error', 46.10136623937205),
             ('MSE', 2125.3359691367127),
             ('R2', 0.41186190851869675)])

# After Building Model

In [6]:
test = pd.read_csv(TEST_PATH)
X_test = test.drop([ID],axis=1)

sub = pd.read_csv(SAMPLE_SUBMISSION_PATH)
sub[TARGET] = automl.best_pipeline.predict(X_test)
sub.to_csv(SUBMISSION_PATH,index=False)
sub.head(10)

Unnamed: 0,id,site_eui
0,75757,232.211534
1,75758,204.460317
2,75759,270.711677
3,75760,246.140194
4,75761,243.041124
5,75762,256.382284
6,75763,247.99478
7,75764,235.270684
8,75765,183.93807
9,75766,270.494812
