# **Algorithm Selection**

## **Pipeline**

### **Functions**

In [None]:
def import_data(path, messages=True):
  """Load the CSV file at `path` into a pandas DataFrame.

  When `messages` is truthy, the (rows, columns) shape of the loaded
  frame is printed before the frame is returned.
  """
  from pandas import read_csv
  frame = read_csv(path)
  if messages:
    print(frame.shape)
  return frame

def bin_groups(df, features=None, cutoff=0.05, replace_with='Other', messages=True):
  """Collapse rare categories of non-numeric columns into a single group.

  For every requested column that is not numeric, each category whose share
  of the rows (count / total number of rows in df) is below `cutoff` is
  replaced by `replace_with`.

  Parameters:
    df: DataFrame to process. It is modified in place and also returned.
    features: column names to consider; None or an empty sequence means
      every column of df.
    cutoff: minimum proportion of rows a category needs to be kept as-is.
    replace_with: value written in place of the rare categories.
    messages: when truthy, print what was binned and which names were
      not found.

  Returns:
    The same DataFrame object, after binning.
  """
  import pandas as pd
  # None replaces the former mutable [] default; an empty list still means "all columns"
  if features is None or len(features) == 0:
    features = df.columns
  for feat in features:
    if feat not in df.columns:  # Make sure they don't accidentally enter a feature name that doesn't exist
      if messages: print(f'{feat} not found in the DataFrame provided. No binning performed')
      continue
    if pd.api.types.is_numeric_dtype(df[feat]):
      continue  # Only categorical/text columns are binned
    counts = df[feat].value_counts()  # computed once; NaN values are not counted
    other_list = counts[counts / df.shape[0] < cutoff].index
    if len(other_list) > 0:
      df.loc[df[feat].isin(other_list), feat] = replace_with
      if messages: print(f'{feat} has been binned by setting {other_list.values} to {replace_with}')
  return df

def missing_drop(df, label, row_thresh=0.7, col_thresh=0.9, drop_all=False):
  """Drop rows and columns of df that contain too many missing values.

  All drops happen in place on df, in this order:
    1. rows whose `label` value is missing;
    2. columns, then rows, that hold no non-missing value at all;
    3. columns with fewer than round(n_rows * row_thresh) non-missing values;
    4. rows with fewer than round(n_cols * col_thresh) non-missing values;
    5. when `drop_all` is truthy, every row that still has a missing value.

  Returns the same (mutated) DataFrame.
  """
  # A row without a label value is removed first
  df.dropna(subset=[label], axis='rows', inplace=True)

  # Remove fully empty columns, then fully empty rows
  for axis_name in ('columns', 'rows'):
    df.dropna(axis=axis_name, thresh=1, inplace=True)

  # Thresholds are recomputed from the current shape because earlier drops change it
  min_filled_per_column = round(df.shape[0] * row_thresh)
  df.dropna(axis='columns', thresh=min_filled_per_column, inplace=True)

  min_filled_per_row = round(df.shape[1] * col_thresh)
  df.dropna(axis='rows', thresh=min_filled_per_row, inplace=True)

  if drop_all:
    df.dropna(axis='rows', inplace=True)
  return df

def Xandy(df, label):
  """Split df into features and target.

  Returns a tuple (X, y): X is df without the `label` column and y is the
  `label` column itself.
  """
  target = df[label]
  features = df.drop(columns=[label])
  return features, target

def dummy_code(X):
  """Return X with its categorical columns replaced by dummy variables.

  The first level of every encoded column is dropped (drop_first=True).
  """
  from pandas import get_dummies
  return get_dummies(X, drop_first=True)

def minmax(X):
  """Return a min-max scaled copy of X with the same column and index labels.

  Scaling is done by scikit-learn's MinMaxScaler with its default settings.
  """
  import pandas as pd
  from sklearn.preprocessing import MinMaxScaler
  scaler = MinMaxScaler()
  scaled_values = scaler.fit_transform(X.copy())
  return pd.DataFrame(scaled_values, columns=X.columns, index=X.index)

def impute_KNN(df, label, neighbors=5):
  """Fill missing feature values with a K-nearest-neighbors imputer.

  The features (every column except `label`) are dummy-coded and min-max
  scaled before imputation, so the returned frame holds the encoded, scaled
  and imputed features followed by the untouched `label` column.

  Parameters:
    df: DataFrame holding the features and the label column.
    label: name of the label column; it is excluded from imputation.
    neighbors: number of neighbors passed to KNNImputer.

  Returns:
    A new DataFrame indexed like df.
  """
  from sklearn.impute import KNNImputer
  import pandas as pd
  X, y = Xandy(df, label)
  # Both helpers return new frames, so no defensive copies are needed
  X = minmax(dummy_code(X))
  imp = KNNImputer(n_neighbors=neighbors, weights="uniform")
  X = pd.DataFrame(imp.fit_transform(X), columns=X.columns, index=X.index)
  # X and y carry df's index, so the label is re-attached row for row.
  # An index-on-index merge would multiply rows whenever that index
  # contains duplicate values.
  X[label] = y
  return X

def fit_cv_regression(df, k, label, repeat=True, algorithm='ensemble', random_state=1, messages=True):
  """Cross-validate two regressors of one family and return the better one, fitted.

  Parameters:
    df: DataFrame with the features and the `label` column.
    k: number of cross-validation folds.
    label: name of the target column.
    repeat: use RepeatedKFold (5 repeats) when truthy, otherwise a shuffled KFold.
    algorithm: 'linear' compares Ridge and LassoLars, 'ensemble' compares
      RandomForestRegressor and GradientBoostingRegressor, any other value
      compares MLPRegressor and KNeighborsRegressor.
    random_state: seed for the cross-validator and the seedable models.
    messages: when truthy, print the mean R2 of both candidates.

  Returns:
    The candidate with the higher mean R2 (the second one unless the first
    scores strictly higher), fitted on all of the dummy-coded features.
  """
  from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
  from numpy import mean

  X, y = Xandy(df, label)
  X = dummy_code(X)

  if repeat:
    cv = RepeatedKFold(n_splits=k, n_repeats=5, random_state=random_state)
  else:
    cv = KFold(n_splits=k, random_state=random_state, shuffle=True)

  # Choose the pair of candidates; imports stay lazy so only the needed family is loaded
  if algorithm == 'linear':
    from sklearn.linear_model import Ridge, LassoLars
    candidates = (Ridge(random_state=random_state), LassoLars(random_state=random_state))
  elif algorithm == 'ensemble':
    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
    candidates = (RandomForestRegressor(random_state=random_state),
                  GradientBoostingRegressor(random_state=random_state))
  else:
    from sklearn.neural_network import MLPRegressor
    from sklearn.neighbors import KNeighborsRegressor
    candidates = (MLPRegressor(random_state=random_state, max_iter=10000), KNeighborsRegressor())

  # One shared scoring path for whichever pair was chosen
  scores = [mean(cross_val_score(candidate, X, y, scoring='r2', cv=cv, n_jobs=-1))
            for candidate in candidates]

  if messages:
    for candidate, score in zip(candidates, scores):
      print('R2', '{: <25}'.format(type(candidate).__name__), round(score, 4))

  first, second = candidates
  winner = first if scores[0] > scores[1] else second
  return winner.fit(X, y)

def select_features(df, label, model, max='auto'):
  """Keep only the columns of df that an already fitted model selects.

  Parameters:
    df: DataFrame with the features and the `label` column.
    label: name of the target column; it is always kept.
    model: fitted estimator handed to SelectFromModel with prefit=True.
    max: 'auto' leaves SelectFromModel at its default selection; any other
      value is treated as a fraction of the feature count and caps the
      number of selected features at round(max * n_features).

  Returns:
    df restricted to the selected feature columns plus `label`.
  """
  from sklearn.feature_selection import SelectFromModel
  X, _ = Xandy(df, label)

  selector_options = {'prefit': True}
  if max != 'auto':
    selector_options['max_features'] = round(max * X.shape[1])
  sel = SelectFromModel(model, **selector_options)

  # NOTE(review): the transformed array is discarded; presumably this call is
  # kept as a check that X matches what the model was fitted on -- confirm.
  sel.transform(X)

  kept = list(X.columns[sel.get_support()])
  kept.append(label)
  return df[kept]

def dump_pickle(model, file_name):
  """Serialize `model` to the file `file_name` using pickle.

  The file is opened in binary write mode (overwriting any existing file)
  and is closed again even if pickling raises.
  """
  import pickle
  with open(file_name, "wb") as handle:
    pickle.dump(model, handle)

def load_pickle(file_name):
  """Load and return the object pickled in the file `file_name`.

  The file is closed again even if unpickling raises.
  """
  import pickle
  # NOTE: unpickling can execute arbitrary code; only load files you trust
  with open(file_name, "rb") as handle:
    return pickle.load(handle)

### **Choreography**

In [None]:
# Choreography: run the pipeline functions end to end on the housing data
# and save the final fitted model to disk.

# Don't forget to mount Google Drive if you need it:
from google.colab import drive
drive.mount('/content/drive')

# Setting the label here since it is used in multiple function calls
label = 'SalePrice'

# Import the data
df = import_data('/content/drive/MyDrive/Colab Notebooks/data/housing.csv', messages=False)

# Clean/prepare the data: bin rare categories, drop sparse rows/columns, impute the rest
df = bin_groups(df, messages=False)
df = missing_drop(df, label)
df = impute_KNN(df, label)

# Select features and store a trained model
model = fit_cv_regression(df, k=10, label=label) # We have to begin with a trained model
df_reduced = select_features(df.copy(), label, model) # Use that model to select features
model = fit_cv_regression(df_reduced, k=10, label=label)  # Retrain the model with the smaller feature set

# Deployment pipeline: serialize the fitted model so it can be reloaded later
dump_pickle(model, 'saved_model.sav')

Mounted at /content/drive
R2 RandomForestRegressor     0.86
R2 GradientBoostingRegressor 0.8808




R2 RandomForestRegressor     0.8576
R2 GradientBoostingRegressor 0.8653


## **Regression Algorithms**

### **Linear Models**
The family of algorithms that use linear (or quadratic) functions to reduce the complexity of a dataset to a set of estimated values (i.e., coefficients).


In [None]:
# This cell creates the shared `fit` and `models` dictionaries, the feature/label
# split (X, y) and the cross-validator `cv`; the later algorithm cells reuse them.

# Suppress scientific notation in pandas
import pandas as pd
pd.options.display.float_format = '{:.6f}'.format

fit = {}         # Use this to store each of the fit metrics
models = {}      # Use this to store each of the models
random_state = 1 # Updates all models and cross-validators at once

# 1. LINEAR MODELS: assumes normal distribution, homoscedasticity, no multi-collinearity, independence, and no auto-correlation (some exceptions apply; some of these algorithms are better at handling violations of these assumptions)
import sklearn.linear_model as lm, pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from numpy import mean

# Let's use the reduced feature set we established in our pipeline
X, y = Xandy(df_reduced, label)

# Set up a standard cross_validation object to use for each algorithm
cv = KFold(n_splits=5, random_state=random_state, shuffle=True)

# Every model below follows the same pattern: build the estimator, store its mean
# cross-validated R squared in `fit`, and keep the (unfitted) estimator in `models`.

# 1.1. Ordinary Least Squares Multiple Linear Regression
model_ols = lm.LinearRegression()
fit['OLS'] = mean(cross_val_score(model_ols, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['OLS'] = model_ols

# 1.2. Ridge Regression: more robust to multi-collinearity
model_rr = lm.Ridge(alpha=0.5, random_state=random_state) # adjust this alpha parameter for better results (between 0 and 1)
fit['Ridge'] = mean(cross_val_score(model_rr, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['Ridge'] = model_rr

# 1.3. Lasso Regression: better for sparse values like RetweetCount where most are zeros but a few have many retweets.
model_lr = lm.Lasso(alpha=0.1, random_state=random_state) # adjust this alpha parameter for better results (between 0 and 1)
fit['Lasso'] = mean(cross_val_score(model_lr, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['Lasso'] = model_lr

# 1.4. Least Angle Regression (LassoLars): good when the number of features is greater than the number of samples
model_llr = lm.LassoLars(alpha=0.1, random_state=random_state) # adjust this alpha parameter for better results (between 0 and 1)
fit['LARS'] = mean(cross_val_score(model_llr, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['LARS'] = model_llr

# 1.5. Bayesian Regression: probability based; allows regularization parameters, automatically tuned to data
model_br = lm.BayesianRidge()
fit['Bayesian'] = mean(cross_val_score(model_br, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['Bayesian'] = model_br

# 1.6. Generalized Linear Regression (Poisson): Good for non-normal distribution, count-based data, and a Poisson distribution
model_pr = lm.TweedieRegressor(power=1, link="log") # Power=1 means this is a Poisson
fit['Poisson'] = mean(cross_val_score(model_pr, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['Poisson'] = model_pr

# 1.7. Generalized Linear Regression (Gamma): Good for non-normal distribution, continuous data, and a Gamma distribution
model_gr = lm.TweedieRegressor(power=2, link="log") # Power=2 means this is a Gamma
fit['Gamma'] = mean(cross_val_score(model_gr, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['Gamma'] = model_gr

# 1.8. Generalized Linear Regression (Inverse Gamma): Good for non-normal distribution, continuous data, and an inverse Gamma distribution
model_igr = lm.TweedieRegressor(power=3) # Power=3 means this is an inverse Gamma
fit['Inverse'] = mean(cross_val_score(model_igr, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['Inverse'] = model_igr


# ----------------------------------------------------------------------------------------------------
# Sort and print the dictionary by greatest R squared to least
# (the sorted frame is the cell's last expression, so the notebook displays it)
df_fit = pd.DataFrame({'R-squared':fit})
df_fit.sort_values(by=['R-squared'], ascending=False)

Unnamed: 0,R-squared
Ridge,0.769304
Bayesian,0.767651
Lasso,0.767024
LARS,0.767021
OLS,0.767018
Poisson,0.34479
Gamma,0.16708
Inverse,-0.006506


### **Support Vector Machines**

In [None]:
# SUPPORT VECTOR MACHINES: Ideal for noisy data with large gaps among values
# Reuses `fit`, `models`, `X`, `y`, `cv`, `mean` and `cross_val_score` from the linear-models cell.
from sklearn import svm

# 1.9. SVM: this is the default SVM, parameters can be modified to make this more accurate
model_svm = svm.SVR()
fit['SupportVM'] = mean(cross_val_score(model_svm, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['SupportVM'] = model_svm

# 1.10. Linear SVM: Faster than SVM but only considers a linear model
model_lsvm = svm.LinearSVR(random_state=random_state)
fit['Linear SVM'] = mean(cross_val_score(model_lsvm, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['Linear SVM'] = model_lsvm

# 1.11. NuSVM: NuSVR with its default parameters
model_nusvm = svm.NuSVR()
fit['NuSupportVM'] = mean(cross_val_score(model_nusvm, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['NuSupportVM'] = model_nusvm

# ----------------------------------------------------------------------------------------------------
# Sort and print the dictionary by greatest R squared to least
df_fit = pd.DataFrame({'R-squared':fit})
df_fit.sort_values(by=['R-squared'], ascending=False)

Unnamed: 0,R-squared
Ridge,0.769304
Bayesian,0.767651
Lasso,0.767024
LARS,0.767021
OLS,0.767018
Poisson,0.34479
Gamma,0.16708
Inverse,-0.006506
NuSupportVM,-0.014332
SupportVM,-0.050286


### **KNN: Nearest Neighbors Regression**

In [None]:
# KNN: NEAREST NEIGHBORS REGRESSION
# Reuses `fit`, `models`, `X`, `y`, `cv`, `mean` and `cross_val_score` from the linear-models cell.
from sklearn import neighbors

# 1.12. KNeighborsRegressor: 10 neighbors, every neighbor weighted equally
model_knnr = neighbors.KNeighborsRegressor(n_neighbors=10, weights='uniform')
fit['KNNeighbors'] = mean(cross_val_score(model_knnr, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['KNNeighbors'] = model_knnr

# 1.13. KNeighborsRegressor: 10 neighbors, weighted by distance
model_knnrd = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance')
fit['KNNeighborsD'] = mean(cross_val_score(model_knnrd, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['KNNeighborsD'] = model_knnrd

# ----------------------------------------------------------------------------------------------------
# Sort and print the dictionary by greatest R squared to least
df_fit = pd.DataFrame({'R-squared':fit})
df_fit.sort_values(by=['R-squared'], ascending=False)

Unnamed: 0,R-squared
KNNeighborsD,0.818097
KNNeighbors,0.806524
Ridge,0.769304
Bayesian,0.767651
Lasso,0.767024
LARS,0.767021
OLS,0.767018
Poisson,0.34479
Gamma,0.16708
Inverse,-0.006506


### **Gaussian Process Regression**

In [None]:
# GAUSSIAN PROCESS REGRESSION
# Reuses `fit`, `models`, `X`, `y`, `cv`, `mean` and `cross_val_score` from the linear-models cell.
from sklearn import gaussian_process
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel

# 1.14. GaussianProcessRegressor: the kernel is the sum of a DotProduct and a WhiteKernel
model_gpr = gaussian_process.GaussianProcessRegressor(DotProduct() + WhiteKernel(), random_state=random_state)
fit['GaussianP'] = mean(cross_val_score(model_gpr, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['GaussianP'] = model_gpr

# ----------------------------------------------------------------------------------------------------
# Sort and print the dictionary by greatest R squared to least
df_fit = pd.DataFrame({'R-squared':fit})
df_fit.sort_values(by=['R-squared'], ascending=False)

Unnamed: 0,R-squared
KNNeighborsD,0.818097
KNNeighbors,0.806524
Ridge,0.769304
Bayesian,0.767651
Lasso,0.767024
LARS,0.767021
OLS,0.767018
Poisson,0.34479
Gamma,0.16708
GaussianP,-0.003952


### **Decision Tree and Ensemble Regressors**

In [None]:
# DECISION TREE MODELS: no assumptions about the data
# Reuses `fit`, `models`, `X`, `y`, `cv`, `mean` and `cross_val_score` from the linear-models cell.
import sklearn.tree as tree
import sklearn.ensemble as se

# 1.15. Decision Tree Regression
model_dt = tree.DecisionTreeRegressor(random_state=random_state)
fit['Dec Tree'] = mean(cross_val_score(model_dt, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['Dec Tree'] = model_dt


# DECISION TREE-BASED ENSEMBLE MODELS: great for minimizing overfitting, these are based on averaging many unique sub-samples and combining algorithms
# 1.16. Random Forest (stored under the 'Dec Forest' key)
model_df = se.RandomForestRegressor(random_state=random_state)
fit['Dec Forest'] = mean(cross_val_score(model_df, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['Dec Forest'] = model_df

# 1.17. ExtraTreesRegressor
model_etr = se.ExtraTreesRegressor(random_state=random_state)
fit['Extra Trees'] = mean(cross_val_score(model_etr, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['Extra Trees'] = model_etr

# 1.18. AdaBoostRegressor
model_abr = se.AdaBoostRegressor(n_estimators=100, random_state=random_state)
fit['AdaBoost DT'] = mean(cross_val_score(model_abr, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['AdaBoost DT'] = model_abr

# 1.19. GradientBoostingRegressor
model_gbr = se.GradientBoostingRegressor(random_state=random_state)
fit['Grad. Boost'] = mean(cross_val_score(model_gbr, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['Grad. Boost'] = model_gbr

# 1.20. HistGradientBoostingRegressor
model_hgbr = se.HistGradientBoostingRegressor(random_state=random_state)
fit['HG Boost'] = mean(cross_val_score(model_hgbr, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['HG Boost'] = model_hgbr

# 1.21. VotingRegressor: will combine other algorithms into an average; kind of cool
# (built from five of the tree-based estimators created above)
model_vr = se.VotingRegressor(estimators=[('DT', model_dt), ('DF', model_df), ('ETR', model_etr), ('ABR', model_abr), ('GBR', model_gbr)])
fit['Voting'] = mean(cross_val_score(model_vr, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['Voting'] = model_vr

# 1.22. StackingRegressor: Ridge, Lasso and SVR base models feeding a gradient-boosting final estimator
# `svm` is not imported in this cell; it comes from the Support Vector Machines cell.
# NOTE(review): LassoCV is seeded with a literal 42 rather than `random_state` -- confirm this is intended.
from sklearn.linear_model import RidgeCV, LassoCV
estimators = [('ridge', RidgeCV()), ('lasso', LassoCV(random_state=42)), ('svr', svm.SVR(C=1, gamma=1e-6))]
model_sr = se.StackingRegressor(estimators=estimators, final_estimator=se.GradientBoostingRegressor(random_state=random_state))
fit['Stacking'] = mean(cross_val_score(model_sr, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['Stacking'] = model_sr

# ----------------------------------------------------------------------------------------------------
# Sort and print the dictionary by greatest R squared to least
df_fit = pd.DataFrame({'R-squared':fit})
df_fit.sort_values(by=['R-squared'], ascending=False)

Unnamed: 0,R-squared
Grad. Boost,0.878953
Extra Trees,0.869922
Voting,0.864471
Dec Forest,0.863743
HG Boost,0.856875
KNNeighborsD,0.818097
AdaBoost DT,0.806571
KNNeighbors,0.806524
Ridge,0.769304
Bayesian,0.767651


### **XGBoost**
A separate package that offers a scikit-learn-compatible interface and an optimized algorithm for gradient-boosted decision trees

In [None]:
# Reuses `fit`, `models`, `X`, `y`, `cv`, `mean` and `cross_val_score` from the linear-models cell.
from xgboost import XGBRegressor

# 1.23. XGBRegressor
model_xgb = XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8, random_state=random_state)
fit['XGBoost'] = mean(cross_val_score(model_xgb, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['XGBoost'] = model_xgb

# ----------------------------------------------------------------------------------------------------
# Sort and print the dictionary by greatest R squared to least
df_fit = pd.DataFrame({'R-squared':fit})
df_fit.sort_values(by=['R-squared'], ascending=False)

Unnamed: 0,R-squared
Grad. Boost,0.878953
Extra Trees,0.869922
XGBoost,0.869801
Voting,0.864471
Dec Forest,0.863743
HG Boost,0.856875
KNNeighborsD,0.818097
AdaBoost DT,0.806571
KNNeighbors,0.806524
Ridge,0.769304


### **Neural Network Regressor**

In [None]:
# NEURAL-NETWORK MODELS: Based on deep learning methods
# Reuses `fit`, `models`, `X`, `y`, `cv`, `mean` and `cross_val_score` from the linear-models cell.
import sklearn.neural_network as nn

# 1.24. MLPRegressor
model_nn = nn.MLPRegressor(max_iter=1000, random_state=random_state) # Turn max_iter way up or down to get a more accurate result
fit['NeuralNet'] = mean(cross_val_score(model_nn, X, y, scoring='r2', cv=cv, n_jobs=-1))
models['NeuralNet'] = model_nn

# ----------------------------------------------------------------------------------------------------
# Sort and print the dictionary by greatest R squared to least
df_fit = pd.DataFrame({'R-squared':fit})
df_fit.sort_values(by=['R-squared'], ascending=False)

Unnamed: 0,R-squared
Grad. Boost,0.878953
Extra Trees,0.869922
XGBoost,0.869801
Voting,0.864471
Dec Forest,0.863743
HG Boost,0.856875
KNNeighborsD,0.818097
AdaBoost DT,0.806571
KNNeighbors,0.806524
Ridge,0.769304


### **Automating Algorithm Selection**

In [None]:
def fit_cv_regression_expanded(df, label, k=10, r=5, repeat=True, random_state=1):
  """Cross-validate a broad set of regressors and return the best one, fitted.

  Each candidate is scored by mean cross-validated R squared. The scores are
  printed as a table sorted from best to worst, and the top-ranked model is
  fitted on the full data set and returned.

  Parameters:
    df: DataFrame with the features and the `label` column.
    label: name of the target column.
    k: number of cross-validation folds.
    r: number of repeats; only used when `repeat` is truthy.
    repeat: use RepeatedKFold when truthy, otherwise a shuffled KFold.
    random_state: seed for the cross-validator and every seedable model.

  Returns:
    The best-scoring estimator, fitted on all rows of the dummy-coded features.
  """
  import pandas as pd
  import sklearn.ensemble as se
  import sklearn.linear_model as lm
  import sklearn.neighbors as neighbors
  import sklearn.neural_network as nn
  from numpy import mean
  from sklearn import gaussian_process, svm
  from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
  from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
  from xgboost import XGBRegressor

  X, y = Xandy(df, label)
  # Same encoding step as fit_cv_regression; a no-op for frames that are already numeric
  X = dummy_code(X)

  if repeat:
    cv = RepeatedKFold(n_splits=k, n_repeats=r, random_state=random_state)
  else:
    cv = KFold(n_splits=k, random_state=random_state, shuffle=True)

  # These four are scored on their own and also combined by the VotingRegressor
  model_df = se.RandomForestRegressor(random_state=random_state)
  model_etr = se.ExtraTreesRegressor(random_state=random_state)
  model_abr = se.AdaBoostRegressor(n_estimators=100, random_state=random_state)
  model_gbr = se.GradientBoostingRegressor(random_state=random_state)

  # Base learners for the StackingRegressor (LassoCV now follows random_state instead of a literal 42)
  stack_estimators = [('ridge', lm.RidgeCV()),
                      ('lasso', lm.LassoCV(random_state=random_state)),
                      ('svr', svm.SVR(C=1, gamma=1e-6))]

  # One registry of name -> model object; insertion order is the scoring order
  models = {
    'OLS': lm.LinearRegression(),
    'Ridge': lm.Ridge(alpha=0.5, random_state=random_state), # adjust this alpha parameter for better results (between 0 and 1)
    'Lasso': lm.Lasso(alpha=0.1, random_state=random_state), # adjust this alpha parameter for better results (between 0 and 1)
    'LARS': lm.LassoLars(alpha=0.1, random_state=random_state), # adjust this alpha parameter for better results (between 0 and 1)
    'Bayesian': lm.BayesianRidge(),
    'Poisson': lm.TweedieRegressor(power=1, link="log"), # Power=1 means this is a Poisson
    'Gamma': lm.TweedieRegressor(power=2, link="log"), # Power=2 means this is a Gamma
    'Inverse': lm.TweedieRegressor(power=3), # Power=3 means this is an inverse Gamma
    'SupportVM': svm.SVR(),
    'Linear SVM': svm.LinearSVR(random_state=random_state),
    'NuSupportVM': svm.NuSVR(),
    'KNNeighbors': neighbors.KNeighborsRegressor(n_neighbors=10, weights='uniform'),
    'KNNeighborsD': neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance'),
    'GaussianP': gaussian_process.GaussianProcessRegressor(DotProduct() + WhiteKernel(), random_state=random_state),
    'Dec Forest': model_df,
    'Extra Trees': model_etr,
    'AdaBoost DT': model_abr,
    'Grad. Boost': model_gbr,
    'HG Boost': se.HistGradientBoostingRegressor(random_state=random_state),
    'Voting': se.VotingRegressor(estimators=[('DF', model_df), ('ETR', model_etr), ('ABR', model_abr), ('GBR', model_gbr)]),
    'Stacking': se.StackingRegressor(estimators=stack_estimators, final_estimator=se.GradientBoostingRegressor(random_state=random_state)),
    'XGBoost': XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8, random_state=random_state),
    'NeuralNet': nn.MLPRegressor(max_iter=1000, random_state=random_state),
  }

  # Mean cross-validated R squared per model, keyed exactly like `models`
  fit = {name: mean(cross_val_score(candidate, X, y, scoring='r2', cv=cv, n_jobs=-1))
         for name, candidate in models.items()}

  # Add the fit dictionary to a new DataFrame, sort, extract the top row, use it to retrieve the model object from the models dictionary
  df_fit = pd.DataFrame({'R-squared':fit})
  df_fit.sort_values(by=['R-squared'], ascending=False, inplace=True)
  best_model = df_fit.index[0]
  print(df_fit)

  return models[best_model].fit(X, y)

In [None]:
# Same pipeline as the earlier choreography cell, except that the final model is
# chosen by fit_cv_regression_expanded and saved under a different file name.

# Don't forget to mount Google Drive if you need it:
# from google.colab import drive
# drive.mount('/content/drive')

# Setting the label here since it is used in multiple function calls
label = 'SalePrice'

# Import the data
df = import_data('/content/drive/MyDrive/Colab Notebooks/data/housing.csv', messages=False)

# Clean/prepare the data: bin rare categories, drop sparse rows/columns, impute the rest
df = bin_groups(df, messages=False)
df = missing_drop(df, label)
df = impute_KNN(df, label)

# Select features and store a trained model
model = fit_cv_regression(df, 10, label, messages=False) # We have to begin with a trained model
df_reduced = select_features(df.copy(), label, model) # Use that model to select features
model = fit_cv_regression_expanded(df_reduced, label, k=10, r=5)  # Compare all algorithms on the smaller feature set and keep the best one

# Deployment pipeline: serialize the fitted model so it can be reloaded later
dump_pickle(model, 'saved_model_1.sav')



              R-squared
Voting         0.865459
Grad. Boost    0.865341
XGBoost        0.861658
Extra Trees    0.860726
Dec Forest     0.857579
HG Boost       0.847380
KNNeighborsD   0.813139
AdaBoost DT    0.806821
KNNeighbors    0.798821
Ridge          0.767650
Bayesian       0.765822
Lasso          0.765009
LARS           0.765007
OLS            0.765001
Poisson        0.310829
Gamma          0.166642
GaussianP     -0.005630
Inverse       -0.008529
NuSupportVM   -0.016630
SupportVM     -0.052784
Stacking      -0.073539
NeuralNet     -4.179506
Linear SVM    -5.281764


In [None]:
# Later when a page loads that needs the predicted value(s):
# NOTE: this still relies on `df_reduced` from the previous cell, and it predicts on the
# same rows the saved model was fitted on, so 'Difference' is an in-sample error.
import pandas as pd
model = load_pickle('saved_model_1.sav')
df_predictions = pd.DataFrame({'Actual SalePrice':df_reduced.SalePrice,
                               'Predicted SalePrice':model.predict(df_reduced.drop(columns=['SalePrice']))})
df_predictions['Difference'] = df_predictions['Actual SalePrice'] - df_predictions['Predicted SalePrice']
print(df_predictions.tail())

      Actual SalePrice  Predicted SalePrice   Difference
1455            582933        532461.158691 50471.841309
1456            611657        572376.620922 39280.379078
1457            625000        606220.655697 18779.344303
1458            745000        729313.110401 15686.889599
1459            755000        729556.327137 25443.672863


## **Classification Algorithms**



### **Automating Classification Algorithm Selection**

In [None]:
def fit_cv_classification_expanded(df, label, k=10, r=5, repeat=True, random_state=1):
  """Cross-validate a broad set of classifiers and return the best one, fitted.

  Each candidate is scored by mean cross-validated accuracy. The scores are
  printed as a table sorted from best to worst, and the top-ranked model is
  fitted on the full data set and returned.

  Parameters:
    df: DataFrame with the features and the `label` column.
    label: name of the target column.
    k: number of cross-validation folds.
    r: number of repeats; only used when `repeat` is truthy.
    repeat: use RepeatedKFold when truthy, otherwise a shuffled KFold.
    random_state: seed for the cross-validator and the seeded models.

  Returns:
    The best-scoring estimator, fitted on all rows of the dummy-coded
    features. When XGBoost wins it is fitted on label-encoded targets (the
    same encoding it was scored with), so its predictions are the integer
    codes produced by LabelEncoder rather than the original label values.
  """
  import pandas as pd
  import sklearn.ensemble as se
  import sklearn.linear_model as lm
  from numpy import mean
  from sklearn import svm
  from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
  from sklearn.naive_bayes import CategoricalNB
  from sklearn.neighbors import KNeighborsClassifier
  from sklearn.neural_network import MLPClassifier
  from sklearn.preprocessing import LabelEncoder
  from xgboost import XGBClassifier

  X, y = Xandy(df, label)
  # Same encoding step as fit_cv_classification; a no-op for frames that are already numeric
  X = dummy_code(X)

  if repeat:
    cv = RepeatedKFold(n_splits=k, n_repeats=r, random_state=random_state)
  else:
    cv = KFold(n_splits=k, random_state=random_state, shuffle=True)

  # XGBoost needs to LabelEncode the y before fitting the model
  y_encoded = LabelEncoder().fit_transform(y)

  # These three are scored on their own and also combined by the VotingClassifier
  model_log = lm.LogisticRegression(max_iter=100)
  model_ext = se.ExtraTreesClassifier(n_estimators=100, random_state=random_state)
  model_hgb = se.HistGradientBoostingClassifier(max_iter=100, random_state=random_state)

  # One registry of name -> model object; insertion order is the scoring order
  models = {
    'Logistic': model_log,
    'Ridge': lm.RidgeClassifier(),
    'SGD': lm.SGDClassifier(max_iter=1000, tol=1e-3, random_state=random_state),
    'PassiveAggressive': lm.PassiveAggressiveClassifier(max_iter=1000, random_state=random_state, tol=1e-3),
    'Perceptron': lm.Perceptron(fit_intercept=False, max_iter=10, tol=None, shuffle=False),
    'KNN': KNeighborsClassifier(n_neighbors=3),
    'SVM': svm.SVC(decision_function_shape='ovo'), # Remove the parameter for two-class model
    'NaiveBayes': CategoricalNB(),
    'Bagging': se.BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5, random_state=random_state),
    'AdaBoost': se.AdaBoostClassifier(n_estimators=100, random_state=random_state),
    'ExtraTrees': model_ext,
    'RandomForest': se.RandomForestClassifier(n_estimators=10, random_state=random_state),
    'HistGradient': model_hgb,
    'Voting': se.VotingClassifier(estimators=[('lr', model_log), ('rf', model_ext), ('gnb', model_hgb)], voting='hard'),
    'GradBoost': se.GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=random_state),
    'NeuralN': MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=random_state),
    'XGBoost': XGBClassifier(random_state=random_state),
  }

  def target_for(name):
    # XGBoost is the only model scored and fitted on the encoded labels
    return y_encoded if name == 'XGBoost' else y

  # Mean cross-validated accuracy per model, keyed exactly like `models`
  fit = {name: mean(cross_val_score(candidate, X, target_for(name), scoring='accuracy', cv=cv, n_jobs=-1))
         for name, candidate in models.items()}

  # Add the fit dictionary to a new DataFrame, sort, extract the top row, use it to retrieve the model object from the models dictionary
  df_fit = pd.DataFrame({'Accuracy':fit})
  df_fit.sort_values(by=['Accuracy'], ascending=False, inplace=True)
  best_model = df_fit.index[0]
  print(df_fit)

  # Refit the winner on the same target representation it was scored with
  return models[best_model].fit(X, target_for(best_model))

In [None]:
def fit_cv_classification(df, k, label, repeat=True, algorithm='ensemble', random_state=1, messages=True):
  """Cross-validate two classifiers from one algorithm family and return the better one, fitted.

  Parameters:
    df: DataFrame holding the features and the label column.
    k: number of folds for cross-validation.
    label: name of the label column in df.
    repeat: if True use RepeatedKFold (5 repeats), otherwise a single shuffled KFold.
    algorithm: 'linear' (RidgeClassifier vs SGDClassifier), 'ensemble' (RandomForestClassifier
      vs GradientBoostingClassifier); any other value selects MLPClassifier vs KNeighborsClassifier.
    random_state: seed handed to the candidate models that accept one.
    messages: if True print the mean accuracy of both candidates.

  Returns:
    The candidate with the higher mean cross-validated accuracy, refit on all of the
    dummy-coded data. When the first is not strictly better, the second is returned.
  """
  from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
  from numpy import mean

  features, target = Xandy(df, label)
  features = dummy_code(features)

  # The fold splitter always uses the fixed seed 12345, independent of random_state
  if repeat:
    splitter = RepeatedKFold(n_splits=k, n_repeats=5, random_state=12345)
  else:
    splitter = KFold(n_splits=k, random_state=12345, shuffle=True)

  # Pick the pair of candidates for the requested family
  if algorithm == 'linear':
    from sklearn.linear_model import RidgeClassifier, SGDClassifier
    first = RidgeClassifier(random_state=random_state)
    second = SGDClassifier(random_state=random_state)
  elif algorithm == 'ensemble':
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    first = RandomForestClassifier(random_state=random_state)
    second = GradientBoostingClassifier(random_state=random_state)
  else:
    from sklearn.neural_network import MLPClassifier
    from sklearn.neighbors import KNeighborsClassifier
    first = MLPClassifier(random_state=random_state, max_iter=10000)
    second = KNeighborsClassifier()

  # Score both candidates on the same folds
  first_score = mean(cross_val_score(first, features, target, scoring='accuracy', cv=splitter, n_jobs=-1))
  second_score = mean(cross_val_score(second, features, target, scoring='accuracy', cv=splitter, n_jobs=-1))

  if messages:
    for estimator, score in ((first, first_score), (second, second_score)):
      print('Accuracy', '{: <25}'.format(type(estimator).__name__), round(score, 4))

  winner = first if first_score > second_score else second
  return winner.fit(features, target)

In [None]:
# End-to-end classification pipeline: load, clean, select features, pick the best algorithm, save it.
# Don't forget to mount Google Drive if you need it:
# from google.colab import drive
# drive.mount('/content/drive')

# Setting the label here since it is used in multiple function calls
label = 'SaleCondition'

# Import the data (messages=False suppresses the shape printout)
df = import_data('/content/drive/MyDrive/Colab Notebooks/data/housing.csv', messages=False)

# Clean/prepare the data: bin rare categories, drop sparse rows/columns, then impute what is left
df = bin_groups(df, messages=False)
df = missing_drop(df, label)
df = impute_KNN(df, label)

# Select features and store a trained model
model = fit_cv_classification(df, 10, label, messages=False) # We have to begin with a trained model (10-fold CV, default 'ensemble' family)
df_reduced = select_features(df.copy(), label, model) # Use that model to select features; a copy is passed so df itself is not handed to select_features
model = fit_cv_classification_expanded(df_reduced, label, k=10, r=5)  # Retrain the model with the smaller feature set, this time comparing the full list of algorithms

# Deployment pipeline: persist the winning fitted model so it can be loaded later for predictions
dump_pickle(model, 'saved_model_clf.sav')

Accuracy RandomForestClassifier    0.899
Accuracy GradientBoostingClassifier 0.8935




                   Accuracy
ExtraTrees         0.902400
Voting             0.901982
SGD                0.900331
HistGradient       0.899498
Logistic           0.898956
SVM                0.898818
Ridge              0.898131
Perceptron         0.896480
XGBoost            0.895781
RandomForest       0.893303
KNN                0.892082
Bagging            0.891810
GradBoost          0.883391
NeuralN            0.878711
PassiveAggressive  0.873460
AdaBoost           0.729001
NaiveBayes              NaN


In [None]:
# Later when a page loads that needs the predicted value(s):
# reload the fitted model that the pipeline cell saved to disk
model = load_pickle('saved_model_clf.sav')

# Show actual vs. predicted labels for the first 10 rows. These are the same rows the model
# was trained on, so this is a sanity check rather than an evaluation of accuracy.
pd.DataFrame({'Actual':df_reduced.SaleCondition, 'Predicted':model.predict(df_reduced.drop(columns=['SaleCondition']))}).head(10)

Unnamed: 0,Actual,Predicted
0,Abnorml,Abnorml
1,Abnorml,Abnorml
2,Abnorml,Abnorml
4,Normal,Normal
5,Normal,Normal
6,Normal,Normal
8,Normal,Normal
9,Other,Other
10,Normal,Normal
11,Abnorml,Abnorml


## **Hyperparameter Tuning**

### **Exhaustive Grid Search**

In [None]:
# Baseline for hyperparameter tuning: an untuned XGBRegressor scored with 3-fold cross-validation.
# Don't forget to mount Google Drive if you need it:
# from google.colab import drive
# drive.mount('/content/drive')

# Everything this cell uses is imported here so the cell does not depend on earlier cells' imports
from numpy import mean
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor

# Set constants
label = 'SalePrice'
random_state = 1

# Import the data; BUT!!! Make sure to shuffle it (randomly sample 100% of the rows) because it is
# sorted by the label SalePrice and that will create a problem for hyperparameter tuning:
# the search cells below use an unshuffled KFold, so each fold would cover a different price range
df = import_data('/content/drive/MyDrive/Colab Notebooks/data/housing.csv', messages=False)
df = df.sample(frac=1, random_state=random_state)

# Clean/prepare the data
df = bin_groups(df, messages=False)
df = missing_drop(df, label)
df = impute_KNN(df, label)

# Train a model with no parameters set on ALL features; it is only used for selecting features
X, y = Xandy(df, label)
model = XGBRegressor(random_state=random_state).fit(X, y)

# Use that model to select features
df = select_features(df, label, model)

# Rebuild X and y from the reduced DataFrame BEFORE retraining; otherwise the "retrained" model
# would be fit on the stale, full-feature X created above
X, y = Xandy(df, label)

# Retrain the model with the smaller feature set
model = XGBRegressor(random_state=random_state).fit(X, y)

# Create a cv object to calculate a cross-validated R2 score
cv = KFold(n_splits=3, random_state=random_state, shuffle=True)

print(f'Baseline R2 for XGBRegressor model:\t{mean(cross_val_score(model, X, y, scoring="r2", cv=cv, n_jobs=-1))}')



Baseline R2 for XGBRegressor model:	0.8505237500149239


In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, GridSearchCV

# First, coarse search: 3 boosters x 3 learning rates = 9 combinations
params = {
    "booster": ['gbtree', 'gblinear', 'dart'],
    "learning_rate": [0.1, 0.3, 0.5],  # Float in [0,1] controlling the step size of each boosting round. Default = 0.3
    "objective": ['reg:squarederror'], # Other possible values: https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
}

# Exhaustive search: every combination in params is cross-validated
model_xgb = GridSearchCV(
    estimator=XGBRegressor(random_state=random_state),
    param_grid=params,
    scoring='r2', # Options: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    cv=KFold(n_splits=3), # Unshuffled 3-fold split; any cross-validation object can be used here
    refit=True, # Refit the best combination on all of the data (available as best_estimator_)
    n_jobs=-1, # Number of jobs to run in parallel; -1 means use all available processors
    verbose=2, # How much progress information to display; higher values print more
    )

# Last expression of the cell, so the notebook also displays the fitted search object
model_xgb.fit(X, y)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


In [None]:
# Headline result of the search: the winning parameter set and its mean cross-validated R-squared
print(f'Best parameters: {model_xgb.best_params_}')
print(f'R-squared:\t {model_xgb.best_score_}')

# Every column that the search recorded (timings, per-fold scores, ranks), one row per candidate.
# As the last expression of the cell, the table is rendered by the notebook.
print('All results:')
pd.DataFrame(data=model_xgb.cv_results_)

Best parameters: {'booster': 'dart', 'learning_rate': 0.1, 'objective': 'reg:squarederror'}
R-squared:	 0.8593379017213255
All results:


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_booster,param_learning_rate,param_objective,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.170787,0.011267,0.007396,0.001251,gbtree,0.1,reg:squarederror,"{'booster': 'gbtree', 'learning_rate': 0.1, 'o...",0.815737,0.901438,0.860839,0.859338,0.035003,2
1,0.162098,0.004892,0.006285,7.5e-05,gbtree,0.3,reg:squarederror,"{'booster': 'gbtree', 'learning_rate': 0.3, 'o...",0.813186,0.882158,0.854374,0.849906,0.028334,4
2,0.158149,0.008141,0.007231,0.001161,gbtree,0.5,reg:squarederror,"{'booster': 'gbtree', 'learning_rate': 0.5, 'o...",0.788691,0.845923,0.839406,0.824673,0.025582,5
3,0.030184,0.001524,0.005341,0.000156,gblinear,0.1,reg:squarederror,"{'booster': 'gblinear', 'learning_rate': 0.1, ...",0.731187,0.79638,0.711443,0.746337,0.036292,9
4,0.029032,0.000299,0.005341,0.000163,gblinear,0.3,reg:squarederror,"{'booster': 'gblinear', 'learning_rate': 0.3, ...",0.75866,0.82686,0.726109,0.770543,0.041981,8
5,0.031486,0.002854,0.004969,0.000221,gblinear,0.5,reg:squarederror,"{'booster': 'gblinear', 'learning_rate': 0.5, ...",0.764514,0.834896,0.732604,0.777338,0.042734,7
6,2.057824,0.387065,0.01525,0.002793,dart,0.1,reg:squarederror,"{'booster': 'dart', 'learning_rate': 0.1, 'obj...",0.815737,0.901438,0.860839,0.859338,0.035003,1
7,1.619661,0.153585,0.013345,0.001421,dart,0.3,reg:squarederror,"{'booster': 'dart', 'learning_rate': 0.3, 'obj...",0.813186,0.882158,0.854374,0.849906,0.028334,3
8,1.409113,0.343132,0.012034,0.003003,dart,0.5,reg:squarederror,"{'booster': 'dart', 'learning_rate': 0.5, 'obj...",0.788691,0.845923,0.839406,0.824673,0.025582,6


In [None]:
# Headline result of the search: the winning parameter set and its mean cross-validated R-squared
print(f'Best parameters: {model_xgb.best_params_}')
print(f'R-squared:\t {model_xgb.best_score_}')

# Compact table with one row per candidate: its parameters plus the mean and standard deviation
# of its test score across the folds. As the last expression, it is rendered by the notebook.
print('All results:')
pd.DataFrame({
    column: model_xgb.cv_results_[key]
    for column, key in [
        ("Parameters", 'params'),
        ("Mean Fit Score", 'mean_test_score'),
        ("Std Fit Score", 'std_test_score'),
    ]
})

Best parameters: {'booster': 'dart', 'learning_rate': 0.1, 'objective': 'reg:squarederror'}
R-squared:	 0.8593379017213255
All results:


Unnamed: 0,Parameters,Mean Fit Score,Std Fit Score
0,"{'booster': 'gbtree', 'learning_rate': 0.1, 'o...",0.859338,0.035003
1,"{'booster': 'gbtree', 'learning_rate': 0.3, 'o...",0.849906,0.028334
2,"{'booster': 'gbtree', 'learning_rate': 0.5, 'o...",0.824673,0.025582
3,"{'booster': 'gblinear', 'learning_rate': 0.1, ...",0.746337,0.036292
4,"{'booster': 'gblinear', 'learning_rate': 0.3, ...",0.770543,0.041981
5,"{'booster': 'gblinear', 'learning_rate': 0.5, ...",0.777338,0.042734
6,"{'booster': 'dart', 'learning_rate': 0.1, 'obj...",0.859338,0.035003
7,"{'booster': 'dart', 'learning_rate': 0.3, 'obj...",0.849906,0.028334
8,"{'booster': 'dart', 'learning_rate': 0.5, 'obj...",0.824673,0.025582


In [None]:
# With refit=True the search object keeps the winning parameter set refit on all of the data
final_model = model_xgb.best_estimator_

# Persist it for deployment
dump_pickle(final_model, "best_model.sav")

# Try it out by predicting on the training features; the resulting array is displayed by the notebook
final_model.predict(df.drop('SalePrice', axis='columns'))

array([117881.72, 135363.11, 115138.47, ..., 235056.66, 105682.09,
       203976.1 ], dtype=float32)

In [None]:
# Second pass: booster and learning rate are pinned to single values while the tree-shape
# parameters are searched (4 depths x 3 child weights = 12 combinations)
params = {
    "booster": ['gbtree'], # Default is gbtree
    "learning_rate": [0.1],  # Float in [0,1] controlling the step size of each boosting round. Default = 0.3
    "objective": ['reg:squarederror'], # Other possible values: https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
    "max_depth": [3, 4, 5, 6], # Commonly tuned between 3-10; default = 6
    "min_child_weight": [1, 2, 3], # Default = 1
}

# Exhaustive search over the new grid
model_xgb = GridSearchCV(
    estimator=XGBRegressor(random_state=random_state),
    param_grid=params,
    scoring='r2', # Options: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    cv=KFold(n_splits=3), # Unshuffled 3-fold split; any cross-validation object can be used here
    refit=True, # Refit the best combination on all of the data (available as best_estimator_)
    n_jobs=-1, # Number of jobs to run in parallel; -1 means use all available processors
    verbose=2, # How much progress information to display; higher values print more
    )

# Features are every column except the label; the fitted search object is displayed by the notebook
model_xgb.fit(df.drop(columns=['SalePrice']), df.SalePrice)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [None]:
# Headline result of the search: the winning parameter set and its mean cross-validated R-squared
print(f'Best parameters: {model_xgb.best_params_}')
print(f'R-squared:\t {model_xgb.best_score_}')

# Compact table with one row per candidate: its parameters plus the mean and standard deviation
# of its test score across the folds. As the last expression, it is rendered by the notebook.
print('All results:')
pd.DataFrame({
    column: model_xgb.cv_results_[key]
    for column, key in [
        ("Parameters", 'params'),
        ("Mean Fit Score", 'mean_test_score'),
        ("Std Fit Score", 'std_test_score'),
    ]
})

Best parameters: {'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 2, 'objective': 'reg:squarederror'}
R-squared:	 0.8739662176020139
All results:


Unnamed: 0,Parameters,Mean Fit Score,Std Fit Score
0,"{'booster': 'gbtree', 'learning_rate': 0.1, 'm...",0.86096,0.03692
1,"{'booster': 'gbtree', 'learning_rate': 0.1, 'm...",0.873966,0.026031
2,"{'booster': 'gbtree', 'learning_rate': 0.1, 'm...",0.870878,0.02866
3,"{'booster': 'gbtree', 'learning_rate': 0.1, 'm...",0.861078,0.03462
4,"{'booster': 'gbtree', 'learning_rate': 0.1, 'm...",0.865094,0.033787
5,"{'booster': 'gbtree', 'learning_rate': 0.1, 'm...",0.866328,0.031954
6,"{'booster': 'gbtree', 'learning_rate': 0.1, 'm...",0.858903,0.034761
7,"{'booster': 'gbtree', 'learning_rate': 0.1, 'm...",0.859179,0.036492
8,"{'booster': 'gbtree', 'learning_rate': 0.1, 'm...",0.867124,0.035132
9,"{'booster': 'gbtree', 'learning_rate': 0.1, 'm...",0.859338,0.035003


In [None]:
# Third pass: tree shape is pinned to single values while the regularization and sampling
# parameters are searched (3 gammas x 3 subsamples x 2 colsamples x 4 alphas = 72 combinations)
params = {
    "booster": ['gbtree'], # Default is gbtree
    "learning_rate": [0.1],  # It accepts float [0,1] specifying learning rate for training process. Default = 0.3
    "objective": ['reg:squarederror'], # List of possible values: https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
    "max_depth": [3], # Commonly tuned between 3-10; default = 6
    "min_child_weight": [1], # Default = 1
    "gamma": [0, 0.1, 0.2], # Default = 0
    "subsample": [0.8, 0.9, 1], # Default = 1
    "colsample_bytree": [0.8, 1], # Default = 1
    "alpha": [0, .001, 1, 100], # Default = 0
}

# Create the hypertuning object
model_xgb = GridSearchCV(
    # Same seed as the baseline and the earlier searches (random_state is set in the baseline cell).
    # subsample and colsample_bytree below 1 make training depend on the seed, so keeping the seed
    # fixed means score differences between search rounds come from the parameters alone.
    XGBRegressor(random_state=random_state),
    params,
    n_jobs=-1, # Number of jobs to run in parallel; -1 means use all available processors
    scoring='r2', # Options: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    cv=KFold(n_splits=3), # Choose any type of cross_validation you want
    verbose=2, # How much progress information to display; higher values print more
    refit=True # Refit the best combination on all of the data (available as best_estimator_)
    )

model_xgb.fit(df.drop(columns=['SalePrice']), df.SalePrice)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


In [None]:
# Headline result of the search: the winning parameter set and its mean cross-validated R-squared
print(f'Best parameters: {model_xgb.best_params_}')
print(f'R-squared:\t {model_xgb.best_score_}')

# Compact table with one row per candidate: its parameters plus the mean and standard deviation
# of its test score across the folds. As the last expression, it is rendered by the notebook.
print('All results:')
pd.DataFrame({
    column: model_xgb.cv_results_[key]
    for column, key in [
        ("Parameters", 'params'),
        ("Mean Fit Score", 'mean_test_score'),
        ("Std Fit Score", 'std_test_score'),
    ]
})

Best parameters: {'alpha': 100, 'booster': 'gbtree', 'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'objective': 'reg:squarederror', 'subsample': 1}
R-squared:	 0.8713994540939543
All results:


Unnamed: 0,Parameters,Mean Fit Score,Std Fit Score
0,"{'alpha': 0, 'booster': 'gbtree', 'colsample_b...",0.854198,0.053478
1,"{'alpha': 0, 'booster': 'gbtree', 'colsample_b...",0.864717,0.041183
2,"{'alpha': 0, 'booster': 'gbtree', 'colsample_b...",0.869561,0.031670
3,"{'alpha': 0, 'booster': 'gbtree', 'colsample_b...",0.854198,0.053478
4,"{'alpha': 0, 'booster': 'gbtree', 'colsample_b...",0.864717,0.041183
...,...,...,...
67,"{'alpha': 100, 'booster': 'gbtree', 'colsample...",0.866338,0.037119
68,"{'alpha': 100, 'booster': 'gbtree', 'colsample...",0.862920,0.038053
69,"{'alpha': 100, 'booster': 'gbtree', 'colsample...",0.854097,0.047728
70,"{'alpha': 100, 'booster': 'gbtree', 'colsample...",0.866338,0.037119


In [None]:
# Full grid: every parameter varied at once.
# 3 x 3 x 1 x 3 x 3 x 3 x 3 x 2 x 4 = 5832 combinations, each fit on 3 folds = 17496 fits
params = {
    "booster": ['gbtree', 'gblinear', 'dart'], # Default is gbtree
    "learning_rate": [0.1, 0.3, 0.5],  # It accepts float [0,1] specifying learning rate for training process. Default = 0.3
    "objective": ['reg:squarederror'], # List of possible values: https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
    "max_depth": [3, 6, 9], # Commonly tuned between 3-10; default = 6
    "min_child_weight": [1, 2, 3], # Default = 1
    "gamma": [0, 0.1, 0.2], # Default = 0
    "subsample": [0.8, 0.9, 1], # Default = 1
    "colsample_bytree": [0.8, 1], # Default = 1
    "alpha": [0, .001, 1, 100], # Default = 0
}

# Create the hypertuning object
model_xgb = GridSearchCV(
    # Same seed as the baseline and the earlier searches (random_state is set in the baseline cell),
    # so results stay comparable across the notebook
    XGBRegressor(random_state=random_state),
    params,
    n_jobs=-1, # Number of jobs to run in parallel; -1 means use all available processors
    scoring='r2', # Options: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    cv=KFold(n_splits=3), # Choose any type of cross_validation you want
    verbose=2, # How much progress information to display; higher values print more
    refit=True # Refit the best combination on all of the data (available as best_estimator_)
    )

model_xgb.fit(df.drop(columns=['SalePrice']), df.SalePrice)

# This will take too long; once you've started running the cell, just click the stop button and then proceed with the next section

Fitting 3 folds for each of 5832 candidates, totalling 17496 fits


In [None]:
# Only useful if the full grid search above was allowed to finish all 17496 fits;
# it reports the R-squared of the best combination found

print(f'Best parameters: {model_xgb.best_params_}')
print(f'R-squared:\t {model_xgb.best_score_}')

# Compact table with one row per candidate: its parameters plus the mean and standard deviation
# of its test score across the folds. As the last expression, it is rendered by the notebook.
print('All results:')
pd.DataFrame({
    column: model_xgb.cv_results_[key]
    for column, key in [
        ("Parameters", 'params'),
        ("Mean Fit Score", 'mean_test_score'),
        ("Std Fit Score", 'std_test_score'),
    ]
})

Best parameters: {'alpha': 100, 'booster': 'gbtree', 'colsample_bytree': 1, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 2, 'objective': 'reg:squarederror', 'subsample': 0.9}
R-squared:	 0.8785719506584279
All results:


Unnamed: 0,Parameters,Mean Fit Score,Std Fit Score
0,"{'alpha': 0, 'booster': 'gbtree', 'colsample_b...",0.854198,0.053478
1,"{'alpha': 0, 'booster': 'gbtree', 'colsample_b...",0.864717,0.041183
2,"{'alpha': 0, 'booster': 'gbtree', 'colsample_b...",0.869561,0.031670
3,"{'alpha': 0, 'booster': 'gbtree', 'colsample_b...",0.872853,0.025769
4,"{'alpha': 0, 'booster': 'gbtree', 'colsample_b...",0.874996,0.025978
...,...,...,...
5827,"{'alpha': 100, 'booster': 'dart', 'colsample_b...",0.822969,0.034785
5828,"{'alpha': 100, 'booster': 'dart', 'colsample_b...",0.841092,0.019261
5829,"{'alpha': 100, 'booster': 'dart', 'colsample_b...",0.836421,0.023560
5830,"{'alpha': 100, 'booster': 'dart', 'colsample_b...",0.820181,0.009449


### **Randomized Parameter Optimization**

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Same 5832-combination grid as the exhaustive search, but only n_iter randomly drawn
# combinations are actually fit
params = {
    "booster": ['gbtree', 'gblinear', 'dart'], # Default is gbtree
    "learning_rate": [0.1, 0.3, 0.5],  # It accepts float [0,1] specifying learning rate for training process. Default = 0.3
    "objective": ['reg:squarederror'], # List of possible values: https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
    "max_depth": [3, 6, 9], # Commonly tuned between 3-10; default = 6
    "min_child_weight": [1, 2, 3], # Default = 1
    "gamma": [0, 0.1, 0.2], # Default = 0
    "subsample": [0.8, 0.9, 1], # Default = 1
    "colsample_bytree": [0.8, 1], # Default = 1
    "alpha": [0, .001, 1, 100], # Default = 0
}

# Create the hypertuning object
model_xgb = RandomizedSearchCV(
    XGBRegressor(random_state=random_state), # Same seed as the baseline model (set in the baseline cell)
    params,
    n_iter=10, # Number of random samples to fit; default is 10
    n_jobs=-1, # Number of jobs to run in parallel; -1 means use all available processors
    scoring='r2', # Options: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    cv=KFold(n_splits=3), # Choose any type of cross_validation you want
    verbose=2, # How much progress information to display; higher values print more
    refit=True, # Refit the best combination on all of the data (available as best_estimator_)
    random_state=random_state # Seed the candidate sampling; without it a different set of 10 is drawn on every run
    )

model_xgb.fit(df.drop(columns=['SalePrice']), df.SalePrice)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [None]:
# Headline result of the search: the winning parameter set and its mean cross-validated R-squared
print(f'Best parameters: {model_xgb.best_params_}')
print(f'R-squared:\t {model_xgb.best_score_}')

# Compact table with one row per candidate: its parameters plus the mean and standard deviation
# of its test score across the folds. As the last expression, it is rendered by the notebook.
print('All results:')
pd.DataFrame({
    column: model_xgb.cv_results_[key]
    for column, key in [
        ("Parameters", 'params'),
        ("Mean Fit Score", 'mean_test_score'),
        ("Std Fit Score", 'std_test_score'),
    ]
})

Best parameters: {'subsample': 0.8, 'objective': 'reg:squarederror', 'min_child_weight': 2, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0.2, 'colsample_bytree': 1, 'booster': 'dart', 'alpha': 100}
R-squared:	 0.8642761678542096
All results:


Unnamed: 0,Parameters,Mean Fit Score,Std Fit Score
0,"{'subsample': 0.9, 'objective': 'reg:squareder...",0.770557,0.041933
1,"{'subsample': 0.9, 'objective': 'reg:squareder...",0.828189,0.033104
2,"{'subsample': 1, 'objective': 'reg:squarederro...",0.845052,0.035185
3,"{'subsample': 1, 'objective': 'reg:squarederro...",0.771202,0.037445
4,"{'subsample': 0.9, 'objective': 'reg:squareder...",0.746324,0.036259
5,"{'subsample': 0.8, 'objective': 'reg:squareder...",0.771202,0.037445
6,"{'subsample': 1, 'objective': 'reg:squarederro...",0.842761,0.023925
7,"{'subsample': 1, 'objective': 'reg:squarederro...",0.77981,0.038208
8,"{'subsample': 0.9, 'objective': 'reg:squareder...",0.847794,0.032451
9,"{'subsample': 0.8, 'objective': 'reg:squareder...",0.864276,0.02664


### **Successive Halving**

In [None]:
from sklearn.experimental import enable_halving_search_cv # Must import this first
from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV

# Same 5832-combination grid as before; successive halving starts many candidates on a small
# budget and only lets the best of them continue with a larger one
params = {
    "booster": ['gbtree', 'gblinear', 'dart'], # Default is gbtree
    "learning_rate": [0.1, 0.3, 0.5],  # It accepts float [0,1] specifying learning rate for training process. Default = 0.3
    "objective": ['reg:squarederror'], # List of possible values: https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
    "max_depth": [3, 6, 9], # Commonly tuned between 3-10; default = 6
    "min_child_weight": [1, 2, 3], # Default = 1
    "gamma": [0, 0.1, 0.2], # Default = 0
    "subsample": [0.8, 0.9, 1], # Default = 1
    "colsample_bytree": [0.8, 1], # Default = 1
    "alpha": [0, .001, 1, 100], # Default = 0
}

# Create the hypertuning object
# HalvingGridSearchCV is the exhaustive alternative: it starts from every combination in params
# (and takes no n_candidates argument), so it is much slower on a grid of this size
model_xgb = HalvingRandomSearchCV(
    XGBRegressor(random_state=random_state), # Same seed as the baseline model (set in the baseline cell)
    params,
    factor=2, # The 'halving' parameter; only 1/factor of the candidates are kept for each next iteration
    n_candidates=32, # The number of hyperparameter value sets to randomly sample
    resource='n_estimators', # Default = n_samples, but use n_estimators for boosting algorithms
    n_jobs=-1, # Number of jobs to run in parallel; -1 means use all available processors
    scoring='r2', # Options: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    cv=KFold(n_splits=3), # Choose any type of cross_validation you want
    verbose=2, # How much progress information to display; higher values print more
    max_resources=800, # The maximum amount of the resource (either n_samples or n_estimators) any candidate may use
    min_resources=50, # The minimum amount of the resource (either n_samples or n_estimators), used in the first iteration
    refit=True, # Refit the best combination on all of the data (available as best_estimator_)
    random_state=random_state # Seed the candidate sampling so the same 32 candidates are drawn on every run
    )

model_xgb.fit(df.drop(columns=['SalePrice']), df.SalePrice)

n_iterations: 5
n_required_iterations: 6
n_possible_iterations: 5
min_resources_: 50
max_resources_: 800
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 32
n_resources: 50
Fitting 3 folds for each of 32 candidates, totalling 96 fits
----------
iter: 1
n_candidates: 16
n_resources: 100
Fitting 3 folds for each of 16 candidates, totalling 48 fits
----------
iter: 2
n_candidates: 8
n_resources: 200
Fitting 3 folds for each of 8 candidates, totalling 24 fits
----------
iter: 3
n_candidates: 4
n_resources: 400
Fitting 3 folds for each of 4 candidates, totalling 12 fits
----------
iter: 4
n_candidates: 2
n_resources: 800
Fitting 3 folds for each of 2 candidates, totalling 6 fits


In [None]:
# Headline result of the search: the winning parameter set and its mean cross-validated R-squared
print(f'Best parameters: {model_xgb.best_params_}')
print(f'R-squared:\t {model_xgb.best_score_}')

# Compact table with one row per candidate: its parameters plus the mean and standard deviation
# of its test score across the folds. As the last expression, it is rendered by the notebook.
print('All results:')
pd.DataFrame({
    column: model_xgb.cv_results_[key]
    for column, key in [
        ("Parameters", 'params'),
        ("Mean Fit Score", 'mean_test_score'),
        ("Std Fit Score", 'std_test_score'),
    ]
})

Best parameters: {'subsample': 0.9, 'objective': 'reg:squarederror', 'min_child_weight': 2, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0.2, 'colsample_bytree': 1, 'booster': 'dart', 'alpha': 0, 'n_estimators': 800}
R-squared:	 0.865573317164686
All results:


Unnamed: 0,Parameters,Mean Fit Score,Std Fit Score
0,"{'subsample': 1, 'objective': 'reg:squarederro...",0.756878,0.035711
1,"{'subsample': 0.8, 'objective': 'reg:squareder...",0.766969,0.042220
2,"{'subsample': 1, 'objective': 'reg:squarederro...",0.849509,0.028246
3,"{'subsample': 0.8, 'objective': 'reg:squareder...",0.766969,0.042221
4,"{'subsample': 1, 'objective': 'reg:squarederro...",0.848945,0.036547
...,...,...,...
57,"{'subsample': 0.8, 'objective': 'reg:squareder...",0.863114,0.022744
58,"{'subsample': 0.8, 'objective': 'reg:squareder...",0.863563,0.020235
59,"{'subsample': 0.9, 'objective': 'reg:squareder...",0.870789,0.024871
60,"{'subsample': 0.8, 'objective': 'reg:squareder...",0.857481,0.019797
