# **Automated Algorithm Selection with Model Persistence (Storing Trained Models)**

## Import, Dummy Code, Normalize, and Split

In [None]:
# STEP 1: Get the data, dummy code it, min-max scale it, then split into train/test sets

import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn import preprocessing

df = pd.read_csv('https://www.ishelp.info/data/insurance.csv')
# Better -> pull from live DB: SELECT TOP(100000) * FROM insurance ORDER BY date DESC

# Generate dummy codes for every non-numeric column.
# dtype=float is required: pandas >= 2.0 returns boolean dummy columns by default,
# and boolean columns are not matched by select_dtypes(np.number) below, so the
# encoded categorical features would silently be dropped from the feature set.
for col in df:
  if not pd.api.types.is_numeric_dtype(df[col]):
    df = df.join(pd.get_dummies(df[col], prefix=col, drop_first=True, dtype=float))

df = df.select_dtypes(np.number)  # Keep numeric columns only (drops the original text columns)
y = df.charges                    # Save the label
X = df.drop(columns=['charges'])   # Remove the label from the feature list

# Scale every feature into the [0, 1] range.
# NOTE(review): the scaler is fit on ALL rows before the split (test rows influence
# the scaling) and the fitted scaler object is discarded, so new raw data cannot be
# scaled the same way at prediction time. Consider fitting on X_train only and
# persisting the scaler alongside the model.
X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X), columns=X.columns)

# Split the data: 70% train / 30% test, seeded for reproducibility
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3, random_state=12345)

# Eyeball the data to make sure it looks right:
X_train

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
1046,0.543478,0.245359,0.0,0.0,0.0,0.0,0.0,0.0
358,0.108696,0.698144,0.0,1.0,0.0,0.0,1.0,0.0
1144,0.695652,0.439602,0.4,1.0,0.0,0.0,0.0,1.0
522,0.717391,0.483051,0.0,0.0,0.0,0.0,0.0,0.0
54,0.478261,0.342480,0.6,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...
546,0.217391,0.523944,0.0,1.0,0.0,0.0,0.0,0.0
382,0.804348,0.458434,0.0,1.0,0.0,0.0,1.0,0.0
129,0.434783,0.504170,0.4,1.0,0.0,0.0,0.0,1.0
1309,0.500000,0.436911,0.4,1.0,0.0,0.0,0.0,1.0


## Algorithm/Model Testing

In [None]:
# ALGORITHMS: See a complete list of regression algorithms in scikit-learn: https://scikit-learn.org/stable/supervised_learning.html

fit = {}  # Maps a model label -> the value returned by that model's score() on the test split

# 1. LINEAR MODELS: assume normal distribution, homoscedasticity, no multi-collinearity, independence, and no auto-correlation (some exceptions apply)
import sklearn.linear_model as lm

# 1.1. Ordinary Least Squares Multiple Linear Regression
model_ols = lm.LinearRegression().fit(X_train, y_train)
fit['OrdinaryLS R'] = model_ols.score(X_test, y_test)

# 1.2. Ridge Regression: more robust to multi-collinearity
# alpha is the regularization strength; tune it for better results
model_rr = lm.Ridge(alpha=0.5).fit(X_train, y_train)
fit['Ridge R'] = model_rr.score(X_test, y_test)

# 1.3. Lasso Regression: L1 penalty, useful when many features contribute little
# alpha is the regularization strength; tune it for better results
model_lr = lm.Lasso(alpha=0.1).fit(X_train, y_train)
fit['Lasso R'] = model_lr.score(X_test, y_test)

# 1.4. Lasso fit with Least Angle Regression: good when the number of features is greater than the number of samples
# alpha is the regularization strength; tune it for better results
model_llr = lm.LassoLars(alpha=0.1).fit(X_train, y_train)
fit['LARS Lasso R'] = model_llr.score(X_test, y_test)

# 1.5. Bayesian Regression: probability based; regularization parameters are tuned to the data automatically
model_br = lm.BayesianRidge().fit(X_train, y_train)
fit['Bayesian R'] = model_br.score(X_test, y_test)

# The models below need scikit-learn >= 0.23; uncomment them if your environment provides it.
# # 1.6. Generalized Linear Regression (Poisson): Good when you don't have a normal distribution, count-based data, and a Poisson distribution
# model_pr = lm.TweedieRegressor(power=1, link="log") # Power=1 means this is a Poisson
# model_pr.fit(X_train, y_train)
# fit['Poisson R'] = model_pr.score(X_test, y_test)

# # 1.7. Generalized Linear Regression (Gamma): Good when you don't have a normal distribution, continuous data, and a Gamma distribution
# model_gr = lm.TweedieRegressor(power=2, link="log") # Power=2 means this is a Gamma
# model_gr.fit(X_train, y_train)
# fit['Gamma R'] = model_gr.score(X_test, y_test)

# # 1.8. Generalized Linear Regression (Inverse Gamma): Good when you don't have a normal distribution, continuous data, and an inverse Gamma distribution
# model_igr = lm.TweedieRegressor(power=3) # Power=3 means this is an inverse Gamma
# model_igr.fit(X_train, y_train)
# fit['Inverse Gamma R'] = model_igr.score(X_test, y_test)



# SUPPORT VECTOR MACHINES
# NOTE(review): SVR defaults are sensitive to the scale of the target; expect weak
# scores here unless C/epsilon are tuned or y is scaled -- TODO confirm on this data.
from sklearn import svm

# 1.9. SVM: this is the default SVM, parameters can be modified to make this more accurate
model_svm = svm.SVR()
model_svm.fit(X_train, y_train)
fit['SupportVM R'] = model_svm.score(X_test, y_test)

# 1.10. Linear SVM: Faster than SVM but only considers a linear model
# Seeded like the other stochastic models in this notebook so the score is reproducible between runs.
model_lsvm = svm.LinearSVR(random_state=12345)
model_lsvm.fit(X_train, y_train)
fit['Linear SVM R'] = model_lsvm.score(X_test, y_test)

# 1.11. NuSVM: SVR variant whose nu parameter controls the number of support vectors
model_nusvm = svm.NuSVR()
model_nusvm.fit(X_train, y_train)
fit['NuSupportVM R'] = model_nusvm.score(X_test, y_test)




# STOCHASTIC GRADIENT DESCENT REGRESSION
# 1.12. SGDRegressor: linear model fitted by stochastic gradient descent.
# The training data is shuffled each epoch, so seed it (same seed as the other
# models in this notebook) to make the reported score reproducible.
model_sgdr = lm.SGDRegressor(random_state=12345)
model_sgdr.fit(X_train, y_train)
fit['SGradientD R'] = model_sgdr.score(X_test, y_test)




# KNN: NEAREST NEIGHBORS REGRESSION
from sklearn import neighbors

# `weights` must be passed by keyword: it is keyword-only in scikit-learn >= 1.0,
# where the positional form KNeighborsRegressor(5, 'uniform') raises a TypeError.

# 1.13. KNeighborsRegressor: 5 neighbors, each neighbor counts equally
model_knnr = neighbors.KNeighborsRegressor(n_neighbors=5, weights='uniform')
model_knnr.fit(X_train, y_train)
fit['KNNeighbors R'] = model_knnr.score(X_test, y_test)

# 1.14. KNeighborsRegressor: 8 neighbors, closer neighbors weigh more
model_knnrd = neighbors.KNeighborsRegressor(n_neighbors=8, weights='distance')
model_knnrd.fit(X_train, y_train)
fit['KNNeighborsD R'] = model_knnrd.score(X_test, y_test)





# GAUSSIAN PROCESS REGRESSION
from sklearn import gaussian_process
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel

# 1.15. GaussianProcessRegressor: kernel is the sum of a dot-product term and a white-noise term
model_gpr = gaussian_process.GaussianProcessRegressor(
    kernel=DotProduct() + WhiteKernel()
).fit(X_train, y_train)
fit['GaussianP R'] = model_gpr.score(X_test, y_test)





# DECISION TREE MODELS: no assumptions about the data
import sklearn.tree as tree
import sklearn.ensemble as se

# 1.16. Decision Tree Regression
model_dt = tree.DecisionTreeRegressor(random_state=12345).fit(X_train, y_train)
fit['Dec Tree R'] = model_dt.score(X_test, y_test)


# DECISION TREE-BASED ENSEMBLE MODELS: great for minimizing overfitting, these are based on averaging many unique sub-samples and combining algorithms
# 1.17. Decision Forest (random forest)
model_df = se.RandomForestRegressor(random_state=12345).fit(X_train, y_train)
fit['Dec Forest R'] = model_df.score(X_test, y_test)

# 1.18. ExtraTreesRegressor
model_etr = se.ExtraTreesRegressor(random_state=12345).fit(X_train, y_train)
fit['Extra Trees R'] = model_etr.score(X_test, y_test)

# 1.19. AdaBoostRegressor
model_abr = se.AdaBoostRegressor(n_estimators=100, random_state=12345).fit(X_train, y_train)
fit['AdaBoost DT R'] = model_abr.score(X_test, y_test)

# 1.20. GradientBoostingRegressor
model_gbr = se.GradientBoostingRegressor(random_state=12345).fit(X_train, y_train)
fit['Grad. Boost R'] = model_gbr.score(X_test, y_test)

# Requires scikit-learn >= 0.23
# # 1.21. HistGradientBoostingRegressor
# model_hgbr = se.HistGradientBoostingRegressor(random_state=12345)
# model_hgbr.fit(X_train, y_train)
# fit['HG Boost R'] = model_hgbr.score(X_test, y_test)

# 1.22. VotingRegressor: averages the predictions of the tree-based models defined above
model_vr = se.VotingRegressor(
    estimators=[
        ('DT', model_dt),
        ('DF', model_df),
        ('ETR', model_etr),
        ('ABR', model_abr),
        ('GBR', model_gbr),
    ]
).fit(X_train, y_train)
fit['Voting R'] = model_vr.score(X_test, y_test)

# 1.23. StackingRegressor: base estimators feed a gradient-boosting final estimator
from sklearn.linear_model import RidgeCV, LassoCV
estimators = [
    ('ridge', RidgeCV()),
    ('lasso', LassoCV(random_state=42)),
    ('svr', svm.SVR(C=1, gamma=1e-6)),
]
model_sr = se.StackingRegressor(
    estimators=estimators,
    final_estimator=se.GradientBoostingRegressor(random_state=12345),
).fit(X_train, y_train)
fit['Stacking R'] = model_sr.score(X_test, y_test)
                                       




# NEURAL-NETWORK MODELS: Based on deep learning methods
import sklearn.neural_network as nn

# 1.24. MLPRegressor
# max_iter caps the number of training iterations; raise or lower it to trade run time for accuracy
model_nn = nn.MLPRegressor(max_iter=1000, random_state=12345).fit(X_train, y_train)
fit['NeuralNet R'] = model_nn.score(X_test, y_test)


# Rank the model labels from greatest to least test-set R squared and print them.
# r2s is a list of labels (the keys of `fit`), best model first; later cells read r2s[0].
r2s = sorted(fit, key=fit.get, reverse=True)
for name in r2s:
  print(f'{name}:\t{fit[name]}')

Grad. Boost R:	0.8650458515184248
Voting R:	0.8519465154506137
AdaBoost DT R:	0.8388511111248289
Dec Forest R:	0.8385632868961774
Extra Trees R:	0.8279342724648359
KNNeighborsD R:	0.7855833812136057
KNNeighbors R:	0.768085491728174
Stacking R:	0.7627918843997207
Ridge R:	0.7538916925024396
Bayesian R:	0.7538302032093211
LARS Lasso R:	0.7537894054354863
Lasso R:	0.753693283731631
OrdinaryLS R:	0.753686352357909
SGradientD R:	0.7514811096049173
Dec Tree R:	0.721206134184744
NeuralNet R:	0.06454413627444944
GaussianP R:	0.0014095746981229729
NuSupportVM R:	-0.02645764932270156
SupportVM R:	-0.08468368331295606
Linear SVM R:	-0.8793004634454971




## Choose the Best Algorithm

In [None]:
# Select the model with the highest R squared.
# `fit` only holds scores, so the fitted estimator has to be looked up by its label.
# Keep this mapping in sync with the labels written into `fit`; if you enable one of
# the commented-out models, add it here too (otherwise a KeyError is raised if it wins).
fitted_models = {
  'OrdinaryLS R': model_ols,
  'Ridge R': model_rr,
  'Lasso R': model_lr,
  'LARS Lasso R': model_llr,
  'Bayesian R': model_br,
  'SupportVM R': model_svm,
  'Linear SVM R': model_lsvm,
  'NuSupportVM R': model_nusvm,
  'SGradientD R': model_sgdr,
  'KNNeighbors R': model_knnr,
  'KNNeighborsD R': model_knnrd,
  'GaussianP R': model_gpr,
  'Dec Tree R': model_dt,
  'Dec Forest R': model_df,
  'Extra Trees R': model_etr,
  'AdaBoost DT R': model_abr,
  'Grad. Boost R': model_gbr,
  'Voting R': model_vr,
  'Stacking R': model_sr,
  'NeuralNet R': model_nn,
}

best_name = r2s[0]  # r2s is sorted best-first, so index 0 is the winner
print(f'Best model: {best_name} (R2:{fit[best_name]})')
model = fitted_models[best_name]  # the fitted estimator itself, not its score
type(model)

Best model: Grad. Boost R (R2:0.8650458515184248)


numpy.float64

## Save Best Model

In [None]:
import joblib
import pickle

# Save the model with the highest fit metric

# OPTION 1: pickle -- the `with` block guarantees the file is flushed and closed
with open('stored_model.sav', 'wb') as model_file:
  pickle.dump(model, model_file)

# OPTION 2: joblib -- returns the list of file names it wrote
joblib.dump(model, "stored_model.pkl")

['stored_model.pkl']

### Pickle Example

In [None]:
# ...some time later

import pickle
import numpy as np

# OPTION 1: Using pickle
# load the model from 'stored_model.sav'
# SECURITY: unpickling can execute arbitrary code -- only load files you created yourself.
with open('stored_model.sav', 'rb') as model_file:
  loaded_model = pickle.load(model_file)
print(type(loaded_model))

# for a single prediction, enter a row of (already scaled) data and reshape it into one row;
# wrapping it in a DataFrame with the training column names keeps the features labelled
# the same way as the data the model was fitted on
case = [0.543478, 0.245359, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0]
case_row = pd.DataFrame(np.array(case).reshape(1, -1), columns=X_test.columns)
print(f'Single prediction {case}: {loaded_model.predict(case_row)[0]}\n')

# for a batch prediction, enter a Pandas DataFrame or a Numpy array of arrays
predictions = loaded_model.predict(X_test)
batch_results = pd.DataFrame({'Actual':y_test, 'Predicted':predictions, 'Diff':(predictions - y_test)})
print(f'MAE:\t{batch_results.Diff.abs().mean()}\n')
batch_results.head(5)

<class 'numpy.float64'>


AttributeError: ignored

### Joblib Example

In [None]:
# OPTION 2: Using joblib
# `sklearn.externals.joblib` no longer exists in current scikit-learn; import the
# standalone joblib package instead.
import joblib

# Bind the joblib-loaded model to the name the predictions below actually use
# (previously it was loaded into an unused variable and the pickle-loaded model was reused).
# SECURITY: joblib files are pickle-based -- only load files you created yourself.
loaded_model = joblib.load("stored_model.pkl")

# for a single prediction, enter a row of (already scaled) data and reshape it into one row;
# wrapping it in a DataFrame with the training column names keeps the features labelled
# the same way as the data the model was fitted on
case = [0.543478, 0.245359, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0]
case_row = pd.DataFrame(np.array(case).reshape(1, -1), columns=X_test.columns)
print(f'Single prediction {case}: {loaded_model.predict(case_row)[0]}\n')

# for a batch prediction, enter a Pandas DataFrame or a Numpy array of arrays
predictions = loaded_model.predict(X_test)
batch_results = pd.DataFrame({'Actual':y_test, 'Predicted':predictions, 'Diff':(predictions - y_test)})
print(f'MAE:\t{batch_results.Diff.abs().mean()}\n')
batch_results.tail(5)