In [None]:
import numpy as np
import pandas as pd
import pickle
from sklearn import datasets
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, StandardScaler
import matplotlib.pyplot as plt

### Train, save, confirm predictions and some timing
In this notebook we will:
* Train the model (**make sure the model is trained with Python 3.6**)
* Save the model (in pickle format)
* Confirm the prediction distributions
* Generate some rough batch timings for invocation

In [None]:
# load the California housing data set through scikit-learn's datasets module;
# the returned object `d` is used below via d.data, d.target, d.feature_names and d.DESCR
d = datasets.fetch_california_housing()

In [None]:
# show the description text that ships with the data set object
print(d.DESCR)

In [None]:
def eval_metrics(actual, pred):
    """Score predictions against the observed values.

    Parameters
    ----------
    actual : observed target values.
    pred : predicted target values, aligned with ``actual``.

    Returns
    -------
    tuple
        ``(rmse, mae, r2)``: root of the mean squared error, mean absolute
        error and the r^2 score, in that order.
    """
    squared_error = mean_squared_error(actual, pred)
    return (
        np.sqrt(squared_error),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )


def test_train_split(df, class_name):
    """Split ``df`` into train/test feature frames and target frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns together with the target column.
    class_name : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(train_x, test_x, train_y, test_y)``. The ``*_x`` frames contain every
        column except ``class_name``; the ``*_y`` frames are single-column
        frames holding only ``class_name``.

    Notes
    -----
    The test partition is the smaller share (``test_size=0.2``). Code that
    draws more rows from it than it contains has to sample with replacement.
    """
    # train_test_split returns its partitions in (train, test) order; unpacking
    # them the other way round would fit the model on the 20% share and
    # evaluate it on the 80% share.
    train_s, test_s = train_test_split(df, test_size=0.2, random_state=42)

    # separate the predictors from the target column
    train_x = train_s.drop([class_name], axis=1)
    test_x = test_s.drop([class_name], axis=1)
    train_y = train_s[[class_name]]
    test_y = test_s[[class_name]]

    return train_x, test_x, train_y, test_y


def train(train_x, train_y, n_estimators=100, random_state=None):
    """Fit a random-forest regressor on the training data.

    Parameters
    ----------
    train_x : feature matrix / frame used for fitting.
    train_y : target values; a single-column frame or 2-D column vector is
        flattened to 1-D before fitting.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int or None, default None
        Seed forwarded to the regressor; pass an int for repeatable fits.

    Returns
    -------
    RandomForestRegressor
        The fitted model.
    """
    # clf = LinearRegression() # r^2 ~ 0.6
    # A (n, 1) target makes scikit-learn emit a column-vector conversion
    # warning; hand it the 1-D shape it expects instead. Genuine multi-output
    # targets (more than one column) are passed through untouched.
    target = np.asarray(train_y)
    if target.ndim == 2 and target.shape[1] == 1:
        target = target.ravel()

    clf = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state) # r^2 ~ 0.73 (author-reported)
    clf.fit(train_x, target)
    return clf


def train_pred_and_eval(train_x, test_x, train_y, test_y):
    """Fit a model on the training data and score it on the test data.

    Parameters
    ----------
    train_x, train_y : features and targets used to fit the model.
    test_x, test_y : features and targets used to evaluate it.

    Returns
    -------
    tuple
        ``(clf, mse, rmse, mae, r2)``: the fitted model followed by the mean
        squared error, its root, the mean absolute error and the r^2 score
        of the test-set predictions.
    """
    model = train(train_x, train_y)
    predictions = model.predict(test_x)

    # rmse / mae / r2 come from the shared helper; the plain mse is reported too
    rmse, mae, r2 = eval_metrics(test_y, predictions)
    mse = mean_squared_error(test_y, predictions)

    return model, mse, rmse, mae, r2

In [None]:
# combine the data into a single data frame
df = pd.DataFrame(d.data, columns=d.feature_names)
# build a NEW list of column names: appending to d.feature_names itself would
# mutate the data set object, and re-running this cell would then fail because
# the feature matrix would be given one column name too many
new_feature_names = list(d.feature_names) + ['MedHouseVal']
df['MedHouseVal'] = d.target

# make a copy of the original (unscaled) values
df_orig = df.copy()

# normalize the data: every column, target included, is scaled with MinMaxScaler
df = pd.DataFrame(MinMaxScaler().fit_transform(X=df), columns=new_feature_names)

# shuffle (seeded so that a fresh run of the notebook sees the same row order)
df = df.sample(frac=1, random_state=42)

In [None]:
import warnings
# blanket filter: hides every warning category from here on, for all later cells too
warnings.filterwarnings('ignore')

# split the normalized frame, then fit the model and score its predictions
train_x, test_x, train_y, test_y = test_train_split(df, 'MedHouseVal')
(clf, mse, rmse, mae, r2) = train_pred_and_eval(train_x, test_x, train_y, test_y)
# NOTE: df was MinMax-scaled above (target included), so MSE/RMSE/MAE are in scaled units
print("data (original), MSE: %.2f, RMSE: %.2f, MAE: %.2f, r^2: %.2f" % (mse, rmse, mae, r2))

# r^2 is the share of the observed variation in median house value for a given area
# that is explained by the predictors (the author observed ~75% here)

In [None]:
# save the model (pickle format)
model_file = 'example_model.pkl'
# context manager: the file is flushed and closed even if pickling raises
with open(model_file, 'wb') as model_fh:
    pickle.dump(clf, model_fh)

In [None]:
# stats about the target variable distribution, taken from the unscaled copy df_orig
# (presumably the factor of 100_000 converts the target to dollars; see the data set description)
(df_orig['MedHouseVal']*100_000).describe()

In [None]:
# load the model and test
# NOTE: pickle.load can execute arbitrary code; only load model files you produced yourself
with open(model_file, 'rb') as model_fh:
    m = pickle.load(model_fh)

# get a scaler for our median house value (fitted on the unscaled target column only)
med_house_val_scaler = MinMaxScaler().fit(df_orig['MedHouseVal'].values.reshape(-1, 1))
print(med_house_val_scaler)

# make a prediction for one random test row; the value is on the normalized (0-1) scale
m.predict(test_x.sample())
# to express it in dollars instead, undo the scaling and multiply by 100,000:
#med_house_val_scaler.inverse_transform([m.predict(test_x.sample())])*100_000

In [None]:
# histogram of the observed (unscaled) target values
plt.hist(df_orig['MedHouseVal'])

In [None]:
# histogram of the model's predictions for test_x, mapped back through the target scaler
# so they can be compared with the histogram of the observed values
plt.hist(med_house_val_scaler.inverse_transform(m.predict(test_x).reshape(-1,1)))

In [None]:
import timeit

# statement that is timed on every invocation; `s_features` is rebound before each run
s = """
m.predict(s_features)
"""

# Rough batch timings: for each number of records to score (`iters`) and each
# batch size, score randomly drawn test rows batch by batch and add up the time
# spent inside predict(). `times` ends up as one [batch sizes, elapsed] pair per
# entry of the outer list, in order.
times = []
for iters in [1,10,1000,10000]:
    x = []
    y = []
    for batchsize in [10,100,1000]:
        time_elapsed = 0
        count = 0
        # strict '<' scores exactly ceil(iters / batchsize) batches; '<=' ran one
        # batch too many whenever batchsize divides iters
        while count < iters:
            count += batchsize
            # sample with replacement only when a batch is larger than the test set
            s_features = test_x.sample(batchsize, replace=batchsize > len(test_x)).values.tolist()
            # time in the notebook namespace so the current m / s_features are visible
            time_elapsed += timeit.timeit(s, globals=globals(), number=1)
        # one tab fewer for the wider batch sizes keeps the printed columns aligned
        pad = '\t' if batchsize >= 1000 else '\t\t'
        print('iters=',iters,'\tbatchsize=',batchsize,pad+'time_elapsed=',time_elapsed)
        x.append(batchsize)
        y.append(time_elapsed)
    times.append([x,y])

In [None]:
# one curve per entry of `times`, drawn in the same order as the legend labels
plt.ylim(0,.8)
for (batch_sizes, elapsed), colour in zip(times, 'rbgy'):
    plt.plot(batch_sizes, elapsed, colour)
plt.xlabel('batch size')
plt.ylabel('time elapsed (seconds)')
plt.title('model invocation times')
plt.legend(('1 iteration', '10 iterations', '1000 iterations', '10000 iterations'), loc='upper right')
plt.show()

In [None]:
# Second timing sweep with larger batch sizes and one more record count.
# Relies on `timeit` and the timed statement `s` defined in the previous timing cell.
times2 = []
for iters in [1,10,1000,10000,50000]:
    x = []
    y = []
    for batchsize in [100,500,1000,2500,5000,7500,10000]:
        time_elapsed = 0
        count = 0
        # strict '<' scores exactly ceil(iters / batchsize) batches; '<=' ran one
        # batch too many whenever batchsize divides iters
        while count < iters:
            count += batchsize
            # the largest batches can exceed the number of test rows; sample with
            # replacement in that case instead of raising
            s_features = test_x.sample(batchsize, replace=batchsize > len(test_x)).values.tolist()
            # time in the notebook namespace so the current m / s_features are visible
            time_elapsed += timeit.timeit(s, globals=globals(), number=1)
        # one tab fewer for the wider batch sizes keeps the printed columns aligned
        pad = '\t' if batchsize >= 1000 else '\t\t'
        print('iters=',iters,'\tbatchsize=',batchsize,pad+'time_elapsed=',time_elapsed)
        x.append(batchsize)
        y.append(time_elapsed)
    times2.append([x,y])

In [None]:
plt.figure(figsize=(8,6))
plt.ylim(0,1.5)
# one curve per entry of `times2`, in legend order; every series gets its own
# colour (blue used to be assigned to both the 10 and the 50000 series)
for (batch_sizes, elapsed), colour in zip(times2, 'rbgym'):
    plt.plot(batch_sizes, elapsed, colour)
plt.xlabel('batch size')
plt.ylabel('time elapsed (seconds)')
plt.title('model invocation times')
plt.legend(('1 iteration', '10 iterations', '1000 iterations', '10000 iterations', '50000 iterations'), loc='upper right')
# dashed reference line at one second
plt.axhline(y=1, color='r', linestyle='--')
plt.show()

In [None]:
#f = open('samples.10', 'w')
#f.write(str(test_x.sample(10).values.reshape(1,-1).tolist()[0]))
#f.close()