In [1]:
from sklearn.ensemble import GradientBoostingRegressor
from joblib import load, dump
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from preprocess import prep_data
import numpy as np
import pandas as pd

In [2]:
# load data and split into training and test sets
train_df= pd.read_csv("fish_participant.csv").set_index("Species")
test_df = pd.read_csv("fish_holdout_demo.csv").set_index("Species")

In [3]:
train_df.head()

Unnamed: 0_level_0,Weight,Length1,Length2,Length3,Height,Width
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bream,430.0,26.5,29.0,34.0,12.444,5.134
Perch,110.0,20.0,22.0,23.5,5.5225,3.995
Roach,160.0,20.5,22.5,25.3,7.0334,3.8203
Parkki,60.0,14.3,15.5,17.4,6.5772,2.3142
Bream,700.0,30.4,33.0,38.3,14.8604,5.2854


In [4]:
# identify target for train and test dataset

X_train, y_train = prep_data(train_df)
X_test, y_test = prep_data(test_df)

In [5]:
X_test

array([[19.    , 20.7   , 23.2   ,  8.5376,  3.2944],
       [18.4   , 20.    , 22.4   ,  8.8928,  3.2928],
       [11.4   , 12.    , 13.2   ,  2.2044,  1.1484],
       [35.5   , 38.    , 40.5   ,  7.29  ,  4.5765],
       [40.    , 42.5   , 45.5   ,  7.28  ,  4.3225],
       [22.    , 24.    , 27.2   ,  7.5344,  3.8352],
       [43.2   , 46.    , 48.7   ,  7.792 ,  4.87  ],
       [10.8   , 11.3   , 12.6   ,  1.9782,  1.2852],
       [29.5   , 32.    , 37.3   , 13.9129,  5.0728],
       [27.8   , 30.    , 31.6   ,  7.6156,  4.7716],
       [29.1   , 31.5   , 36.4   , 13.7592,  4.368 ],
       [22.6   , 24.6   , 26.2   ,  6.7334,  4.1658],
       [25.4   , 27.5   , 28.9   ,  7.1672,  4.335 ],
       [15.7   , 17.4   , 18.5   ,  4.588 ,  2.9415],
       [26.3   , 29.    , 33.5   , 12.73  ,  4.4555],
       [28.5   , 30.7   , 36.2   , 14.2266,  4.9594],
       [ 7.5   ,  8.4   ,  8.8   ,  2.112 ,  1.408 ],
       [30.4   , 33.    , 38.5   , 14.938 ,  5.1975],
       [19.    , 21.    , 22

In [6]:
# now we will construct and fit our model using gradient boost regression

greg = GradientBoostingRegressor(
    max_depth=5,
    n_estimators=400,
    learning_rate=0.1)

greg.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=5,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=400,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [7]:
# get the optimal number of trees using the staged_predict()
errors = [mean_squared_error(y_test, y_pred) for y_pred in greg.staged_predict(X_test)]
best_n_estimators = np.argmin(errors)

In [9]:
best_greg = GradientBoostingRegressor(
    max_depth=5,
    n_estimators=best_n_estimators,
    learning_rate=0.1
)
best_greg.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=5,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=399,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [10]:
y_pred = best_greg.predict(X_test)
mean_absolute_error(y_test, y_pred)

0.0023359830832468476

In [11]:
dump(greg, "model_joblib")

['model_joblib']

In [12]:
mj = load("model_joblib")

In [13]:
### Test predict.py ###

from preprocess import prep_data
df = pd.read_csv("fish_holdout_demo.csv")
X, y = prep_data(df)



In [14]:
X

array([[19.    , 20.7   , 23.2   ,  8.5376,  3.2944],
       [18.4   , 20.    , 22.4   ,  8.8928,  3.2928],
       [11.4   , 12.    , 13.2   ,  2.2044,  1.1484],
       [35.5   , 38.    , 40.5   ,  7.29  ,  4.5765],
       [40.    , 42.5   , 45.5   ,  7.28  ,  4.3225],
       [22.    , 24.    , 27.2   ,  7.5344,  3.8352],
       [43.2   , 46.    , 48.7   ,  7.792 ,  4.87  ],
       [10.8   , 11.3   , 12.6   ,  1.9782,  1.2852],
       [29.5   , 32.    , 37.3   , 13.9129,  5.0728],
       [27.8   , 30.    , 31.6   ,  7.6156,  4.7716],
       [29.1   , 31.5   , 36.4   , 13.7592,  4.368 ],
       [22.6   , 24.6   , 26.2   ,  6.7334,  4.1658],
       [25.4   , 27.5   , 28.9   ,  7.1672,  4.335 ],
       [15.7   , 17.4   , 18.5   ,  4.588 ,  2.9415],
       [26.3   , 29.    , 33.5   , 12.73  ,  4.4555],
       [28.5   , 30.7   , 36.2   , 14.2266,  4.9594],
       [ 7.5   ,  8.4   ,  8.8   ,  2.112 ,  1.408 ],
       [30.4   , 33.    , 38.5   , 14.938 ,  5.1975],
       [19.    , 21.    , 22

In [15]:
reg = load("model_joblib")

predictions = reg.predict(X)

In [16]:
print(predictions)

[ 139.99903923  149.99880899    9.80078304  429.99769052  456.00014869
  169.00123474  566.99763554    8.70169835  340.00475082  320.00445209
  499.99666566  187.99972764  260.00246356   69.99908793  363.00400411
  499.99429242    5.90107614  699.99862431  124.99656347   86.99871499
  272.00159798  999.99596139  170.00057849 1000.00289152  719.99558958
 1099.99855998   99.99682992  160.00289476]
