<h1>03 Error Estimation for Linear Regression and 3NN</h1>

In [44]:
# Enable the autoreload extension in mode 2 so that imported modules are
# reloaded before each cell executes (edits to local .py files are picked up
# without restarting the kernel).
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [45]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_absolute_error
from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

<h1>Introductory Remarks</h1>
<ul>
    <li>We're going to predict the strength of concrete!</li>
    <li>We have a labeled dataset, originally described in<br />
        I-Cheng Yeh, "Modeling of strength of high performance concrete using artificial neural networks," Cement and Concrete Research, Vol. 28, No. 12, pp. 1797-1808 (1998).
    </li>
    <li>Nowadays, it is available from the <a href="http://archive.ics.uci.edu/ml/index.php">UC Irvine Machine Learning Repository</a>. I have taken a copy and made it available to you as a CSV file called <code>dataset_concrete.csv</code>.
    </li>
    <li>Use error estimation to compare linear regression and kNN regression (the title says 3NN, but below <i>k</i> is chosen from 1 to 10 by grid search).
    </li>
 </ul>

In [46]:
# Read the concrete dataset, shuffle all rows with a fixed seed, and renumber
# the index from 0 so the shuffled order is reproducible across runs.
df = (
    pd.read_csv("../datasets/dataset_concrete.csv")
    .sample(frac=1, random_state=2)
    .reset_index(drop=True)
)

# Input columns used as predictors; "strength" is the regression target.
features = [
    "cement",
    "slag",
    "fly_ash",
    "water",
    "superplasticizer",
    "coarse_aggregate",
    "fine_aggregate",
    "age",
]

# Feature matrix and target array for the *whole* dataset.
X = df.loc[:, features]
y = df.loc[:, "strength"].values

In [47]:
# Hold out a test set: 80% of the rows go to the development set, the rest to
# the test set. The seed is fixed so the split is reproducible.
dev_df, test_df = train_test_split(
    df,
    train_size=0.8,
    random_state=2,
)

In [48]:
# Feature matrices for the development and test splits.
dev_X, test_X = dev_df.loc[:, features], test_df.loc[:, features]

In [49]:
# Target arrays ("strength") for the development and test splits.
dev_y, test_y = dev_df.loc[:, "strength"].values, test_df.loc[:, "strength"].values

In [50]:
# Candidate neighbourhood sizes k = 1..10. The "predictor__" prefix routes the
# parameter to the pipeline step named "predictor".
param_grid = {"predictor__n_neighbors": list(range(1, 11))}

In [51]:
# Standardise every column listed in `features`; any column not listed would be
# passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[("scaler", StandardScaler(), features)],
    remainder="passthrough",
)

In [52]:
# kNN regression pipeline: scale the features, then predict with k nearest
# neighbours (k is set later by the grid search).
knn_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("predictor", KNeighborsRegressor()),
    ]
)

In [53]:
# Choose k by 10-fold cross-validation on the development set, scoring each
# candidate by (negated) mean absolute error. refit=True retrains the best
# pipeline on the full development set once the search is done.
gs = GridSearchCV(
    estimator=knn_model,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=10,
    refit=True,
)
gs.fit(dev_X, dev_y)

In [54]:
# Winning hyper-parameters and their mean cross-validated score. The score is
# negative because the search was scored with "neg_mean_absolute_error".
(gs.best_params_, gs.best_score_)

({'predictor__n_neighbors': 1}, -6.538776814575374)

In [55]:
# MAE of the tuned kNN model on the development set.
mean_absolute_error(y_true=dev_y, y_pred=gs.predict(dev_X))

0.06145631067961163

In [56]:
# MAE of the tuned kNN model on the held-out test set.
mean_absolute_error(y_true=test_y, y_pred=gs.predict(test_X))

6.129368932038836

In [57]:
# Linear-regression pipeline: same scaling step, then ordinary least squares.
# NOTE(review): `preprocessor` is the same object that knn_model holds, so
# fitting this pipeline also fits that shared instance — confirm this sharing
# is intended.
linear_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("predictor", LinearRegression()),
    ]
)
linear_model.fit(dev_X, dev_y)

In [58]:
# MAE of linear regression on the development set it was fitted on.
mean_absolute_error(y_true=dev_y, y_pred=linear_model.predict(dev_X))

8.097417726987596

In [59]:
# MAE of linear regression on the held-out test set.
mean_absolute_error(y_true=test_y, y_pred=linear_model.predict(test_X))

8.407187476302653

In [60]:
# Re-run the grid search on the entire dataset and persist the fitted search
# object to disk with joblib.
# NOTE(review): X and y include the rows of the test split, so after this cell
# `gs` is no longer the model whose test error was measured above; re-running
# the evaluation cells after this one would give leaked (optimistic) figures.
gs.fit(X, y)
dump(
    value=gs,
    filename='../../models/concrete_model.pkl',
)

['../../models/concrete_model.pkl']

In [64]:
# Show the first development-set row to see what a typical input looks like.
dev_X.iloc[0, :]

cement              166.0
slag                259.7
fly_ash               0.0
water               183.2
superplasticizer     12.7
coarse_aggregate    858.8
fine_aggregate      826.8
age                  28.0
Name: 128, dtype: float64

In [91]:
# A single hand-made example to feed to the model: a one-row frame with one
# float value for each of the eight feature columns.
ejx = pd.DataFrame(
    {
        column: [value]
        for column, value in (
            ("cement", 0.0),
            ("slag", 0.0),
            ("fly_ash", 5.0),
            ("water", 1.0),
            ("superplasticizer", 0.0),
            ("coarse_aggregate", 0.0),
            ("fine_aggregate", 2.2),
            ("age", 12.0),
        )
    }
)

In [92]:
# Predicted strength for the hand-made example row.
# NOTE(review): which fit of `gs` is used depends on the cells run before this
# one (execution counts are not sequential) — confirm on a fresh Run All.
gs.predict(X=ejx)

array([76.24])