In [234]:
import pandas as pd
import numpy as np

from pandas.plotting import scatter_matrix
from seaborn import scatterplot, heatmap

from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

from sklearn.preprocessing import OneHotEncoder

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate

from sklearn.metrics import mean_absolute_error

from joblib import dump

import os
if 'google.colab' in str(get_ipython()):
  from google.colab import drive
  drive.mount('/content/drive')
  base_dir = "./drive/My Drive/Colab Notebooks/" # You may need to change this, depending on where your notebooks are on Google Drive
else:
  base_dir = "." 

In [235]:
df = pd.read_csv(os.path.join(base_dir, "../datasets/dataset_concrete.csv"))

#See our dataset size
df.describe()

Unnamed: 0,cement,slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [236]:
# Split off the test set: 20% of the dataset.
dev_df, test_df = train_test_split(df, train_size=0.8, random_state=2)
# Extract the features but leave as a DataFrame
features = ["cement", "slag", "fly_ash", "water", "superplasticizer", "coarse_aggregate", "fine_aggregate", "age"]
dev_X = dev_df[features]
test_X = test_df[features]

# Target values, converted to a 1D numpy array
dev_y = dev_df["strength"].values
test_y = test_df["strength"].values

In [237]:
# Create the object that shuffles and splits the dev data
# Why 0.75? Because 0.75 of 80% of the data is 60% of the original dataset.
ss = ShuffleSplit(n_splits=1, train_size=0.75, random_state=2)
# Create a preprocessor
preprocessor = ColumnTransformer([
        ("scaler", StandardScaler(), features)], 
        remainder="drop")


In [238]:
#Linear Model
linear_model = Pipeline([
    ("preprocessor", preprocessor),
    ("predictor", LinearRegression())])

In [239]:
# Create a pipeline that combines the preprocessor with 1NN
knn_model = Pipeline([
    ("preprocessor", preprocessor),
    ("predictor", KNeighborsRegressor(n_neighbors=1))])

knn_param = {"predictor__n_neighbors" : [1,2,3,4,5]}

# Error estimation for k=1
#np.mean(cross_val_score(knn_model, dev_X, dev_y, scoring="neg_mean_absolute_error", cv=10))

In [240]:
knn_gs = GridSearchCV(knn_model, knn_param, scoring="neg_mean_absolute_error", cv=10, refit=True)
knn_gs.fit(dev_X, dev_y)
knn_gs.best_params_, knn_gs.best_score_

({'predictor__n_neighbors': 3}, np.float64(-6.946695611715154))

In [241]:


linear_model.fit(dev_X, dev_y)

# Error estimation on the test set.
mean_absolute_error(test_y, linear_model.predict(test_X))

8.226419967037105

In [242]:
mean_absolute_error(test_y, knn_gs.predict(test_X))

6.449967637540453

In [243]:
knn_gs.fit(df[features], df["strength"].values)

0,1,2
,estimator,Pipeline(step...eighbors=1))])
,param_grid,"{'predictor__n_neighbors': [1, 2, ...]}"
,scoring,'neg_mean_absolute_error'
,n_jobs,
,refit,True
,cv,10
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('scaler', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_neighbors,1
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


<h2> Lab 4</h2>

In [244]:
dev_df

Unnamed: 0,cement,slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,strength
128,401.8,94.7,0.0,147.4,11.4,946.8,852.1,28,68.50
365,214.9,53.8,121.9,155.6,9.6,1014.3,780.6,14,38.60
480,446.0,24.0,79.0,162.0,11.6,967.0,712.0,7,39.30
814,310.0,0.0,0.0,192.0,0.0,970.0,850.0,360,38.11
169,425.0,106.3,0.0,153.5,16.5,852.1,887.1,91,65.20
...,...,...,...,...,...,...,...,...,...
360,218.2,54.6,123.8,140.8,11.9,1075.7,792.7,14,35.96
466,190.3,0.0,125.2,166.6,9.9,1079.0,798.9,100,33.56
299,290.4,0.0,96.2,168.1,9.4,961.2,865.0,3,22.50
493,387.0,20.0,94.0,157.0,11.6,938.0,845.0,7,41.67


In [245]:
#Copy data frame
copy_df = dev_df.copy()
copy_df["wcr"] = copy_df["water"] / copy_df["cement"]


In [246]:
class InsertWCR(BaseEstimator, TransformerMixin):

    def __init__(self, insert=True):
        self.insert = insert
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        if self.insert:
            X["wcr"] = X["water"] / (X["cement"])
            X["aggregate_total"] = X["coarse_aggregate"] + X["fine_aggregate"]
            
            # If the new feature is intended to replace the existing ones, 
            # you could drop the existing ones here
            # X.drop(["flarea", "bthrms", "bdrms"], axis=1)
    
            #X = X.replace( [ np.inf, -np.inf ], np.nan )
        return X
        

In [247]:
preprocessor = ColumnTransformer([
    ("features", Pipeline([("wcr", InsertWCR()),
    ("scaler", StandardScaler())]),
features)],
remainder="drop")

In [248]:
# Extract the features but leave as a DataFrame
dev_X = dev_df[features]
test_X = test_df[features]

# Target values, converted to a 1D numpy array
dev_y = dev_df["strength"].values
test_y = test_df["strength"].values

In [249]:
# Create a pipeline that combines the preprocessor with kNN
ols = Pipeline([
    ("preprocessor", preprocessor),
    ("predictor", LinearRegression())])

# Create a dictionary of hyperparameters for kNN
ols_param_grid = {"preprocessor__features__wcr__insert": [True, False]}

# Create the grid search object which will find the best hyperparameter values based on validation error
ols_gs = GridSearchCV(ols, ols_param_grid, scoring="neg_mean_absolute_error", cv=10, refit=True)

# Run grid search by calling fit. It will also re-train on train+validation using the best parameters.
ols_gs.fit(dev_X, dev_y)

# Let's see how well we did
ols_gs.best_params_, knn_gs.best_score_

({'preprocessor__features__wcr__insert': True}, np.float64(-8.366951456310677))