# Notebook_12: Multi Out Regression

The idea of this notebook is to experiment with multi-output regression to see if I can predict both the location and the depth of the crack and with what accuracy

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, RidgeCV, Ridge
from sklearn.multioutput import MultiOutputRegressor


from src.models.model_metrics import score_model

In [2]:
import altair as alt
from altair_saver import save
import altair_data_server
alt.data_transformers.enable('data_server')

DataTransformerRegistry.enable('data_server')

In [3]:
project_root = Path().resolve().parent
data_path = project_root / 'Data' / 'Final' / 'al_data_final.csv'

In [4]:
from src.config import FIGURES

In [5]:
data = pd.read_csv(data_path)

In [6]:
data

Unnamed: 0,x,temp,nf_hz,amp_mm,tc_act
0,5,22,40.970000,12.010000,0.00
1,5,22,38.460000,12.290000,0.50
2,5,22,38.360000,10.470000,0.51
3,5,22,38.380000,9.880000,0.52
4,5,22,37.480877,12.786667,0.75
...,...,...,...,...,...
175,25,200,32.472500,17.736250,1.50
176,25,200,30.274583,18.053958,1.75
177,25,200,28.076667,18.371667,2.00
178,25,200,25.878750,18.689375,2.25


## Data Prep

For the sake of a quick experiment, let's stick to the polynomialising that we've done throughout and seemed to show the best results predicting the 1 target

In [7]:
# Split into target and features
X = data.drop(['tc_act', 'x'], axis = 1)
y = data[['tc_act', 'x']]

In [8]:
X

Unnamed: 0,temp,nf_hz,amp_mm
0,22,40.970000,12.010000
1,22,38.460000,12.290000
2,22,38.360000,10.470000
3,22,38.380000,9.880000
4,22,37.480877,12.786667
...,...,...,...
175,200,32.472500,17.736250
176,200,30.274583,18.053958
177,200,28.076667,18.371667
178,200,25.878750,18.689375


In [9]:
y

Unnamed: 0,tc_act,x
0,0.00,5
1,0.50,5
2,0.51,5
3,0.52,5
4,0.75,5
...,...,...
175,1.50,25
176,1.75,25
177,2.00,25
178,2.25,25


In [10]:
# Train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, stratify = data['x'])

In [11]:
X_train

Unnamed: 0,temp,nf_hz,amp_mm
171,200,36.810000,13.240000
134,150,37.530000,11.560000
158,200,35.680000,15.210000
150,200,31.920833,18.212500
127,150,32.667059,17.291765
...,...,...,...
143,150,25.376667,17.897800
53,50,36.338904,13.558356
175,200,32.472500,17.736250
164,200,29.385000,18.385000


In [12]:
X_test

Unnamed: 0,temp,nf_hz,amp_mm
48,50,40.45,12.88
94,100,26.479474,17.924211
162,200,32.395167,17.302833
68,50,33.165333,15.248909
156,200,37.93,15.88
52,50,37.109452,12.339178
154,200,22.319231,19.562885
85,100,37.71,13.9
32,22,33.856087,13.46413
78,100,33.684063,16.3405


In [13]:
y_train

Unnamed: 0,tc_act,x
171,0.52,25
134,0.51,25
158,0.51,15
150,1.25,5
127,1.50,15
...,...,...
143,2.50,25
53,1.00,15
175,1.50,25
164,1.75,15


In [14]:
y_test

Unnamed: 0,tc_act,x
48,0.0,15
94,2.25,15
162,1.25,15
68,1.75,25
156,0.0,15
52,0.75,15
154,2.25,5
85,0.5,15
32,1.75,25
78,1.25,5


In [15]:
prep_pipeline = Pipeline([
    ("scaler", StandardScaler())
])

In [16]:
X_train_prepared = prep_pipeline.fit_transform(X_train)
X_test_prepared = prep_pipeline.transform(X_test)

## Modelling

In [17]:
multi_linreg = MultiOutputRegressor(LinearRegression()).fit(X_train_prepared, y_train)
multi_ridge = MultiOutputRegressor(RidgeCV()).fit(X_train_prepared, y_train)

In [18]:
multi_linreg.score(X_train_prepared, y_train)

0.48691750487288554

In [19]:
multi_ridge.score(X_train_prepared, y_train)

0.48586839169647833

In [20]:
y_pred = multi_ridge.predict(X_test_prepared)

In [21]:
y_test

Unnamed: 0,tc_act,x
48,0.0,15
94,2.25,15
162,1.25,15
68,1.75,25
156,0.0,15
52,0.75,15
154,2.25,5
85,0.5,15
32,1.75,25
78,1.25,5


In [22]:
y_pred

array([[ 0.37317638, 15.6525319 ],
       [ 2.27715195, 12.61235336],
       [ 1.18195356, 15.07406934],
       [ 1.43283274, 13.76565104],
       [ 0.40420573, 16.22185141],
       [ 0.74098674, 16.02086828],
       [ 2.57384423, 13.23488738],
       [ 0.61650029, 15.80343278],
       [ 1.31127849, 14.59068846],
       [ 1.28425361, 13.90294003],
       [ 0.79428405, 15.56436627],
       [ 2.87894813, 12.90177085],
       [ 2.19936224, 12.80596093],
       [ 2.87915463, 11.53922062],
       [ 0.75614014, 16.20082633],
       [ 0.90808354, 15.12991527],
       [ 1.91230768, 13.5069456 ],
       [ 0.97345265, 14.79970757],
       [ 0.58849262, 16.07737207],
       [ 2.36757567, 13.67325666],
       [ 0.40420573, 16.22185141],
       [ 1.76566564, 14.05929562],
       [ 1.48380901, 13.42244356],
       [ 2.46112535, 11.72436278],
       [ 1.10945793, 14.20156486],
       [ 1.73031807, 13.80489915],
       [ 2.15208115, 13.51989327],
       [ 2.07667999, 13.94155284],
       [ 2.1784963 ,

In [23]:
rmse_x = np.sqrt(mean_squared_error(y_test.values[:, 1], y_pred[:, 1]))
rmse_tc = np.sqrt(mean_squared_error(y_test.values[:, 0], y_pred[:, 0]))

In [24]:
rmse_x

8.154849510300615

In [25]:
rmse_tc

0.23914468589678947

In [26]:
multi_df = pd.DataFrame(columns = ['x_act', 'tc_act', 'x_pred', 'tc_pred'])

## Now Let's do it on the whole data

In [27]:
data

Unnamed: 0,x,temp,nf_hz,amp_mm,tc_act
0,5,22,40.970000,12.010000,0.00
1,5,22,38.460000,12.290000,0.50
2,5,22,38.360000,10.470000,0.51
3,5,22,38.380000,9.880000,0.52
4,5,22,37.480877,12.786667,0.75
...,...,...,...,...,...
175,25,200,32.472500,17.736250,1.50
176,25,200,30.274583,18.053958,1.75
177,25,200,28.076667,18.371667,2.00
178,25,200,25.878750,18.689375,2.25


In [28]:
# Shuffle the data
data_shuffled = data.sample(frac = 1)


# Split into target and features
X = data_shuffled.drop(['tc_act', 'x'], axis = 1)
y = data_shuffled[['tc_act', 'x']]

In [29]:
multi_ridge = RidgeCV()

multi_ridge.fit(X, y)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]))

In [30]:
multi_ridge.alpha_

10.0

In [31]:
y_pred = multi_ridge.predict(X)

In [32]:
y_pred

array([[ 0.65431563, 15.61586861],
       [ 2.77296462, 13.32480004],
       [ 0.56665673, 15.29234747],
       [ 2.86085574, 10.63701161],
       [ 2.47611683, 11.77003287],
       [ 1.26768165, 15.07281966],
       [ 0.35778531, 15.36515669],
       [ 0.65144759, 16.13935555],
       [ 1.04281957, 14.41190122],
       [ 1.01637738, 14.09102295],
       [ 2.35152485, 14.40534041],
       [ 0.68440406, 15.44850538],
       [ 0.89864476, 14.90198181],
       [ 0.61205343, 16.61829489],
       [ 1.7979565 , 15.02722579],
       [ 0.93168413, 15.14987815],
       [ 0.35778531, 15.36515669],
       [ 0.59248989, 16.90099022],
       [ 0.51011037, 15.64809515],
       [ 1.69105658, 12.8936918 ],
       [ 1.94093802, 13.01372152],
       [ 0.35829569, 15.33966566],
       [ 1.25018318, 13.67444966],
       [ 1.28452438, 13.5190989 ],
       [ 2.54688226, 15.2647713 ],
       [ 0.5982774 , 16.69034286],
       [ 0.83031128, 15.00412343],
       [ 0.71315629, 17.69834042],
       [ 2.80065649,

In [33]:
x_pred = y_pred[:, 1]
tc_pred = y_pred[:, 0]

In [34]:
multi_df = pd.DataFrame(columns = ['tc_act', 'x_act', 'tc_pred', 'x_pred'])

In [35]:
multi_df['tc_act'] = data_shuffled['tc_act']
multi_df['x_act'] = data_shuffled['x']
multi_df['tc_pred'] = tc_pred
multi_df['x_pred'] = x_pred
multi_df['temp'] = data_shuffled['temp']

In [36]:
multi_df

Unnamed: 0,tc_act,x_act,tc_pred,x_pred,temp
121,0.50,15,0.654316,15.615869,150
107,2.50,25,2.772965,13.324800,100
62,0.51,25,0.566657,15.292347,50
47,2.50,5,2.860856,10.637012,50
82,2.25,5,2.476117,11.770033,100
...,...,...,...,...,...
136,0.75,25,0.527876,17.082593,150
56,1.75,15,1.597938,13.318498,50
163,1.50,15,1.378285,14.161011,200
147,0.52,5,0.674495,16.749343,200


In [37]:
fig1 = alt.Chart(multi_df).mark_circle(opacity = 0.5).encode(
    x = alt.X('tc_act:Q', title = 'Actual Crack Depth (mm)'),
    y = alt.Y('tc_pred:Q', title = 'Predicted Crack Depth (mm)'),
    color = alt.Color('temp:O', title = 'Temperature (C)', scale = alt.Scale(scheme = 'reds'))
).properties(
    width = 800,
    height = 400,
    title = 'Multi-output Regression Prediction Accuracy (Aluminium) (Crack Depth)'
)

save(fig1, fp = str(FIGURES/'multi_output_crack_depth.png'), fmt = 'png', method = 'selenium', scale_factor = 6.0)

In [38]:
fig1

In [39]:
fig2 = alt.Chart(multi_df).mark_circle(opacity = 0.5).encode(
    x = alt.X('x_act:Q', title = 'Actual Crack Position (mm)'),
    y = alt.Y('x_pred:Q', title = 'Predicted Crack Position (mm)'),
    color = alt.Color('temp:O', title = 'Temperature (C)', scale = alt.Scale(scheme = 'reds'))
).properties(
    width = 800,
    height = 400,
    title = 'Multi-output Regression Prediction Accuracy (Aluminium) (Crack Position)'
)

save(fig2, fp = str(FIGURES/'multi_output_crack_position.png'), fmt = 'png', method = 'selenium', scale_factor = 6.0)

In [40]:
fig2

In [41]:
rmse_dict = {'Single Prediction': 0.179, 'Multi-Output Prediction': 0.209}

In [42]:
rmse_df = pd.DataFrame({'output': ['Single Prediction', 'Multi-Output Prediction'], 'RMSE': [0.179, 0.209]})

In [43]:
rmse_df

Unnamed: 0,output,RMSE
0,Single Prediction,0.179
1,Multi-Output Prediction,0.209


In [44]:
fig3 = alt.Chart(rmse_df).mark_bar(size = 100).encode(
    x = alt.X('output:N', title = 'Model Prediction Mode', axis = None),
    y = alt.Y('RMSE:Q', title = 'Root Mean Squared Error (mm)'),
    color = alt.Color('output:N', title = 'Prediction Mode')
).properties(
    width = 500,
    height = 400,
    title = 'Single vs Multi-Output Predictive Accuracy (Aluminium) (Crack Depth)'
)

save(fig3, fp = str(FIGURES/'multi_output_accuracy.png'), fmt = 'png', method = 'selenium', scale_factor = 6.0)

In [45]:
fig3

In [46]:
# Recreate ML vs original model image in altair (final_rmse_comparison.png)

comp_df = pd.DataFrame({'model': ['Referenced Polynomial Model', 'Multiple Ridge Regression Model'], 'rmse': [0.799, 0.179]})

fig4 = alt.Chart(comp_df).mark_bar(size = 100).encode(
    x = alt.X('model:N', title = 'Model', sort = '-y', axis = None),
    y = alt.Y('rmse:Q', title = 'Root Mean Squared Error (mm)', scale = alt.Scale(domain = (0.0, 1.0))),
    color = alt.Color('model:N', title = 'Model')
).properties(
    width = 500,
    height = 400,
    title = 'RMSE Comparison (Aluminium) (Crack Depth)'
)

save(fig4, fp = str(FIGURES/'final_rmse_comparison.png'), fmt = 'png', method = 'selenium', scale_factor = 6.0)

In [47]:
fig4

In [48]:
comp_df_b = pd.DataFrame({'model': ['Traditional Curve Fitting', 'Multiple Ridge Regression Model'], 'rmse': [0.799, 0.179]})

fig4b = alt.Chart(comp_df_b).mark_bar(size = 100).encode(
    x = alt.X('model:N', title = 'Model', sort = '-y', axis = None),
    y = alt.Y('rmse:Q', title = 'Root Mean Squared Error (mm)', scale = alt.Scale(domain = (0.0, 1.0))),
    color = alt.Color('model:N', title = 'Model')
).properties(
    width = 500,
    height = 400,
    title = 'RMSE Comparison (Aluminium) (Crack Depth)'
)

In [49]:
fig4b

In [50]:
save(fig4b, fp = str(FIGURES/'ml_vs_curve_fitting.png'), fmt = 'png', method = 'selenium', scale_factor = 6.0)