# Notebook_13 Feature Removal

The feature importances seemed to indicate that natural frequency was the most important feature for predicting crack depth. This notebook tests that by removing one feature at a time and comparing the resulting model scores.

In [1]:
import altair as alt
import altair_data_server
from altair_saver import save
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

sns.set(style='darkgrid', context='notebook')
# Ask the inline backend for high-resolution ('retina') figures.
%config InlineBackend.figure_format = 'retina'

# Select Altair's 'data_server' data transformer.
# NOTE(review): presumably provided by the altair_data_server import above - confirm.
alt.data_transformers.enable('data_server')

DataTransformerRegistry.enable('data_server')

In [2]:
from src.config import FINAL_DATA, FIGURES
from src.models.model_metrics import score_model

In [3]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
# Load the dataset used for every experiment below; FINAL_DATA is imported from src.config.
data = pd.read_csv(FINAL_DATA/'al_data_final.csv')

In [5]:
data

Unnamed: 0,x,temp,nf_hz,amp_mm,tc_act
0,5,22,40.970000,12.010000,0.00
1,5,22,38.460000,12.290000,0.50
2,5,22,38.360000,10.470000,0.51
3,5,22,38.380000,9.880000,0.52
4,5,22,37.480877,12.786667,0.75
...,...,...,...,...,...
175,25,200,32.472500,17.736250,1.50
176,25,200,30.274583,18.053958,1.75
177,25,200,28.076667,18.371667,2.00
178,25,200,25.878750,18.689375,2.25


In [6]:
def custom_scorer(model, X, y, cv=None):
    """Score a fitted regressor on ``X``/``y`` with and without cross-validation.

    Parameters
    ----------
    model : estimator
        A regressor that has already been fitted; its ``predict`` method is
        called directly on ``X`` for the non-validated metrics.
    X : array-like
        Feature matrix to score on.
    y : array-like
        Target values matching ``X``.
    cv : int, cross-validation generator or None, default None
        Forwarded unchanged to ``cross_val_score``. ``None`` keeps
        scikit-learn's default splitting, which is what this function always
        used before the parameter existed.

    Returns
    -------
    tuple
        ``(no_val_rmse, no_val_r2, cv_mean, cv_std, cv_cov)`` where

        * ``no_val_rmse`` / ``no_val_r2`` are the RMSE and R^2 of
          ``model.predict(X)`` against ``y``,
        * ``cv_mean`` / ``cv_std`` are the mean and standard deviation of the
          per-fold cross-validated RMSE,
        * ``cv_cov`` is ``cv_std / cv_mean``.
    """
    # The scorer returns negated MSE per fold, so flip the sign before the root.
    fold_neg_mse = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=cv)
    fold_rmse = np.sqrt(-fold_neg_mse)

    # Predict once and reuse the result for both non-validated metrics.
    predictions = model.predict(X)
    no_val_rmse = np.sqrt(mean_squared_error(y, predictions))
    no_val_r2 = r2_score(y, predictions)

    cv_mean = np.mean(fold_rmse)
    cv_std = np.std(fold_rmse)
    cv_cov = cv_std / cv_mean

    return no_val_rmse, no_val_r2, cv_mean, cv_std, cv_cov
    

In [7]:
def add_to_df(scores: list, feat: str, df = None):
    """Return the results table with one extra row describing ``feat``.

    Parameters
    ----------
    scores : list
        Five values in the order ``no_val_rmse, no_val_r2, cv_mean, cv_std,
        cv_cov``; they are read by position (``scores[0]`` .. ``scores[4]``).
    feat : str
        Label stored in the ``'features'`` column for this run.
    df : pandas.DataFrame or None, default None
        Table to extend. When omitted, the notebook-level ``feat_comp`` table
        is used, which is what this function always did before the parameter
        existed.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the row appended and the
        index renumbered from 0.
    """
    new_row = pd.DataFrame({
        'features': [feat],
        'no_val_rmse': [scores[0]],
        'no_val_r2': [scores[1]],
        'cv_mean': [scores[2]],
        'cv_std': [scores[3]],
        'cv_cov': [scores[4]],
    })

    if df is None:
        # Fall back to the module-level results table.
        base = feat_comp
    else:
        base = df

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement. An empty starting table is left out of the concat so its
    # all-object columns do not influence the dtypes of the result.
    frames = [frame for frame in (base, new_row) if not frame.empty]
    return pd.concat(frames, ignore_index = True)
    

## All Features

First, let's train a nice, stable model (a Ridge regressor) on the data with all of the features included.

In [8]:
# Features are every column except the target, tc_act.
X = data.drop(columns = 'tc_act')
y = data['tc_act']

# Seed the shuffle so Restart & Run All reproduces the same 80/20 train/test rows;
# without it the scores below change on every execution.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = 42)

In [9]:
# Ridge regression (alpha = 0.1) fitted on the training split with every feature present.
all_feats = Ridge(alpha = 0.1)

all_feats.fit(X_train, y_train)

Ridge(alpha=0.1)

In [10]:
# Collect the five metrics returned by custom_scorer into a list.
scores = list(custom_scorer(all_feats, X_train, y_train))

In [11]:
scores

[0.1714072184195415,
 0.947892609459517,
 0.17605369447593205,
 0.04498147916491116,
 0.2554986380649938]

Let's make a DataFrame to keep track of the scores from the different runs.

In [12]:
# One empty column per metric; rows are added later, one per experiment.
data_dict = {
    column: []
    for column in ('features', 'no_val_rmse', 'no_val_r2', 'cv_mean', 'cv_std', 'cv_cov')
}

feat_comp = pd.DataFrame(data_dict)

In [13]:
feat_comp

Unnamed: 0,features,no_val_rmse,no_val_r2,cv_mean,cv_std,cv_cov


In [14]:
# Record the baseline run. Not idempotent: re-running this cell adds another 'All Features' row.
feat_comp = add_to_df(scores = scores, feat = 'All Features')

In [15]:
feat_comp

Unnamed: 0,features,no_val_rmse,no_val_r2,cv_mean,cv_std,cv_cov
0,All Features,0.171407,0.947893,0.176054,0.044981,0.255499


## Removing Amplitude

In [16]:
# Drop the target and the amplitude feature.
X = data.drop(columns = ['tc_act', 'amp_mm'])
y = data['tc_act']

# Seed the shuffle so Restart & Run All reproduces the same 80/20 train/test rows;
# without it the scores below change on every execution.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = 42)

In [17]:
# Same Ridge settings (alpha = 0.1), fitted on the training split without amp_mm.
no_amp = Ridge(alpha = 0.1)

no_amp.fit(X_train, y_train)

Ridge(alpha=0.1)

In [18]:
# Collect the five metrics returned by custom_scorer into a list.
scores = list(custom_scorer(no_amp, X_train, y_train))

In [19]:
# Record this run. Not idempotent: re-running this cell adds another 'Removed Amplitude' row.
feat_comp = add_to_df(scores = scores, feat = 'Removed Amplitude')

In [20]:
feat_comp

Unnamed: 0,features,no_val_rmse,no_val_r2,cv_mean,cv_std,cv_cov
0,All Features,0.171407,0.947893,0.176054,0.044981,0.255499
1,Removed Amplitude,0.200248,0.931996,0.205645,0.008619,0.041913


## Removing x

In [21]:
# Drop the target and the x feature.
X = data.drop(columns = ['tc_act', 'x'])
y = data['tc_act']

# Seed the shuffle so Restart & Run All reproduces the same 80/20 train/test rows;
# without it the scores below change on every execution.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = 42)

In [22]:
# Same Ridge settings (alpha = 0.1), fitted on the training split without x.
no_x = Ridge(alpha = 0.1)

no_x.fit(X_train, y_train)

Ridge(alpha=0.1)

In [23]:
# Collect the five metrics returned by custom_scorer into a list.
scores = list(custom_scorer(no_x, X_train, y_train))

In [24]:
# Record this run. Not idempotent: re-running this cell adds another 'Removed Crack Location' row.
feat_comp = add_to_df(scores = scores, feat = 'Removed Crack Location')

In [25]:
feat_comp

Unnamed: 0,features,no_val_rmse,no_val_r2,cv_mean,cv_std,cv_cov
0,All Features,0.171407,0.947893,0.176054,0.044981,0.255499
1,Removed Amplitude,0.200248,0.931996,0.205645,0.008619,0.041913
2,Removed Crack Location,0.202331,0.930058,0.209188,0.022491,0.107516


## Removing Temp

In [26]:
# Drop the target and the temperature feature.
X = data.drop(columns = ['tc_act', 'temp'])
y = data['tc_act']

# Seed the shuffle so Restart & Run All reproduces the same 80/20 train/test rows;
# without it the scores below change on every execution.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = 42)

# Same Ridge settings (alpha = 0.1), fitted on the training split without temp.
no_temp = Ridge(alpha = 0.1)

no_temp.fit(X_train, y_train)

scores = list(custom_scorer(no_temp, X_train, y_train))

# Not idempotent: re-running this cell adds another 'Removed Temp' row.
feat_comp = add_to_df(scores = scores, feat = 'Removed Temp')

feat_comp

Unnamed: 0,features,no_val_rmse,no_val_r2,cv_mean,cv_std,cv_cov
0,All Features,0.171407,0.947893,0.176054,0.044981,0.255499
1,Removed Amplitude,0.200248,0.931996,0.205645,0.008619,0.041913
2,Removed Crack Location,0.202331,0.930058,0.209188,0.022491,0.107516
3,Removed Temp,0.24685,0.892968,0.255884,0.022339,0.087302


## Removing Natural Frequency

In [27]:
# Drop the target and the natural-frequency feature.
X = data.drop(columns = ['tc_act', 'nf_hz'])
y = data['tc_act']

# Seed the shuffle so Restart & Run All reproduces the same 80/20 train/test rows;
# without it the scores below change on every execution.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = 42)

# Same Ridge settings (alpha = 0.1), fitted on the training split without nf_hz.
no_nf = Ridge(alpha = 0.1)

no_nf.fit(X_train, y_train)

scores = list(custom_scorer(no_nf, X_train, y_train))

# Not idempotent: re-running this cell adds another 'Removed Natural Frequency' row.
feat_comp = add_to_df(scores = scores, feat = 'Removed Natural Frequency')

feat_comp

Unnamed: 0,features,no_val_rmse,no_val_r2,cv_mean,cv_std,cv_cov
0,All Features,0.171407,0.947893,0.176054,0.044981,0.255499
1,Removed Amplitude,0.200248,0.931996,0.205645,0.008619,0.041913
2,Removed Crack Location,0.202331,0.930058,0.209188,0.022491,0.107516
3,Removed Temp,0.24685,0.892968,0.255884,0.022339,0.087302
4,Removed Natural Frequency,0.354971,0.788145,0.36733,0.05631,0.153297


In [28]:
# Bar chart of the mean cross-validated RMSE for each feature set.
# sort = '-y' orders the bars by descending y value.
feature_axis = alt.X(
    'features:N',
    title = 'Feature',
    sort = '-y',
    axis = alt.Axis(labelAngle = -45),
)
rmse_axis = alt.Y('cv_mean:Q', title = 'Mean Cross Validation RMSE (mm)')

chart = (
    alt.Chart(feat_comp)
    .mark_bar()
    .encode(x = feature_axis, y = rmse_axis)
    .properties(width = 500, height = 400, title = 'Feature Removal Test Results')
)

# Write a PNG copy of the chart into the project's FIGURES directory.
output_path = str(FIGURES/'feature_removal_test_results.png')
save(chart, fp = output_path, fmt = 'png', method = 'selenium', scale_factor = 6.0)

In [29]:
chart