In [1]:
from src.config import FINAL_DATA, PROJECT_ROOT
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from src.models.model_metrics import score_model

import altair as alt
from altair_saver import save

In [14]:
from src.config import FIGURES

In [2]:
# Read the finalized dataset from the project's FINAL_DATA directory.
final_csv_path = FINAL_DATA / 'al_data_final.csv'
data = pd.read_csv(final_csv_path)

In [3]:
# Preview the loaded frame. The rendered output below shows the columns
# x, temp, nf_hz, amp_mm and tc_act, with the middle rows elided by pandas.
data

Unnamed: 0,x,temp,nf_hz,amp_mm,tc_act
0,5,22,40.970000,12.010000,0.00
1,5,22,38.460000,12.290000,0.50
2,5,22,38.360000,10.470000,0.51
3,5,22,38.380000,9.880000,0.52
4,5,22,37.480877,12.786667,0.75
...,...,...,...,...,...
175,25,200,32.472500,17.736250,1.50
176,25,200,30.274583,18.053958,1.75
177,25,200,28.076667,18.371667,2.00
178,25,200,25.878750,18.689375,2.25


In [4]:
# Separate the regression target (tc_act) from the predictors: y is the
# target column, X is every remaining column of the frame.
y = data['tc_act']
X = data.drop(columns = 'tc_act')

In [5]:
# Hold out 20% of the rows for testing, stratified on the `x` column so each
# value of `x` appears in the same proportion in both partitions.
# random_state pins the shuffle: without it every Restart & Run All draws a
# different split, so downstream scores and importances cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    shuffle = True,
    stratify = data['x'],
    random_state = 42,
)

In [6]:
# Fit a random forest regressor on the training partition.
# oob_score=True asks scikit-learn to also compute the out-of-bag score.
# random_state fixes the bootstrap sampling and split selection so the fitted
# model (and therefore its scores and feature importances) is reproducible
# across kernel restarts.
forest = RandomForestRegressor(oob_score = True, random_state = 42)
forest.fit(X_train, y_train)



RandomForestRegressor(oob_score=True)

In [7]:
# Score the fitted forest on the training partition with the project helper.
# Per the output below it prints a "Non-validation" RMSE and R^2, then five
# validation RMSEs with their mean, standard deviation and CoV, and returns
# those six values as a tuple.
# NOTE(review): the five validation RMSEs look like 5-fold cross-validation
# inside score_model -- confirm against src.models.model_metrics.
# NOTE(review): X_test / y_test are never scored anywhere in this view.
score_model(forest, X_train, y_train)

Non-validation Scores
-----------
RMSE (No Val): 0.058
R^2 (No Val): 0.994

Validation Scores
-----------
RMSE's: [0.125 0.157 0.171 0.157 0.162]
Mean: 0.154
StDev: 0.016
CoV: 0.1


(0.05753537176458936,
 0.9941547981505521,
 array([0.12516123, 0.15738622, 0.1709453 , 0.15671954, 0.16215219]),
 0.15447289577268364,
 0.015511414726068174,
 0.1004151223324911)

In [8]:
# Raw importance scores from the fitted forest. The output below holds four
# values (one per predictor column) that sum to 1.
forest.feature_importances_

array([0.00800505, 0.02473433, 0.93955137, 0.02770925])

In [9]:
# Display label for each predictor column. Keying the labels by column name,
# instead of relying on the position of entries in a hand-written list, keeps
# each label attached to the right importance even if the columns of the
# training frame are reordered.
feature_labels = {
    'x': 'Crack Location',
    'temp': 'Temperature',
    'nf_hz': 'Natural Frequency',
    'amp_mm': 'Amplitude',
}

# A KeyError here means the model was trained on a column that has no label.
features = [feature_labels[column] for column in X_train.columns]
importances = forest.feature_importances_

# zip() silently truncates to the shorter input, so fail loudly on a mismatch.
if len(features) != len(importances):
    raise ValueError(
        f'{len(features)} feature labels but {len(importances)} importances'
    )

# Mapping of display label -> importance score.
feature_importance_dict = dict(zip(features, importances))


In [10]:
# Show the label -> importance mapping built in the previous cell.
feature_importance_dict

{'Crack Location': 0.008005053356723303,
 'Temperature': 0.02473433212025866,
 'Natural Frequency': 0.9395513650223033,
 'Amplitude': 0.027709249500714718}

In [11]:
# Reshape the label -> importance mapping into a two-column frame
# (one row per feature), which is the form used for plotting below.
importance_rows = list(feature_importance_dict.items())
imp = pd.DataFrame(importance_rows, columns = ['feature', 'importance'])

In [12]:
# Display the feature-importance table (columns: feature, importance).
imp

Unnamed: 0,feature,importance
0,Crack Location,0.008005
1,Temperature,0.024734
2,Natural Frequency,0.939551
3,Amplitude,0.027709


In [13]:
# Horizontal bar chart of the importances: one bar per feature, with the
# features sorted by descending x value (sort = '-x').
chart = (
    alt.Chart(imp)
    .mark_bar()
    .encode(
        y = alt.Y('feature:N', title = 'Feature', sort = '-x'),
        x = alt.X('importance:Q', title = 'Relative Importance'),
    )
    .properties(
        title = 'Calculated Feature Importances in Predicting Crack Depth',
        width = 800,
        height = 200,
    )
)

# Render the chart as the cell output.
chart

In [15]:
# Export the chart as a PNG into the FIGURES directory using altair_saver's
# selenium method, with a scale factor of 6.
figure_path = str(FIGURES / 'feature_importances_high_res.png')
save(
    chart,
    fp = figure_path,
    fmt = 'png',
    method = 'selenium',
    scale_factor = 6.0,
)

## Compare to Linear Regression Coefficients

Let's use statsmodels for this, since it produces a more detailed summary of the fit.

In [None]:
import statsmodels.api as sm

In [None]:
# Re-display the full dataset for reference before fitting the linear model.
data

In [None]:
# Ordinary least squares of the target on all predictors, fitted on the full
# dataset (not the train split) so the coefficients can be inspected.
# statsmodels' OLS does not include an intercept unless one is added
# explicitly; without the constant column the fit is forced through the
# origin and the reported R^2 is the uncentered one.
X_with_const = sm.add_constant(X)
mod = sm.OLS(y, X_with_const)

res = mod.fit()

In [None]:
# Print the plain-text summary of the fitted OLS model.
print(res.summary())

Open question: could statsmodels also give us an easy way to read the coefficients off the polynomial ridge model?