## FPoliSolutions, LLC; Asset Monitoring and Predictive Maintenance

## Performance and validation

## Arnab Dey Sarkar

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import StratifiedKFold

In [16]:
import statsmodels.formula.api as smf
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [17]:
df = pd.read_csv("training_data.csv")

In [18]:
df=df.dropna().copy()

In [19]:
input_names = df.drop(columns=['Y']).columns
lf= df.copy().\
reset_index().\
rename(columns={'index': 'rowid'}).\
melt(id_vars=['rowid', 'Y'], value_vars=input_names, ignore_index=True)

In [20]:
high_skew_inputs = input_names[ np.abs(df.drop(columns=['Y']).skew()) > 1 ]

In [21]:
high_skew_inputs

Index(['Z07', 'Z08', 'Z09', 'V02', 'V11', 'V27', 'V28', 'V29'], dtype='object')

In [22]:
# check if the values are negative because the log and square root transformations cannot be applied to negative values
lf.loc[ (lf.variable.isin(high_skew_inputs)) & (lf.value < 0), :].groupby(['variable']).size()

variable
V02    110
V11    119
V29    215
Z09    136
dtype: int64

In [23]:
lf_skew = lf.loc[ lf.variable.isin(high_skew_inputs), :].copy()

lf_skew = lf_skew.loc[ ~lf_skew.variable.isin(['V02', 'V11', 'V29', 'Z09']), :].copy()

In [24]:
lf_skew['shift_value'] = 1 - lf_skew.value
lf_skew['log_shift_value'] = np.log( lf_skew.shift_value )

In [25]:
# Convert to short format using pivot
data = lf_skew.pivot(index='rowid', columns='variable', values='log_shift_value').reset_index()
data = data.drop(columns=['rowid'])

In [26]:
df_transformed = df.copy()
df_transformed = df_transformed.drop(columns=['Z07','Z08','V27','V28'])


In [27]:
df_transformed['Z07'] = data['Z07']
df_transformed['Z08'] = data['Z08']
df_transformed['V27'] = data['V27']
df_transformed['V28'] = data['V28']

**First Model**

Let us first start with Elastic net on the entire feature variables. We will apply the elastic net regularization.

In [28]:
xinputs = df.drop(columns=['Y'].copy())
youtput = df.loc[:, ['Y']].copy()

In [29]:
#Convert the DataFrames to NumPy arrays.
x_train = xinputs.to_numpy()
y_train = youtput.Y.to_numpy().ravel()

In [31]:
enet_to_fit = LogisticRegression(penalty='elasticnet', solver='saga', random_state=202, max_iter=25001, fit_intercept=True)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=101)

In [32]:
kf.get_n_splits()

5

In [33]:
enet_wflow_df = Pipeline(steps=[('std_inputs', StandardScaler()), 
                             ('enet', enet_to_fit)] )

In [35]:
enet_wflow_df

In [36]:
enet_grid = {'enet__C': np.exp( np.linspace(-10, 10, num=17) ),
             'enet__l1_ratio': np.linspace(0, 1, num=3)}

In [37]:
enet_search = GridSearchCV(enet_wflow_df, param_grid=enet_grid, cv=kf)

In [38]:
enet_search_results = enet_search.fit( x_train, y_train )

In [39]:
enet_search_results.best_params_

{'enet__C': 0.0820849986238988, 'enet__l1_ratio': 0.0}

Basically Ridge because of the correlation factor. This indicates that we should try PCA as well.

In [40]:
enet_search_results.best_score_

0.7533333333333333

**Second Model(Improved version)**

We will apply PCA to the original inputs and cosider additive version. We will use lasso because we are anyway going to apply PCA which will make the features uncorrelated and lasso will help reducing the number of effective features. We can start with elastic net as well but we are trying to reduce the complexity.

In [41]:
lasso_to_fit = LogisticRegression(penalty='l1', solver='saga', random_state=202, max_iter=25001, fit_intercept=True)

In [42]:
pc_wflow_df = Pipeline( steps=[('std_inputs', StandardScaler()), 
                                  ('pca', PCA()),
                              ('lasso', lasso_to_fit)] )

In [43]:
pc_wflow_df 

In [44]:
pc_grid = {'pca__n_components': [3, 5, 7, 9, 11, 13, 15, 17],
                 'lasso__C': np.exp( np.linspace(-10, 10, num=17) ) }

In [45]:
pc_df_search = GridSearchCV(pc_wflow_df, param_grid=pc_grid, cv=kf)

In [46]:
pc_df_search_results = pc_df_search.fit( x_train, y_train )

In [47]:
pc_df_search_results.best_params_

{'lasso__C': 42.52108200006278, 'pca__n_components': 13}

In [48]:
pc_df_search_results.best_score_

0.775959595959596

The accuracy increased a bit. Now further modification.

**Third Model** 
Apply not only PCA but also interaction.

In [49]:
xinputs_transformed = df_transformed.select_dtypes('number').copy()
youtput_transformed = df_transformed.loc[:, ['Y']].copy()

In [50]:
#Convert the DataFrames to NumPy arrays.
x_train_transformed = xinputs_transformed.to_numpy()
y_train_transformed = youtput_transformed.Y.to_numpy().ravel()

In [51]:
PolynomialFeatures(degree=2, interaction_only=True, include_bias=False).fit_transform( x_train_transformed ).shape

(223, 2080)

In [52]:
make_pairs = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

In [53]:
pc_interact_lasso_wflow = Pipeline( steps=[('std_inputs', StandardScaler() ), 
                                           ('pca', PCA() ), 
                                           ('make_pairs', make_pairs), 
                                           ('lasso', lasso_to_fit )] )

In [54]:
pc_interact_lasso_wflow

In [55]:
pc_interact_lasso_search = GridSearchCV( pc_interact_lasso_wflow, param_grid=pc_grid, cv=kf )

In [56]:
pc_interact_lasso_search_grid = pc_interact_lasso_search.fit( x_train_transformed, y_train_transformed )

In [57]:
pc_interact_lasso_search_grid.best_params_

{'lasso__C': 0.2865047968601901, 'pca__n_components': 11}

In [58]:
coef = pc_interact_lasso_search_grid.best_estimator_.named_steps['lasso'].coef_

In [59]:
non_empty_elements = coef[coef != 0]

In [None]:
#How many regression coefficients are associated with the best model?
non_empty_elements.size

41

In [60]:
# model 3-Apply PCA to the transformed inputs and create all pairwise interactions between the PCs
pc_interact_lasso_search_grid.best_score_

0.8387878787878786

In [61]:
# Model 2 - Apply PCA to the original inputs and create linear additive features 
pc_df_search_results.best_score_

0.775959595959596

In [62]:
# Model 1 - Linear additive features using the original inputs
enet_search_results.best_score_

0.7533333333333333

Which model is the BEST according to CROSS-VALIDATION?

Ans: The third model or mod_07 where we applied PCA to the transformed inputs and create all pairwise interactions between the PCs is the best model

• Is this model DIFFERENT from the model identified as the BEST according
to the training set?

Ans: No

• How many regression coefficients are associated with the best model?

Ans: 41

Note : I used pipeline so to decide the accuracy.