## FPoliSolutions, LLC; Asset Monitoring and Predictive Maintenance

## Bonus
### Arnab Dey Sarkar

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegressionCV

In [3]:
import statsmodels.formula.api as smf
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from patsy import dmatrices

In [4]:
df = pd.read_csv("training_data.csv")

#### Loading data

In [5]:
df=df.dropna().copy()

In [6]:
input_names = df.drop(columns=['Y']).columns
lf= df.copy().\
reset_index().\
rename(columns={'index': 'rowid'}).\
melt(id_vars=['rowid', 'Y'], value_vars=input_names, ignore_index=True)

In [7]:
high_skew_inputs = input_names[ np.abs(df.drop(columns=['Y']).skew()) > 1 ]

In [8]:
high_skew_inputs

Index(['Z07', 'Z08', 'Z09', 'V02', 'V11', 'V27', 'V28', 'V29'], dtype='object')

In [9]:
# check if the values are negative because the log and square root transformations cannot be applied to negative values
lf.loc[ (lf.variable.isin(high_skew_inputs)) & (lf.value < 0), :].groupby(['variable']).size()

variable
V02    110
V11    119
V29    215
Z09    136
dtype: int64

In [10]:
lf_skew = lf.loc[ lf.variable.isin(high_skew_inputs), :].copy()

lf_skew = lf_skew.loc[ ~lf_skew.variable.isin(['V02', 'V11', 'V29', 'Z09']), :].copy()

In [11]:
lf_skew['shift_value'] = 1 - lf_skew.value
lf_skew['log_shift_value'] = np.log( lf_skew.shift_value )

In [12]:
# Convert to short format using pivot
data = lf_skew.pivot(index='rowid', columns='variable', values='log_shift_value').reset_index()
data = data.drop(columns=['rowid'])

In [13]:
df_transformed = df.copy()
df_transformed = df_transformed.drop(columns=['Z07','Z08','V27','V28'])


In [14]:
df_transformed['Z07'] = data['Z07']
df_transformed['Z08'] = data['Z08']
df_transformed['V27'] = data['V27']
df_transformed['V28'] = data['V28']

**Complex Model** PCA and interaction

In [15]:
xinputs_transformed = df_transformed.select_dtypes('number').copy()
youtput_transformed = df_transformed.loc[:, ['Y']].copy()

In [16]:
#Convert the DataFrames to NumPy arrays.
x_train_transformed = xinputs_transformed.to_numpy()
y_train_transformed = youtput_transformed.Y.to_numpy().ravel()

In [17]:
PolynomialFeatures(degree=2, interaction_only=True, include_bias=False).fit_transform( x_train_transformed ).shape

(223, 2080)

In [18]:
make_pairs = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

In [19]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=9483156)

# Ridge

* Apply the RIDGE penalty to the MOST complex model you fit even if CROSS-VALIDATION says it is NOT the best model.
* Train and tune a RIDGE penalized model via CROSS-VALIDATION.
* Examine the final tuned coefficient estimates.

In [37]:
ridge_tune = LogisticRegressionCV(penalty='l2',Cs=101, solver='lbfgs',
                                  max_iter=25001, cv=kf, fit_intercept=False)

In [21]:
pc_interact_ridge_wflow = Pipeline( steps=[('std_inputs', StandardScaler() ), 
                                           ('pca', PCA() ), 
                                           ('make_pairs', make_pairs), 
                                           ('ridge', ridge_tune )] )

In [22]:
pc_interact_ridge_wflow

In [23]:
ridge_grid = {'pca__n_components': [3, 5, 7, 9, 11, 13, 15, 17]}  

In [24]:
ridge_search = GridSearchCV(pc_interact_ridge_wflow, param_grid=ridge_grid, cv=kf)

In [25]:
ridge_search_results = ridge_search.fit( x_train_transformed, y_train_transformed )

In [26]:
#The optimal no. of pca components is 
ridge_search_results.best_params_

{'pca__n_components': 15}

In [27]:
#The optimal value for C
ridge_search_results.best_estimator_.named_steps['ridge'].C_

array([0.03630781])

In [28]:
ridge_search_results.best_score_

0.8341414141414141

In [29]:
ridge_search_results.best_estimator_.named_steps['ridge'].Cs_

array([1.00000000e-04, 1.20226443e-04, 1.44543977e-04, 1.73780083e-04,
       2.08929613e-04, 2.51188643e-04, 3.01995172e-04, 3.63078055e-04,
       4.36515832e-04, 5.24807460e-04, 6.30957344e-04, 7.58577575e-04,
       9.12010839e-04, 1.09647820e-03, 1.31825674e-03, 1.58489319e-03,
       1.90546072e-03, 2.29086765e-03, 2.75422870e-03, 3.31131121e-03,
       3.98107171e-03, 4.78630092e-03, 5.75439937e-03, 6.91830971e-03,
       8.31763771e-03, 1.00000000e-02, 1.20226443e-02, 1.44543977e-02,
       1.73780083e-02, 2.08929613e-02, 2.51188643e-02, 3.01995172e-02,
       3.63078055e-02, 4.36515832e-02, 5.24807460e-02, 6.30957344e-02,
       7.58577575e-02, 9.12010839e-02, 1.09647820e-01, 1.31825674e-01,
       1.58489319e-01, 1.90546072e-01, 2.29086765e-01, 2.75422870e-01,
       3.31131121e-01, 3.98107171e-01, 4.78630092e-01, 5.75439937e-01,
       6.91830971e-01, 8.31763771e-01, 1.00000000e+00, 1.20226443e+00,
       1.44543977e+00, 1.73780083e+00, 2.08929613e+00, 2.51188643e+00,
      

# Lasso
* Apply the Lasso penalty to the MOST complex model you fit even if CROSS-VALIDATION says it is NOT the best model.
* Train and tune a LASSO penalized model via CROSS-VALIDATION.
* Examine the final tuned coefficients estimates – are any zero?

In [31]:
lasso_to_fit = LogisticRegression(penalty='l1', solver='saga', random_state=202, max_iter=25001, fit_intercept=True)

In [32]:
pc_interact_lasso_wflow = Pipeline( steps=[('std_inputs', StandardScaler() ), 
                                           ('pca', PCA() ), 
                                           ('make_pairs', make_pairs), 
                                           ('lasso', lasso_to_fit )] )

In [33]:
pc_interact_lasso_wflow

In [34]:
lasso_grid = {'pca__n_components': [3, 5, 7, 9, 11, 13, 15, 17],
                 'lasso__C': np.exp( np.linspace(-10, 10, num=17) ) }

In [35]:
pc_interact_lasso_search = GridSearchCV( pc_interact_lasso_wflow, param_grid=lasso_grid, cv=kf )

In [36]:
pc_interact_lasso_search_grid = pc_interact_lasso_search.fit( x_train_transformed, y_train_transformed )

In [38]:
#The optimal value for C and no. of pca components is 
pc_interact_lasso_search_grid.best_params_

{'lasso__C': 3.4903429574618414, 'pca__n_components': 17}

In [39]:
pc_interact_lasso_search_grid.best_score_

0.8385858585858585

In [40]:
#Examine the final tuned coefficients estimates – are any zero?
pc_interact_lasso_search_grid.best_estimator_.named_steps['lasso'].coef_
coef = pc_interact_lasso_search_grid.best_estimator_.named_steps['lasso'].coef_

In [41]:
#Examine the final tuned coefficients estimates – are any zero?
empty_elements = coef[coef == 0]
empty_elements.size

8

There are 8 zero coefficients.

# Elastic net
* Apply the ELASTIC NET penalty to the MOST complex model you fit even if CROSS-VALIDATION says it is NOT the best model.
* Train and tune an ELASTIC NET penalized model by optimizing BOTH tuning parameters via CROSS-VALIDATION.
* Examine the final tuned coefficients estimates. Is the tuned model closer to RIDGE or LASSO? Are the final tuned coefficients zero?

In [42]:
enet_to_fit = LogisticRegression(penalty='elasticnet', solver='saga',
                            random_state=202, max_iter=25001, fit_intercept=True)

In [43]:
pc_interact_enet_wflow = Pipeline( steps=[('std_inputs', StandardScaler() ), 
                                           ('pca', PCA() ), 
                                           ('make_pairs', make_pairs), 
                                           ('enet', enet_to_fit )] )

In [44]:
enet_grid = {'pca__n_components': [3, 5, 7, 9, 11, 13, 15, 17],
             'enet__C': np.exp( np.linspace(-10, 10, num=17)),
             'enet__l1_ratio': np.linspace(0, 1, num=3)}

In [45]:
pc_df_enet_search = GridSearchCV(pc_interact_enet_wflow, param_grid=enet_grid, cv=kf)

In [46]:
pc_df__enet_search_results = pc_df_enet_search.fit( x_train_transformed, y_train_transformed )

In [47]:
#The optimal value for C and no. of pca components is 
pc_df__enet_search_results.best_params_

{'enet__C': 0.0820849986238988, 'enet__l1_ratio': 0.5, 'pca__n_components': 13}

Is the tuned model closer to RIDGE or LASSO? 
Elastic net converted to RIDGE 


In [48]:
pc_df__enet_search_results.best_score_

0.847171717171717

Are the final tuned coefficients zero?

In [49]:
coef = pc_df__enet_search_results.best_estimator_.named_steps['enet'].coef_
print(coef)

[[-4.53845248e-01  1.47427496e-01 -1.94881960e-01 -3.44394247e-01
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  1.69535094e-01  0.00000000e+00
  -4.54154127e-01  0.00000000e+00 -9.89416723e-03 -7.39555218e-02
  -3.83102331e-03  9.99220618e-03 -2.96956430e-02  0.00000000e+00
  -2.40718588e-04 -5.77318060e-02  5.97884519e-02 -6.66107001e-03
  -6.70142466e-02  1.30810857e-02  0.00000000e+00  1.73396628e-02
  -1.69113372e-02 -6.73004737e-02 -5.28155814e-02  9.96589682e-02
  -2.56393971e-02  0.00000000e+00  0.00000000e+00  3.50440507e-02
  -5.36835720e-02 -9.23782654e-02  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   4.28510515e-03  2.78109857e-02 -1.55649449e-02  0.00000000e+00
   0.00000000e+00  0.00000000e+00  1.11805385e-01 -8.81721199e-03
  -1.32205799e-01 -4.39692379e-03  0.00000000e+00  0.00000000e+00
   8.51207298e-03  0.00000000e+00  0.00000000e+00  5.89032529e-02
   0.00000

In [50]:
empty_elements = coef[coef == 0]
empty_elements.size

47

There are 47 zero final tuned coefficients 

# Model of your choice
* We have focused on (generalized) linear models (linear regression and logistic regression).
* You may select a non-linear model of your choice, such as Support Vector Machines, Random forests, or neural networks.
* The syntax for fitting them is VERY similar to the scikit-learn linear model and logistic regression syntax (Module 13 and Module 14).
* Use cross-validation to tune and assess performance of that model.

#### SVM/SVC

In [60]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

In [52]:
svm_model = SVC()

svm_param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

svm_grid_search = GridSearchCV(svm_model, svm_param_grid, cv=5, scoring='accuracy')


In [57]:
svm_result=svm_grid_search.fit(x_train_transformed, y_train_transformed)

In [58]:
svm_result.best_params_

{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}

In [59]:
svm_result.best_score_

1.0

In [61]:
svm_cross_val_scores = cross_val_score(svm_grid_search.best_estimator_, x_train_transformed, y_train_transformed, cv=5, scoring='accuracy')
print("SVM Cross-Validation Scores:", svm_cross_val_scores)
print("SVM Mean Cross-Validation Score:", svm_cross_val_scores.mean())

SVM Cross-Validation Scores: [1. 1. 1. 1. 1.]
SVM Mean Cross-Validation Score: 1.0


Neural Network

In [62]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor  # For regression tasks
from sklearn.ensemble import RandomForestClassifier  # For classification tasks
from sklearn.datasets import load_iris

In [65]:
# Appropriate model based on our task (regression/classification) is 
# RandomForestClassifier for classification(RandomForestRegressor for regression )
model = RandomForestClassifier()

# Define the parameter grid for tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')  # Use appropriate scoring for your task

# Fit the grid search to your data
grid_search.fit(x_train_transformed, y_train_transformed)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)



Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}


In [64]:
# Assess performance using cross-validation
cross_val_scores = cross_val_score(grid_search.best_estimator_, x_train_transformed, y_train_transformed, cv=5, scoring='accuracy')  # Use appropriate scoring
print("Cross-Validation Scores:", cross_val_scores)
print("Mean Cross-Validation Score:", cross_val_scores.mean())

Cross-Validation Scores: [1.         1.         0.95555556 0.90909091 1.        ]
Mean Cross-Validation Score: 0.972929292929293
