##### <b><i>Google Colab</i></b> notebook to find which algorithm and parameters are most <br> suitable for a classification problem (faulty/not faulty) based on equipment monitoring data

In [2]:
# imports
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from pathlib import Path
import pickle

#### Load Dataset

In [3]:
# The dataset lives in the 'data' folder that sits beside the current working directory
folder_cwd = Path.cwd().parent.joinpath('data')
file = folder_cwd.joinpath('equipment_monitoring_data.csv')

# Load the comma-separated file into a DataFrame
dataset = pd.read_csv(file, sep=',')

In [4]:
# Preview the first five rows of the loaded dataset
dataset.head(n=5)

Unnamed: 0,temperature,pressure,vibration,humidity,faulty
0,58.18018,25.029278,0.606516,45.694907,0.0
1,75.740712,22.954018,2.338095,41.867407,0.0
2,71.358594,27.27683,1.389198,58.954409,0.0
3,71.616985,32.242921,1.77069,40.565138,0.0
4,66.506832,45.197471,0.345398,43.253795,0.0


#### Split the dataset into training and testing sets (holdout) and <br>choose cross-validation parameters

In [6]:
# Holdout configuration
test_size = 0.30  # fraction of the samples reserved for the test set
seed = 8  # random seed shared by the holdout split and the cross-validation folds

# Every column except the last is a feature; the last column is the target
array = dataset.values
X, y = array[:, :-1], array[:, -1]

# Shuffled holdout split, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    shuffle=True,
    random_state=seed,
    stratify=y,
)

# Cross-validation setup: stratified, shuffled folds scored by accuracy
scoring = 'accuracy'
num_folds = 10
kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)

### <b><i> Model and Inference </i></b>

#### Creation and validation of models: baseline

In [7]:
# Seed NumPy's global RNG from the notebook-wide `seed` (defined with the holdout
# settings) instead of repeating the literal, so the two values cannot drift apart.
# Estimators created below without an explicit random_state draw from this RNG.
np.random.seed(seed)

# Baseline candidates, all with default hyperparameters
models = [
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

# Per-model fold scores and the matching labels
results = []
names = []

# Cross-validate each candidate on the training split only
for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Report mean accuracy and standard deviation across the folds
    msg = f"{name}: {cv_results.mean():.5f} ({cv_results.std():.5f})"
    print(msg)

KNN: 0.97020 (0.00601)
CART: 0.96797 (0.00715)
NB: 0.98566 (0.00540)
SVM: 0.97374 (0.00609)


In [None]:
# Boxplot to compare the models

# One column of fold scores per estimator, keyed by the model's short name
dict_models = {label: scores for (label, _), scores in zip(models, results)}

# Long format (one row per estimator/fold score) is what px.box expects here
df_models = pd.DataFrame(dict_models).melt(
    var_name='Estimator', value_name='Cross validation score')

fig = px.box(df_models, x='Estimator',
             y='Cross validation score', title='Model Comparison')
fig.update_layout(width=550, height=400)
# Render explicitly, as the later comparison plot in this notebook does,
# instead of relying on a no-op update_layout() call as the cell's last expression
fig.show()

<blockquote> Comment: Among the chosen methods, at first glance,
Naive Bayes has the highest average cross-validation accuracy and the lowest standard deviation. </blockquote>

#### Model creation and evaluation: standardized and normalized data

In [8]:
# Seed NumPy's global RNG from the notebook-wide `seed` (defined with the holdout
# settings) instead of repeating the literal, so the two values cannot drift apart
np.random.seed(seed)

# Pipelines to evaluate, plus their fold scores and labels for the comparison plot
pipelines = []
results = []
names = []


# Pipeline building blocks, each a (step name, estimator) tuple

# Classifier algorithms
knn = ('KNN', KNeighborsClassifier())
cart = ('CART', DecisionTreeClassifier())
naive_bayes = ('NB', GaussianNB())
svm = ('SVM', SVC())

# Standardization and normalization preprocessing
standard_scaler = ('StandardScaler', StandardScaler())
min_max_scaler = ('MinMaxScaler', MinMaxScaler())


# Pipeline definitions: every classifier under every preprocessing variant.
# Order is preserved from the hand-written version: all '-orig' pipelines first,
# then '-std', then '-norm', each in KNN, CART, NB, SVM order.
classifier_steps = [knn, cart, naive_bayes, svm]
preprocessing_variants = [
    ('orig', []),                  # original dataset, no scaling step
    ('std', [standard_scaler]),    # standardized dataset
    ('norm', [min_max_scaler]),    # normalized dataset
]
for suffix, scaler_steps in preprocessing_variants:
    for classifier_step in classifier_steps:
        label = f"{classifier_step[0]}-{suffix}"
        pipelines.append((label, Pipeline(scaler_steps + [classifier_step])))

# Pipelines execution: the scaler is part of the pipeline, so it is fitted per fold
for name, model in pipelines:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = f"{name}: {cv_results.mean():.4f}({cv_results.std():.4f})"
    print(msg)

KNN-orig: 0.9702(0.0060)
CART-orig: 0.9680(0.0072)
NB-orig: 0.9857(0.0054)
SVM-orig: 0.9737(0.0061)
KNN-std: 0.9793(0.0055)
CART-std: 0.9680(0.0070)
NB-std: 0.9857(0.0054)
SVM-std: 0.9847(0.0056)
KNN-norm: 0.9786(0.0057)
CART-norm: 0.9678(0.0070)
NB-norm: 0.9857(0.0054)
SVM-norm: 0.9851(0.0053)


In [9]:
# Boxplot for model comparison

# Pair each pipeline label with its array of fold scores
labels = [label for label, _ in pipelines]
dict_models = dict(zip(labels, results))

# Reshape to long format: one row per (estimator, fold score)
df_models = pd.DataFrame(dict_models).melt(
    var_name='Estimator',
    value_name='Cross validation score',
)

fig = px.box(
    df_models,
    x="Estimator",
    y="Cross validation score",
    title='Model Comparison - Original, Standardized and Normalized dataset',
)
fig.update_layout(width=800, height=400)
fig.show()


<blockquote>After comparing the original, standardized and normalized datasets for each chosen classifier
(KNN, CART, NB and SVM), Naive Bayes still has the highest cross-validation accuracy, with a comparably low standard deviation. Its score is also unaffected by normalization and standardization on this dataset. </blockquote>

#### Hyperparameter optimization

In [10]:
# NB tuning

# Seed NumPy's global RNG from the notebook-wide `seed` (defined with the holdout
# settings) instead of repeating the literal, so the two values cannot drift apart
np.random.seed(seed)

pipelines = []

# Pipeline component definition
nb = ('NB', GaussianNB())
standard_scaler = ('StandardScaler', StandardScaler())
min_max_scaler = ('MinMaxScaler', MinMaxScaler())

# Naive Bayes on the original, standardized and normalized data
pipelines.append(('NB-orig', Pipeline(steps=[nb])))
pipelines.append(('NB-padr', Pipeline(steps=[standard_scaler, nb])))
pipelines.append(('NB-norm', Pipeline(steps=[min_max_scaler, nb])))

# Search var_smoothing over 100 log-spaced values from 1e0 down to 1e-9;
# the 'NB__' prefix routes the parameter to the pipeline step named 'NB'
param_grid_nb = {
    "NB__var_smoothing": np.logspace(0, -9, 100),
}

# Best var_smoothing per pipeline, in the same order as `pipelines`
# (index 0 -> 'NB-orig', 1 -> 'NB-padr', 2 -> 'NB-norm')
grid_param = []

# Prepare and execute GridSearchCV
for name, model in pipelines:
    grid = GridSearchCV(estimator=model, param_grid=param_grid_nb, scoring=scoring, cv=kfold)
    grid.fit(X_train, y_train)
    best_var_smoothing = grid.best_params_['NB__var_smoothing']
    # Print best configuration
    # NOTE: the .5f format displays values below roughly 5e-6 as 0.00000
    print(f"Best grid: {name}: {grid.best_score_:.5f} score with {best_var_smoothing:.5f} as parameter")
    # Store best parameter for each model
    grid_param.append(best_var_smoothing)

Best grid: NB-orig: 0.98603 score with 0.00012 as parameter
Best grid: NB-padr: 0.98603 score with 0.04329 as parameter
Best grid: NB-norm: 0.98603 score with 0.03511 as parameter


<blockquote>The best grid search results for the Naive Bayes classifier are the same for the original, standardized and normalized data. Hence, the original (unscaled) Naive Bayes with a smoothing parameter of 0.00012 was chosen to be carried forward to finalize the model.</blockquote>

### <i><b> Model Finalization </b></i>
##### Train on normalized vs. non-normalized dataset

In [18]:
# Final model evaluated on min-max normalized features.
# grid_param[0] is the var_smoothing found for the 'NB-orig' pipeline.
model = GaussianNB(var_smoothing=grid_param[0])
pipeline = Pipeline(steps=[("NB", model)])

# Fit the scaler on the training split only, then apply it to both splits
scaler = MinMaxScaler()
scaler.fit(X_train)
rescaledX_train = scaler.transform(X_train)  # normalized training features
rescaledX_test = scaler.transform(X_test)  # normalized test features

# Train on the normalized training data and report accuracy on the test split
pipeline.fit(rescaledX_train, y_train)
predictions = pipeline.predict(rescaledX_test)
print(accuracy_score(y_test, predictions))

0.9847958297132928


In [17]:
# Final model trained on the raw (non-normalized) features.
# grid_param[0] is the var_smoothing found for the 'NB-orig' pipeline.
model = GaussianNB(var_smoothing=grid_param[0])
pipeline = Pipeline(steps=[("NB", model)]).fit(X_train, y_train)

# Accuracy estimate on the held-out test split
predictions = pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, predictions)
print(test_accuracy)

0.9852302345786272


##### Save model and export golden dataset for test purpose

In [None]:
## Saving model
# Serialize the fitted classifier to ../models/nb_diagnosis_classifier.pkl.
file = Path.cwd().parent / 'models' / 'nb_diagnosis_classifier.pkl'
# The context manager closes the file even if pickling raises
with open(file, 'wb') as pickle_out:
    pickle.dump(model, pickle_out)

In [18]:
## Saving pipeline
# Serialize the fitted pipeline to ../pipeline/nb_diagnosis_pipeline.pkl.
file = Path.cwd().parent / 'pipeline' / 'nb_diagnosis_pipeline.pkl'
# The context manager closes the file even if pickling raises
with open(file, 'wb') as pickle_out:
    pickle.dump(pipeline, pickle_out)

In [19]:
# golden dataset export: the held-out test split, written in the original column layout
file_golden = Path.cwd().parent.joinpath('data', 'golden_dataset.csv')

# Re-attach the target as the last column so the columns line up with `dataset`
y_test_column = y_test.reshape(-1, 1)
test_array = np.concatenate([X_test, y_test_column], axis=1)
df_golden = pd.DataFrame(test_array, columns=dataset.columns)

df_golden.to_csv(file_golden, sep=',', index=False)

#### <b><i>Conclusions and General comments</i></b>

<blockquote>For the chosen dataset, the Naive Bayes classifier has the highest accuracy and a low standard deviation according to cross-validation. Also, neither standardization nor normalization improved the classifier's cross-validation accuracy, and normalizing the dataset did not improve the final test accuracy either. A possible explanation is that the dataset is simple, with few features, and that a Gaussian distribution is already close to the behaviour of this classification problem.</blockquote>

#### <b><i>Reflections on data Security</i></b>

<blockquote>If this were a dataset with personal or industrial information, some protection techniques should have been applied. Access to the dataset could be controlled with multi-factor authentication and CAPTCHA verification. Sensitive data that is not essential for the classification algorithm, such as the location and name of the company where the equipment is being monitored, should be anonymized. These are only some examples of data security measures. </blockquote>