# Modelado
En este notebook se van a crear varios modelos predictivos basados a los que se aplicará un proceso de tuning basado en grid search. Estos modelos serán comparados y evaluados teniendo en cuenta las métricas establecidas en fases anteriores. Finalmente, se escogera el modelo que obtenga mejores resultados y se exportará para su despliegue en la plataforma de mantenimiento predictivo.

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn import preprocessing
from datetime import datetime

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import joblib


#Funcion auxiliar para calcular estadisticos sobre los modelo entrenados
def get_confusion_matrix_values(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return(cm[0][0], cm[0][1], cm[1][0], cm[1][1])

def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

## Conjuntos de entrenamiento y prueba
Para entrenar y comprobar la validez de los modelos hay que crear un cojunto de entrenamiento y otro de validación. Para realizar esta tarea hay varias estrategias como seleccionar varias muestras de forma aleatoria, o realizar un muestreo estatificado para asegurarse de que hay casos que representen todas las clases y se mantiene la proporción. Sin embargo en este caso se trata de una serie temporal, por lo que creo que lo más realista y representativo sería tener en cuenta los valores de forma secuencial, tal y como se han recogido. Por tanto, voy a tomar un 60% de las muestras para el entrenamiento y el resto lo dejaré para la validación.

In [2]:
# Cargamos los datos
dataset = pd.read_csv('../datasets/water_pump/sensor.csv')
dataset.loc[((dataset.machine_status == 'BROKEN') | (dataset.machine_status == 'RECOVERING')), 'machine_status'] = 'BROKEN'
# Eliminamos los datos de sensor 15 que solo contienen valores nulos y del indice autoincremental
train = dataset[0:round(len(dataset.index)*0.6)]
test = dataset[round(len(dataset.index)*0.6)+1:]

columns_to_drop = ['sensor_15','Unnamed: 0','timestamp']
numerical_col = ['sensor_00','sensor_01','sensor_02','sensor_03','sensor_04','sensor_05','sensor_06','sensor_07','sensor_08','sensor_09','sensor_10','sensor_11','sensor_12','sensor_13','sensor_14','sensor_16','sensor_17','sensor_18','sensor_19','sensor_20','sensor_21','sensor_22','sensor_23','sensor_24','sensor_25','sensor_26','sensor_27','sensor_28','sensor_29','sensor_30','sensor_31','sensor_32','sensor_33','sensor_34','sensor_35','sensor_36','sensor_37','sensor_38','sensor_39','sensor_40','sensor_41','sensor_42','sensor_43','sensor_44','sensor_45','sensor_46','sensor_47','sensor_48','sensor_49','sensor_50','sensor_51']
target = ['machine_status']

imputer = ColumnTransformer(
    remainder='passthrough',
    transformers=[('imputer', SimpleImputer(strategy='mean'), numerical_col)])

scaler = StandardScaler()
pca = PCA(n_components=8)

pca_pipe = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler',scaler),
            ('pca',pca)])
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[('drop_col','drop',columns_to_drop),
                  ('pca',pca_pipe,numerical_col),
                 ('encoding', OneHotEncoder(),target)])

# k_best = SelectKBest(chi2, k=8)

In [4]:
# Cargamos los datos
dataset = pd.read_csv('../datasets/water_pump/sensor.csv')
dataset.loc[((dataset.machine_status == 'BROKEN') | (dataset.machine_status == 'RECOVERING')), 'machine_status'] = 'BROKEN'
round(len(dataset.index)*0.6)
train = dataset[0:round(len(dataset.index)*0.6)]
test = dataset[round(len(dataset.index)*0.6):]
#train
# test


In [33]:
aux = ''
for i in range(0,52):
    if i<10:
        aux += "\'sensor_0"+ str(i)+"\',"
    else:
        aux += "\'sensor_"+ str(i) +"\',"
print(aux)

'sensor_00','sensor_01','sensor_02','sensor_03','sensor_04','sensor_05','sensor_06','sensor_07','sensor_08','sensor_09','sensor_10','sensor_11','sensor_12','sensor_13','sensor_14','sensor_15','sensor_16','sensor_17','sensor_18','sensor_19','sensor_20','sensor_21','sensor_22','sensor_23','sensor_24','sensor_25','sensor_26','sensor_27','sensor_28','sensor_29','sensor_30','sensor_31','sensor_32','sensor_33','sensor_34','sensor_35','sensor_36','sensor_37','sensor_38','sensor_39','sensor_40','sensor_41','sensor_42','sensor_43','sensor_44','sensor_45','sensor_46','sensor_47','sensor_48','sensor_49','sensor_50','sensor_51',


## Random Forest

In [74]:

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

modelo = RandomForestClassifier(
                oob_score    = True,
                n_jobs       = -1,
                random_state = 123
             )

rfc_pipe= Pipeline([
            ('preprocesor',preprocessor),
            ('imputer',imputer),
            ('pca_pipe',pca_pipe)
        ])
train_rf = preprocessor.fit_transform(train)

rfc_random = RandomizedSearchCV(estimator = modelo, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=123, n_jobs = -1)
rfc_random.fit(train_rf[:,:-1],train_rf[:,-1])

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 26.6min finished


RandomizedSearchCV(cv=3,
                   estimator=RandomForestClassifier(n_jobs=-1, oob_score=True,
                                                    random_state=123),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=123, verbose=2)

In [5]:
# Creacion del modelo
modelo_prueba = RandomForestClassifier(
                oob_score    = True,
                n_jobs       = -1,
                random_state = 123
             )
# Pipe para aplicar pca
pca_pipe = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler',scaler),
            ('pca',pca)])
# Transformer para cada tipo de columna
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[('drop_col','drop',columns_to_drop),
                  ('pca',pca_pipe,numerical_col)
#                  ('encoding', OneHotEncoder(),target)
                 ])
# Pipeline para encadenar el procesado de datos y el modelo predictivo
pipe_prueba = Pipeline(steps=[
    ('preprocesor',preprocessor),
    ('modelo',modelo_prueba),
])

# # Entrenamiento del modelo predictivo
# pipe_prueba.fit(train.iloc[:,:-1],train.iloc[:,-1])


# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'modelo__n_estimators': n_estimators,
               'modelo__max_features': max_features,
               'modelo__max_depth': max_depth,
               'modelo__min_samples_split': min_samples_split,
               'modelo__min_samples_leaf': min_samples_leaf,
               'modelo__bootstrap': bootstrap}
rfc_random = RandomizedSearchCV(estimator = pipe_prueba, param_distributions = random_grid, n_iter = 20, cv = 3, verbose=2, random_state=123, n_jobs = -1)
rfc_random.fit(train.iloc[:,:-1],train.iloc[:,-1])


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 13.3min finished


NotFittedError: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [26]:
#Prediccion del conjunto de validación y comprobación de precision
rf_prediction = rfc_random.best_estimator_.predict(test.iloc[:,:-1])
# mat_confusion = confusion_matrix(
#                     y_true    = test.iloc[:,-1],
#                     y_pred    = prediction_prueba
#                 )

# accuracy = accuracy_score(
#             y_true    = test.iloc[:,-1],
#             y_pred    = prediction_prueba,
#             normalize = True
#            )

# print("Matriz de confusión")
# print("-------------------")
# print(mat_confusion)
# print("")
# print(f"El accuracy de test es: {100 * accuracy} %")

rf_mat_confusion = confusion_matrix(
                    y_true    = test.iloc[:,-1],
                    y_pred    = rf_prediction
                )

rf_accuracy = accuracy_score(
            y_true    = test.iloc[:,-1],
            y_pred    = rf_prediction,
            normalize = True
           )

print("Matriz de confusión")
print("-------------------")
print(rf_mat_confusion)
print("")
print(f"El accuracy de test es: {100 * rf_accuracy} %")

rf_TP, rf_FP, rf_FN, rf_TN = get_confusion_matrix_values(
    test.iloc[:,-1],
    rf_prediction)
print(f"True positive: {rf_TP}")
print(f"False positive: {rf_FP}")
print(f"True negative: {rf_TN}")
print(f"False negative: {rf_FN}")

from sklearn.metrics import recall_score
rf_tpr = recall_score(
    test.iloc[:,-1],
    rf_prediction,
    pos_label = 'BROKEN')   # it is better to name it y_test 
# to calculate, tnr we need to set the positive label to the other class
# I assume your negative class consists of 0, if it is -1, change 0 below to that value
rf_tnr = recall_score(
    test.iloc[:,-1],
    rf_prediction,
    pos_label = 'NORMAL') 
rf_fpr = 1 - rf_tnr
rf_fnr = 1 - rf_tpr

print(f"True positive rate: {rf_tpr}")
print(f"True negative rate: {rf_tnr}")
print(f"False positive rate: {rf_fpr}")
print(f"False negative rate: {rf_fnr}")

# rfc_random.best_estimator_.score

Matriz de confusión
-------------------
[[ 4258    99]
 [  371 83400]]

El accuracy de test es: 99.46668482207699 %
True positive: 4258
False positive: 99
True negative: 83400
False negative: 371
True positive rate: 0.9772779435391324
True negative rate: 0.9955712597438254
False positive rate: 0.004428740256174568
False negative rate: 0.022722056460867557


Pipeline(steps=[('preprocesor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('drop_col', 'drop',
                                                  ['sensor_15', 'Unnamed: 0',
                                                   'timestamp']),
                                                 ('pca',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler()),
                                                                  ('pca',
                                                                   PCA(n_components=8))]),
                                                  ['sensor_00', 'sensor_01',
                                                   'sensor_02', 'sens

## SVM

In [4]:
from sklearn.svm import SVC

# grid_svc = {'svc__C': np.logspace(-3, 2, 6),
#             'svc__gamma': np.logspace(-3, 2, 6)}
grid_svc = {
    'svc__C':            np.arange( 1, 100+1, 1 ).tolist(),
    'svc__kernel':       ['linear', 'rbf'],                   # precomputed,'poly', 'sigmoid'
    'svc__degree':       np.arange( 0, 100+0, 1 ).tolist(),
    'svc__gamma':        np.arange( 0.0, 10.0+0.0, 0.1 ).tolist(),
    'svc__coef0':        np.arange( 0.0, 10.0+0.0, 0.1 ).tolist(),
    }

scaler2 = StandardScaler()
pca2 = PCA(n_components=8)
# Pipe para aplicar pca
pca_pipe2 = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler',scaler2),
            ('pca',pca2)])
# Transformer para cada tipo de columna
preprocessor2 = ColumnTransformer(
    remainder='passthrough',
    transformers=[('drop_col','drop',columns_to_drop),
                  ('pca',pca_pipe2,numerical_col)
#                  ('encoding', OneHotEncoder(),target)
                 ])
svc = SVC()
pipe_svc = Pipeline(steps=[
    ('preprocessor', preprocessor2),
    ('svc',svc)
])

# Entrenamiento del modelo predictivo
svc_random = RandomizedSearchCV(estimator = pipe_svc, param_distributions = grid_svc, n_iter = 20, cv = 3, verbose=2, random_state=123, n_jobs = -1)
# pipe_svc.fit(train.iloc[:,:-1],train.iloc[:,-1])
svc_random.fit(train.iloc[:,:-1],train.iloc[:,-1])

#Prediccion del conjunto de validación y comprobación de precision
# prediction_svc = pipe_svc.predict(test.iloc[:,:-1])
prediction_svc = svc_random.best_estimator_.predict(test.iloc[:,:-1])
mat_confusion_svc = confusion_matrix(
                    y_true    = test.iloc[:,-1],
                    y_pred    = prediction_svc
                )

accuracy_svc = accuracy_score(
            y_true    = test.iloc[:,-1],
            y_pred    = prediction_svc,
            normalize = True
           )

print("Matriz de confusión")
print("-------------------")
print(mat_confusion_svc)
print("")
print(f"El accuracy de test es: {100 * accuracy_svc} %")

svc_TP, svc_FP, svc_FN, svc_TN = get_confusion_matrix_values(
    test.iloc[:,-1],
    prediction_svc)
print(f"True positive: {svc_TP}")
print(f"False positive: {svc_FP}")
print(f"True negative: {svc_TN}")
print(f"False negative: {svc_FN}")

from sklearn.metrics import recall_score
svc_tpr = recall_score(
    test.iloc[:,-1],
    prediction_svc,
    pos_label = 'BROKEN')   # it is better to name it y_test 
# to calculate, tnr we need to set the positive label to the other class
# I assume your negative class consists of 0, if it is -1, change 0 below to that value
svc_tnr = recall_score(
    test.iloc[:,-1],
    prediction_svc,
    pos_label = 'NORMAL') 
svc_fpr = 1 - svc_tnr
svc_fnr = 1 - svc_tpr

print(f"True positive rate: {svc_tpr}")
print(f"True negative rate: {svc_tnr}")
print(f"False positive rate: {svc_fpr}")
print(f"False negative rate: {svc_fnr}")

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 15.5min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 66.7min finished


Matriz de confusión
-------------------
[[ 4242   114]
 [  317 83454]]

El accuracy de test es: 99.51093308520657 %
True positive: 4242
False positive: 114
True negative: 83454
False negative: 317
True positive rate: 0.9738292011019284
True negative rate: 0.9962158742285516
False positive rate: 0.0037841257714483767
False negative rate: 0.026170798898071612


## Red Neuronal

In [3]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers.core import Dense
from keras.wrappers.scikit_learn import KerasClassifier
def create_model():
    ann = Sequential()
    ann.add(Dense(512, activation='relu'))
    ann.add(Dense(512, activation='relu'))
    ann.add(Dense(256, activation='relu'))
    ann.add(Dense(1,activation='sigmoid'))
    ann.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['binary_accuracy'])
    return ann

In [4]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers.core import Dense
from keras.wrappers.scikit_learn import KerasClassifier

train.loc[train.machine_status == 'NORMAL', 'machine_status'] = 0
train.loc[((train.machine_status == 'BROKEN') | (train.machine_status == 'RECOVERING')), 'machine_status'] = 1
test.loc[test.machine_status == 'NORMAL', 'machine_status'] = 0
test.loc[((test.machine_status == 'BROKEN') | (test.machine_status == 'RECOVERING')), 'machine_status'] = 1

scaler3 = StandardScaler()
pca3 = PCA(n_components=8)
# Pipe para aplicar pca
pca_pipe3 = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler',scaler3),
            ('pca',pca3)])
# Transformer para cada tipo de columna
preprocessor3 = ColumnTransformer(
    remainder='passthrough',
    transformers=[('drop_col','drop',columns_to_drop),
                  ('pca',pca_pipe3,numerical_col)
#                  ('encoding', OneHotEncoder(),target)
                 ])
X_train = preprocessor3.fit_transform(train.iloc[:,:-1])

ann = create_model()
early_stopping = keras.callbacks.EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)

history = ann.fit(
    X_train,np.asarray(train.iloc[:,-1]).astype('float32'),
#     validation_data=(X_test, np.array(y_test)),
    batch_size=512,
    epochs=1000,
    callbacks=[early_stopping],
    verbose=0, # hide the output because we have so many epochs
)
# ann.fit(X_train,np.array(y_train))
X_test = preprocessor3.transform(test.iloc[:,:-1])
ann_predictions = ann.predict(X_test)
ann_mat_confusion = confusion_matrix(
                    y_true    = list(test.iloc[:,-1]),
                    y_pred    = ann_predictions.round()
                )

ann_accuracy = accuracy_score(
            y_true    = list(test.iloc[:,-1]),
            y_pred    = ann_predictions.round(),
            normalize = True
           )

print("Matriz de confusión")
print("-------------------")
print(ann_mat_confusion)
print("")
print(f"El accuracy de test es: {100 * ann_accuracy} %")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s




KeyboardInterrupt: 

In [5]:
#This is the model you want. it is in sklearn format
scaler3 = StandardScaler()
pca3 = PCA(n_components=8)
# Pipe para aplicar pca
pca_pipe3 = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler',scaler3),
            ('pca',pca3)])
# Transformer para cada tipo de columna
preprocessor3 = ColumnTransformer(
    remainder='passthrough',
    transformers=[('drop_col','drop',columns_to_drop),
                  ('pca',pca_pipe3,numerical_col)
#                  ('encoding', OneHotEncoder(),target)
                 ])
early_stopping = keras.callbacks.EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)
ann = KerasClassifier(build_fn=create_model, verbose=0)
pipe_ann = Pipeline(steps=[
    ('preprocessor', preprocessor3),
    ('ann',ann)
])

pipe_ann.fit(train.iloc[:,:-1],train.iloc[:,-1], ann__callbacks=[early_stopping])





Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('drop_col', 'drop',
                                                  ['sensor_15', 'Unnamed: 0',
                                                   'timestamp']),
                                                 ('pca',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler()),
                                                                  ('pca',
                                                                   PCA(n_components=8))]),
                                                  ['sensor_00', 'sensor_01',
                                                   'sensor_02', 'sen

In [10]:


# X_test = preprocessor3.transform(test.iloc[:,:-1])
# ann_predictions = ann.predict(X_test)
# ann_predictions = pd.DataFrame(ann_predictions)[0].map({0:'NORMAL', 1:'BROKEN'})

ann_predictions = pipe_ann.predict(test.iloc[:,:-1])

ann_mat_confusion = confusion_matrix(
                    y_true    = test.iloc[:,-1],
                    y_pred    = ann_predictions
                )

ann_accuracy = accuracy_score(
            y_true    = test.iloc[:,-1],
            y_pred    = ann_predictions,
            normalize = True
           )

print("Matriz de confusión")
print("-------------------")
print(ann_mat_confusion)
print("")
print(f"El accuracy de test es: {100 * ann_accuracy} %")

ann_TP, ann_FP,ann_FN, ann_TN = get_confusion_matrix_values(
    test.iloc[:,-1],
    ann_predictions)
print(f"True positive: {ann_TP}")
print(f"False positive: {ann_FP}")
print(f"True negative: {ann_TN}")
print(f"False negative: {ann_FN}")

from sklearn.metrics import recall_score
ann_tpr = recall_score(
    test.iloc[:,-1],
    ann_predictions,
    pos_label = 'BROKEN')   # it is better to name it y_test 
# to calculate, tnr we need to set the positive label to the other class
# I assume your negative class consists of 0, if it is -1, change 0 below to that value
ann_tnr = recall_score(
    test.iloc[:,-1],
    ann_predictions,
    pos_label = 'NORMAL') 
ann_fpr = 1 - ann_tnr
ann_fnr = 1 - ann_tpr

print(f"True positive rate: {ann_tpr}")
print(f"True negative rate: {ann_tnr}")
print(f"False positive rate: {ann_fpr}")
print(f"False negative rate: {ann_fnr}")

Matriz de confusión
-------------------
[[ 4229   127]
 [  489 83282]]

El accuracy de test es: 99.30100877143214 %
True positive: 4229
False positive: 127
True negative: 83282
False negative: 489
True positive rate: 0.9708448117539027
True negative rate: 0.9941626577216459
False positive rate: 0.005837342278354085
False negative rate: 0.029155188246097308


In [1]:
#Resultados obtenidos por los modelos
results = pd.DataFrame(columns=['Method','Accuracy','TP','TN','FP','FN','TPR','TNR','FPR','FNR'])
results.iloc[-1] = ['Random Forest',rf_accuracy*100,rf_tp,rf_tn,rf_fp,rf_fn,rf_tpr,rf_tnr,rf_fpr,rf_fnr]
results.iloc[-1] = ['Support Vector Machine Classifier',svc_accuracy*100,svc_tp,svc_tn,svc_fp,svc_fn,svc_tpr,svc_tnr,svc_fpr,svc_fnr]
# results.iloc[-1] = ['ANN',ann_accuracy*100,ann_tp,ann_tn,ann_fp,ann_fn,ann_tpr,ann_tnr,ann_fpr,ann_fnr]

results

NameError: name 'pd' is not defined

In [59]:
# pd.DataFrame(ann_predictions)[0].map({0:'NORMAL', 1:'BROKEN'})
test.iloc[:,-1]
ann_predictions

0        BROKEN
1        BROKEN
2        BROKEN
3        BROKEN
4        BROKEN
          ...  
88123    NORMAL
88124    NORMAL
88125    NORMAL
88126    NORMAL
88127    NORMAL
Name: 0, Length: 88128, dtype: object

## KNN

In [6]:
from sklearn.neighbors import KNeighborsClassifier


scaler3 = StandardScaler()
pca3 = PCA(n_components=8)
# Pipe para aplicar pca
pca_pipe3 = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler',scaler3),
            ('pca',pca3)])
# Transformer para cada tipo de columna
preprocessor3 = ColumnTransformer(
    remainder='passthrough',
    transformers=[('drop_col','drop',columns_to_drop),
                  ('pca',pca_pipe3,numerical_col)
#                  ('encoding', OneHotEncoder(),target)
                 ])

knn = KNeighborsClassifier(
        n_jobs=-1)
pipe_knn = Pipeline(steps=[
    ('preprocessor', preprocessor3),
    ('knn',knn)
])
# Entrenamiento del modelo predictivo
pipe_knn.fit(train.iloc[:,:-1],train.iloc[:,-1])


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('drop_col', 'drop',
                                                  ['sensor_15', 'Unnamed: 0',
                                                   'timestamp']),
                                                 ('pca',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler()),
                                                                  ('pca',
                                                                   PCA(n_components=8))]),
                                                  ['sensor_00', 'sensor_01',
                                                   'sensor_02', 'sen

## Exportar modelos con joblib

In [11]:
# Exportamos el modelo random forest
joblib.dump(rfc_random.best_estimator_,'../models/randomForest.pkl',compress=1)
# Exportamos el modelo svm
joblib.dump(pipe_svc,'../models/SVC.pkl',compress=1)

['../models/SVC.pkl']

In [7]:
# Exportamos el modelo knn
joblib.dump(pipe_knn,'../models/knn.pkl',compress=1)

['../models/knn.pkl']

In [5]:
joblib.dump(svc_random.best_estimator_,'../models/SVC_cv_grid.pkl',compress=1)

['../models/SVC_cv_grid.pkl']

In [7]:
from keras.models import load_model

# Save the Keras model first:
pipe_ann.named_steps['ann'].model.save('../models/keras_model.h5')
joblib.dump(pipe_ann.named_steps['ann'].classes_, '../models/ann_classes.pkl')

# This hack allows us to save the sklearn pipeline:
pipe_ann.named_steps['ann'] = None

# Finally, save the pipeline:
joblib.dump(pipe_ann, '../models/ann_pipeline.pkl')


TypeError: cannot pickle '_thread.RLock' object

In [11]:
pipe_ann.named_steps['ann'] = None
print(pipe_ann.named_steps)

KeyError: 1