In [26]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import numpy as np
import pandas as pd
from sklearn.utils import resample


In [27]:
archivo = "Datos\\FinaldatasetExtraidoConYSenci.csv"
#archivo = "Datos\\dataset_con_Y.csv"

tabla = pd.read_csv(archivo, parse_dates=["timestamp"], sep=",")
tabla = tabla.dropna()
tabla = tabla.sort_values(["timestamp"]).reset_index(drop=True)


In [28]:
#columnas_excluir = ["timestamp", "host", "Y", "reinicio", "falloBruto", "falloPersistente"]
columnas_excluir = ["timestamp", "host", "Y"]


# Separar clases
clase0 = tabla[tabla["Y"] == 0]
clase1 = tabla[tabla["Y"] == 1]

# Submuestreo de clase 0
clase0_eq = resample(clase0,
                     replace=False,    # no reemplazar
                     n_samples=len(clase1), # mismo tamaño que clase 1
                     random_state=42)

# Dataset balanceado
tabla_eq = pd.concat([clase0_eq, clase1])

#las X serán todas las columnas menos la primera ni última
X = tabla_eq.drop(columns=columnas_excluir)

y = tabla_eq["Y"]

X.shape, y.shape

((230, 23), (230,))

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,stratify=y)

In [30]:
indices = np.arange(len(y_train))
ind_train, ind_val = train_test_split(indices,test_size=0.33,stratify=y_train)
cv = zip([ind_train],[ind_val])
params = {
    'criterion': ('gini', 'entropy', 'log_loss'),
    'max_features': ('sqrt', 'log2', None),
    'n_estimators': np.arange(10,101,10)
}
rf = RandomForestClassifier()
clf = GridSearchCV(rf, params, cv=cv,n_jobs=-1)
clf.fit(X_train, y_train)

0,1,2
,estimator,RandomForestClassifier()
,param_grid,"{'criterion': ('gini', ...), 'max_features': ('sqrt', ...), 'n_estimators': array([ 10, ...80, 90, 100])}"
,scoring,
,n_jobs,-1
,refit,True
,cv,<zip object a...0020D9B6C4940>
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,np.int64(40)
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [31]:
print(f'Los mejores hyperparámetros son: {clf.best_params_}, con exactitud de {clf.best_score_}')

Los mejores hyperparámetros son: {'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': np.int64(40)}, con exactitud de 1.0


In [32]:
rf_final = RandomForestClassifier(criterion=clf.best_params_['criterion'],
                                  max_features=clf.best_params_['max_features'],
                                  n_estimators=clf.best_params_['n_estimators'])
rf_final.fit(X_train, y_train)

0,1,2
,n_estimators,np.int64(40)
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [33]:
#Métricas para evaluar el modelo
# Predicciones
y_pred = rf_final.predict(X_test)
y_proba = rf_final.predict_proba(X_test)[:,1]  # para ROC-AUC

# Métricas
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print("Matriz de confusión:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9855072463768116
Precision: 0.9714285714285714
Recall: 1.0
F1: 0.9855072463768116
ROC-AUC: 0.9970588235294118
Matriz de confusión:
 [[34  1]
 [ 0 34]]


In [34]:
#Segunda forma con Cross validator
rfCV = RandomForestClassifier()
clfCV = GridSearchCV(rfCV, params, n_jobs=-1)
clfCV.fit(X_train, y_train)

0,1,2
,estimator,RandomForestClassifier()
,param_grid,"{'criterion': ('gini', ...), 'max_features': ('sqrt', ...), 'n_estimators': array([ 10, ...80, 90, 100])}"
,scoring,
,n_jobs,-1
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,np.int64(50)
,criterion,'entropy'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [35]:
print(f'Los mejores hyperparámetros son: {clfCV.best_params_}, con exactitud de {clfCV.best_score_}')

Los mejores hyperparámetros son: {'criterion': 'entropy', 'max_features': None, 'n_estimators': np.int64(50)}, con exactitud de 0.99375


In [36]:
rf_final_cv = RandomForestClassifier(criterion=clfCV.best_params_['criterion'],
                                  max_features=clfCV.best_params_['max_features'],
                                  n_estimators=clfCV.best_params_['n_estimators'])
rf_final_cv.fit(X_train, y_train)

0,1,2
,n_estimators,np.int64(50)
,criterion,'entropy'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [37]:
#Métricas para evaluar el modelo
# Predicciones
y_cv_pred = rf_final_cv.predict(X_test)
y_cv_proba = rf_final_cv.predict_proba(X_test)[:,1]  # para ROC-AUC

# Métricas
print("Accuracy:", accuracy_score(y_test, y_cv_pred))
print("Precision:", precision_score(y_test, y_cv_pred))
print("Recall:", recall_score(y_test, y_cv_pred))
print("F1:", f1_score(y_test, y_cv_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_cv_pred))
print("Matriz de confusión:\n", confusion_matrix(y_test, y_cv_pred))


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1: 1.0
ROC-AUC: 1.0
Matriz de confusión:
 [[35  0]
 [ 0 34]]


In [38]:
X

Unnamed: 0,icmp_response_time_mean,icmp_response_time_std,icmp_response_time_min,icmp_response_time_max,jitter_diff_mean,jitter_diff_max,paquetes_unicast_enviados_eth_mean,paquetes_unicast_enviados_eth_std,paquetes_unicast_enviados_eth_min,paquetes_unicast_enviados_eth_max,...,paquetes_unicast_recibidos_eth_max,trafico_lan_recibido_mean,trafico_lan_recibido_std,trafico_lan_recibido_min,trafico_lan_recibido_max,trafico_lan_transmitido_mean,trafico_lan_transmitido_std,trafico_lan_transmitido_min,trafico_lan_transmitido_max,uptime_(network)_max
2777,0.521667,0.470947,0.316000,2.747333,0.832937,2.380667,6.313714e+06,11646.191036,6294201.0,6335198.0,...,25584121.0,66.242098,33.435945,37.609280,179.302584,0.243863,0.034213,0.156800,0.315584,38439.0
2168,0.485289,0.430146,0.344000,2.997000,0.054933,0.126000,4.503393e+07,5818.243641,45023821.0,45043751.0,...,170616774.0,1.998883,0.179195,1.847776,2.305304,0.015259,0.001596,0.012688,0.017008,918874.0
1043,0.492783,0.312712,0.321667,2.291667,0.128000,0.159000,5.231120e+07,5764.088908,52301205.0,52321352.0,...,192682393.0,3.723704,0.447921,3.406976,4.040432,0.019912,0.001278,0.019008,0.020816,855178.0
2550,0.569489,0.699918,0.300333,4.861333,0.375987,4.498000,4.341545e+06,271655.690723,3976215.0,4572646.0,...,11399056.0,287.118936,73.492481,235.151904,339.085968,0.566256,0.001442,0.566256,0.566256,13240.0
1896,0.426772,0.150173,0.328333,1.506333,0.033077,0.076000,4.668099e+07,6721.035880,46669748.0,46692426.0,...,186353586.0,2.403505,0.281923,1.743032,2.939992,0.016614,0.002091,0.010840,0.019672,952402.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4187,29.837447,86.344121,0.000000,581.666667,20.060607,124.672333,2.423126e+07,339031.383864,23606624.0,24797304.0,...,84206971.0,248.543245,151.871989,60.722304,667.664888,0.314194,0.134901,0.116240,0.686184,204039.0
4246,78.070744,153.325463,0.000000,747.333333,67.580087,694.290333,2.545492e+07,381715.515688,24817988.0,26085443.0,...,87372923.0,262.012505,128.330973,79.313200,444.741480,0.356168,0.132360,0.150104,0.571312,207640.0
4252,54.385161,143.952205,0.560000,758.333333,69.924346,734.640000,2.660249e+07,289724.457146,26105143.0,26987250.0,...,89537692.0,167.509556,132.031998,38.709152,450.251448,0.259982,0.105726,0.130760,0.493448,211240.0
4300,1.471728,3.826778,0.546333,25.267667,1.297371,24.685333,2.701305e+07,16467.891349,26987977.0,27045621.0,...,90607273.0,31.386817,25.203160,19.973872,186.959120,0.140225,0.026425,0.106184,0.242480,214840.0


In [39]:
y

2777    0
2168    0
1043    0
2550    0
1896    0
       ..
4187    1
4246    1
4252    1
4300    1
4314    1
Name: Y, Length: 230, dtype: int64