In [1]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import numpy as np
import pandas as pd

In [10]:
archivo = "Datos\\FinaldatasetExtraidoConYSenci.csv"
#archivo = "Datos\\dataset_con_Y.csv"

tabla = pd.read_csv(archivo, parse_dates=["timestamp"], sep=",")
tabla = tabla.dropna()
tabla = tabla.sort_values(["timestamp"]).reset_index(drop=True)


In [11]:
#columnas_excluir = ["timestamp", "host", "Y", "reinicio", "falloBruto", "falloPersistente"]
columnas_excluir = ["timestamp", "Y"]

#las X serán todas las columnas menos la primera ni última
X = tabla.drop(columns=columnas_excluir)

y = tabla["Y"]

X.shape, y.shape

((3853, 24), (3853,))

In [13]:
tabla['Y'].value_counts()

Y
0    3754
1      99
Name: count, dtype: int64

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,stratify=y)

In [6]:
indices = np.arange(len(y_train))
ind_train, ind_val = train_test_split(indices,test_size=0.33,stratify=y_train)
cv = zip([ind_train],[ind_val])
params = {
    'criterion': ('gini', 'entropy', 'log_loss'),
    'max_features': ('sqrt', 'log2', None),
    'n_estimators': np.arange(10,101,10)
}
rf = RandomForestClassifier()
clf = GridSearchCV(rf, params, cv=cv,n_jobs=-1)
clf.fit(X_train, y_train)

0,1,2
,estimator,RandomForestClassifier()
,param_grid,"{'criterion': ('gini', ...), 'max_features': ('sqrt', ...), 'n_estimators': array([ 10, ...80, 90, 100])}"
,scoring,
,n_jobs,-1
,refit,True
,cv,<zip object a...0025ECFC8B880>
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,np.int64(10)
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [7]:
print(f'Los mejores hyperparámetros son: {clf.best_params_}, con exactitud de {clf.best_score_}')

Los mejores hyperparámetros son: {'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': np.int64(10)}, con exactitud de 1.0


In [8]:
rf_final = RandomForestClassifier(criterion=clf.best_params_['criterion'],
                                  max_features=clf.best_params_['max_features'],
                                  n_estimators=clf.best_params_['n_estimators'])
rf_final.fit(X_train, y_train)

0,1,2
,n_estimators,np.int64(10)
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [9]:
#Métricas para evaluar el modelo
# Predicciones
y_pred = rf_final.predict(X_test)
y_proba = rf_final.predict_proba(X_test)[:,1]  # para ROC-AUC

# Métricas
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print("Matriz de confusión:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9992517770295548
Precision: 1.0
Recall: 0.9818181818181818
F1: 0.9908256880733946
ROC-AUC: 1.0
Matriz de confusión:
 [[2563    0]
 [   2  108]]


In [10]:
#Segunda forma con Cross validator
rfCV = RandomForestClassifier()
clfCV = GridSearchCV(rfCV, params, n_jobs=-1)
clfCV.fit(X_train, y_train)

0,1,2
,estimator,RandomForestClassifier()
,param_grid,"{'criterion': ('gini', ...), 'max_features': ('sqrt', ...), 'n_estimators': array([ 10, ...80, 90, 100])}"
,scoring,
,n_jobs,-1
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,np.int64(100)
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [11]:
print(f'Los mejores hyperparámetros son: {clfCV.best_params_}, con exactitud de {clfCV.best_score_}')

Los mejores hyperparámetros son: {'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': np.int64(100)}, con exactitud de 1.0


In [12]:
rf_final_cv = RandomForestClassifier(criterion=clfCV.best_params_['criterion'],
                                  max_features=clfCV.best_params_['max_features'],
                                  n_estimators=clfCV.best_params_['n_estimators'])
rf_final_cv.fit(X_train, y_train)

0,1,2
,n_estimators,np.int64(100)
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:
#Métricas para evaluar el modelo
# Predicciones
y_cv_pred = rf_final_cv.predict(X_test)
y_cv_proba = rf_final_cv.predict_proba(X_test)[:,1]  # para ROC-AUC

# Métricas
print("Accuracy:", accuracy_score(y_test, y_cv_pred))
print("Precision:", precision_score(y_test, y_cv_pred))
print("Recall:", recall_score(y_test, y_cv_pred))
print("F1:", f1_score(y_test, y_cv_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_cv_pred))
print("Matriz de confusión:\n", confusion_matrix(y_test, y_cv_pred))


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1: 1.0
ROC-AUC: 1.0
Matriz de confusión:
 [[2563    0]
 [   0  110]]


In [14]:
X

Unnamed: 0,icmp_ping_mean,icmp_ping_std,icmp_ping_min,icmp_ping_max,icmp_response_time_mean,icmp_response_time_std,icmp_response_time_min,icmp_response_time_max,jitter_diff_mean,jitter_diff_max,...,latenciaStdMs,jitterClasicoMeanMs,jitterClasicoMaxMs,tasaErrorEntrada,tasaErrorSalida,traficoTotalMbps,uptimePrev,reinicio,falloBruto,falloPersistente
0,1.0,0.0,1.0,1.0,0.000369,0.000018,0.000347,0.000395,0.000016,0.000048,...,0.018026,0.016167,0.048333,0.0,0.0,2.741368,547818.0,False,False,False
1,1.0,0.0,1.0,1.0,0.000708,0.000738,0.000369,0.002028,0.000669,0.001660,...,0.738322,0.669000,1.659667,0.0,0.0,3.006989,729477.0,False,True,False
2,1.0,0.0,1.0,1.0,0.000392,0.000035,0.000358,0.000450,0.000007,0.000013,...,0.034958,0.007111,0.012667,0.0,0.0,3.062424,729500.0,False,False,False
3,1.0,0.0,1.0,1.0,0.000363,0.000024,0.000321,0.000379,0.000015,0.000045,...,0.023553,0.015133,0.045333,0.0,0.0,3.275843,729475.0,False,False,False
4,1.0,0.0,1.0,1.0,0.000398,0.000047,0.000324,0.000440,0.000039,0.000077,...,0.046618,0.038667,0.077333,0.0,0.0,3.065741,663582.0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8905,1.0,0.0,1.0,1.0,0.000396,0.000031,0.000365,0.000443,0.000037,0.000078,...,0.031174,0.037267,0.077667,0.0,0.0,2.110067,843881.0,False,False,False
8906,1.0,0.0,1.0,1.0,0.000379,0.000030,0.000348,0.000421,0.000027,0.000049,...,0.030369,0.027400,0.049000,0.0,0.0,1.865090,843877.0,False,False,False
8907,1.0,0.0,1.0,1.0,0.000414,0.000030,0.000378,0.000446,0.000036,0.000048,...,0.029950,0.035889,0.048000,0.0,0.0,2.010722,909802.0,False,False,False
8908,1.0,0.0,1.0,1.0,0.001231,0.001163,0.000357,0.002508,0.001704,0.002151,...,1.162536,1.703867,2.151333,0.0,0.0,2.127885,843878.0,False,True,False


In [15]:
y

0       0
1       0
2       0
3       0
4       0
       ..
8905    0
8906    0
8907    0
8908    0
8909    1
Name: Y, Length: 8910, dtype: int64