In [41]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import numpy as np
import pandas as pd

In [42]:
csv = "Dataset_tratado444.csv"

datos = pd.read_csv(csv, parse_dates=["timestamp"])
datos = datos.sort_values(["timestamp"]).reset_index(drop=True)
mapa_trafico = {
    "Mala": 0,
    "Buena": 1,
    "Regular": 2
}

# Reemplazar los valores de la columna CaidadSenal por su código numérico
if "CalidadSenal" in datos.columns:
    datos["CalidadSenal"] = datos["CalidadSenal"].map(mapa_trafico).fillna(6).astype(int)
datos

Unnamed: 0,timestamp,ifInOctets_sum,ifInOctets_mean,ifInOctets_max,ifInOctets_std,ifOutOctets_sum,ifOutOctets_mean,ifOutOctets_max,ifOutOctets_std,ifInErrors_sum,...,BW_in_Mbps_max,BW_in_Mbps_std,BW_out_Mbps_mean,BW_out_Mbps_max,BW_out_Mbps_std,TasaErrorIn_mean,TasaErrorIn_max,TasaErrorOut_mean,TasaErrorOut_max,CalidadSenal_dominante
0,2023-08-23 02:10:00,30754899,6150979.8,9931946,3.110673e+06,8192922,1638584.4,2500175,6.356160e+05,23,...,0.26,0.081976,0.044,0.07,0.018166,0.00008,0.0002,0.00012,0.0002,Mala
1,2023-08-23 02:20:00,71351485,7135148.5,9817208,2.211750e+06,36199529,3619952.9,4532418,1.057430e+06,42,...,0.26,0.058271,0.097,0.12,0.030203,0.00008,0.0002,0.00008,0.0002,Regular
2,2023-08-23 02:30:00,64247615,6424761.5,9529870,2.740136e+06,27173516,2717351.6,4367239,1.152618e+06,80,...,0.25,0.073250,0.071,0.12,0.032813,0.00014,0.0004,0.00014,0.0004,Regular
3,2023-08-23 02:40:00,48401514,4840151.4,8404852,2.242085e+06,27088230,2708823.0,4432623,1.051585e+06,127,...,0.22,0.060000,0.072,0.12,0.028206,0.00037,0.0013,0.00019,0.0004,Regular
4,2023-08-23 02:50:00,53376350,5337635.0,9242680,2.391761e+06,28003248,2800324.8,4208201,1.304548e+06,112,...,0.25,0.064429,0.073,0.11,0.035606,0.00035,0.0018,0.00023,0.0009,Regular
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,2023-08-26 12:50:00,67187404,6718740.4,9973326,2.266365e+06,28486632,2848663.2,4904309,1.412130e+06,78,...,0.27,0.061065,0.076,0.13,0.037771,0.00014,0.0003,0.00025,0.0006,Regular
497,2023-08-26 13:00:00,46702587,4670258.7,8568538,2.827584e+06,26965041,2696504.1,4740843,1.509222e+06,119,...,0.23,0.075902,0.074,0.13,0.041150,0.00038,0.0008,0.00033,0.0010,Regular
498,2023-08-26 13:10:00,58606840,5860684.0,8587973,2.234014e+06,32661895,3266189.5,4234341,9.683134e+05,110,...,0.23,0.060406,0.088,0.11,0.024855,0.00027,0.0011,0.00011,0.0004,Mala
499,2023-08-26 13:20:00,51516486,5151648.6,9774821,2.899329e+06,31397520,3139752.0,4861909,1.543966e+06,114,...,0.26,0.078003,0.083,0.13,0.040565,0.00039,0.0013,0.00020,0.0005,Regular


In [43]:
X = datos.drop(columns=["timestamp", "CalidadSenal_dominante"])

y = datos["CalidadSenal_dominante"]

X.shape, y.shape

((501, 38), (501,))

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4,stratify=y)

In [45]:
indices = np.arange(len(y_train))
ind_train, ind_val = train_test_split(indices,test_size=0.33,stratify=y_train)
cv = zip([ind_train],[ind_val])
params = {
    'criterion': ('gini', 'entropy', 'log_loss'),
    'max_features': ('sqrt', 'log2', None),
    'n_estimators': np.arange(10,101,10)
}
rf = RandomForestClassifier()
clf = GridSearchCV(rf, params, cv=cv,n_jobs=-1)
clf.fit(X_train, y_train)

0,1,2
,estimator,RandomForestClassifier()
,param_grid,"{'criterion': ('gini', ...), 'max_features': ('sqrt', ...), 'n_estimators': array([ 10, ...80, 90, 100])}"
,scoring,
,n_jobs,-1
,refit,True
,cv,<zip object a...0012AC0D69A00>
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,np.int64(50)
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [46]:
print(f'Los mejores hyperparámetros son: {clf.best_params_}, con exactitud de {clf.best_score_}')

Los mejores hyperparámetros son: {'criterion': 'gini', 'max_features': None, 'n_estimators': np.int64(50)}, con exactitud de 0.8686868686868687


In [47]:
rf_final = RandomForestClassifier(criterion=clf.best_params_['criterion'],
                                  max_features=clf.best_params_['max_features'],
                                  n_estimators=clf.best_params_['n_estimators'])
rf_final.fit(X_train, y_train)

0,1,2
,n_estimators,np.int64(50)
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [48]:

# Predicciones
y_pred = rf_final.predict(X_test)
y_proba = rf_final.predict_proba(X_test)  # matriz [n_muestras, n_clases]

# Métricas
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Recall (macro):", recall_score(y_test, y_pred, average="macro"))
print("Recall (weighted):", recall_score(y_test, y_pred, average="weighted"))
print("F1 (macro):", f1_score(y_test, y_pred, average="macro"))
print("F1 (weighted):", f1_score(y_test, y_pred, average="weighted"))
print("ROC-AUC (ovr):", roc_auc_score(y_test, y_proba, multi_class="ovr", average="macro"))
print("Matriz de confusión:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8109452736318408
Recall (macro): 0.5125220458553792
Recall (weighted): 0.8109452736318408
F1 (macro): 0.5181286549707602
F1 (weighted): 0.7989002356637863
ROC-AUC (ovr): 0.813982358668349
Matriz de confusión:
 [[  0   1   2]
 [  0  39  24]
 [  0  11 124]]


In [49]:
X_test

Unnamed: 0,ifInOctets_sum,ifInOctets_mean,ifInOctets_max,ifInOctets_std,ifOutOctets_sum,ifOutOctets_mean,ifOutOctets_max,ifOutOctets_std,ifInErrors_sum,ifInErrors_mean,...,BW_in_Mbps_mean,BW_in_Mbps_max,BW_in_Mbps_std,BW_out_Mbps_mean,BW_out_Mbps_max,BW_out_Mbps_std,TasaErrorIn_mean,TasaErrorIn_max,TasaErrorOut_mean,TasaErrorOut_max
19,61147673,6114767.3,9658144,2.428206e+06,23112948,2311294.8,4225530,1.242635e+06,121,12.1,...,0.163,0.26,0.064987,0.060,0.11,0.033333,0.00025,0.0008,0.00014,0.0004
303,57783007,5778300.7,8990001,2.621877e+06,20472343,2047234.3,4226904,1.138565e+06,90,9.0,...,0.155,0.24,0.069482,0.054,0.11,0.028363,0.00020,0.0005,0.00030,0.0016
342,53491370,5349137.0,9074444,2.721486e+06,24543393,2454339.3,4371907,1.121314e+06,58,5.8,...,0.144,0.24,0.071678,0.065,0.12,0.031358,0.00013,0.0004,0.00023,0.0009
414,45528813,4552881.3,8801402,2.926358e+06,25331399,2533139.9,4264610,1.150633e+06,88,8.8,...,0.120,0.23,0.078599,0.067,0.11,0.029833,0.00024,0.0005,0.00029,0.0007
479,55477701,5547770.1,9594859,2.794443e+06,30432770,3043277.0,4963721,1.432309e+06,100,10.0,...,0.148,0.26,0.075100,0.080,0.13,0.038006,0.00022,0.0006,0.00013,0.0003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
268,58765781,5876578.1,9950668,2.626177e+06,30022577,3002257.7,4944623,1.407935e+06,93,9.3,...,0.159,0.27,0.070151,0.081,0.13,0.037550,0.00022,0.0008,0.00026,0.0006
386,46442784,4644278.4,9713542,2.830465e+06,26838867,2683886.7,4869246,1.381086e+06,102,10.2,...,0.124,0.26,0.076333,0.071,0.13,0.036652,0.00038,0.0013,0.00029,0.0010
170,58819723,5881972.3,9910606,2.675140e+06,22591256,2259125.6,4783335,1.502076e+06,95,9.5,...,0.157,0.26,0.070719,0.060,0.13,0.040000,0.00023,0.0006,0.00037,0.0010
441,53753465,5375346.5,8110953,2.265173e+06,24186568,2418656.8,4978633,1.411728e+06,63,6.3,...,0.144,0.22,0.062218,0.065,0.13,0.037786,0.00016,0.0007,0.00023,0.0008


In [50]:
#Segunda forma con Cross validator
rfCV = RandomForestClassifier()
clfCV = GridSearchCV(rfCV, params, n_jobs=-1)
clfCV.fit(X_train, y_train)



0,1,2
,estimator,RandomForestClassifier()
,param_grid,"{'criterion': ('gini', ...), 'max_features': ('sqrt', ...), 'n_estimators': array([ 10, ...80, 90, 100])}"
,scoring,
,n_jobs,-1
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,np.int64(50)
,criterion,'entropy'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [51]:
print(f'Los mejores hyperparámetros son: {clfCV.best_params_}, con exactitud de {clfCV.best_score_}')

Los mejores hyperparámetros son: {'criterion': 'entropy', 'max_features': None, 'n_estimators': np.int64(50)}, con exactitud de 0.8300000000000001


In [52]:
rf_final_cv = RandomForestClassifier(criterion=clfCV.best_params_['criterion'],
                                  max_features=clfCV.best_params_['max_features'],
                                  n_estimators=clfCV.best_params_['n_estimators'])
rf_final_cv.fit(X_train, y_train)

0,1,2
,n_estimators,np.int64(50)
,criterion,'entropy'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [53]:

# Predicciones
y_pred = rf_final.predict(X_test)
y_proba = rf_final.predict_proba(X_test)  # matriz [n_muestras, n_clases]

# Métricas
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Recall (macro):", recall_score(y_test, y_pred, average="macro"))
print("Recall (weighted):", recall_score(y_test, y_pred, average="weighted"))
print("F1 (macro):", f1_score(y_test, y_pred, average="macro"))
print("F1 (weighted):", f1_score(y_test, y_pred, average="weighted"))
print("ROC-AUC (ovr):", roc_auc_score(y_test, y_proba, multi_class="ovr", average="macro"))
print("Matriz de confusión:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8109452736318408
Recall (macro): 0.5125220458553792
Recall (weighted): 0.8109452736318408
F1 (macro): 0.5181286549707602
F1 (weighted): 0.7989002356637863
ROC-AUC (ovr): 0.813982358668349
Matriz de confusión:
 [[  0   1   2]
 [  0  39  24]
 [  0  11 124]]


In [54]:
X

Unnamed: 0,ifInOctets_sum,ifInOctets_mean,ifInOctets_max,ifInOctets_std,ifOutOctets_sum,ifOutOctets_mean,ifOutOctets_max,ifOutOctets_std,ifInErrors_sum,ifInErrors_mean,...,BW_in_Mbps_mean,BW_in_Mbps_max,BW_in_Mbps_std,BW_out_Mbps_mean,BW_out_Mbps_max,BW_out_Mbps_std,TasaErrorIn_mean,TasaErrorIn_max,TasaErrorOut_mean,TasaErrorOut_max
0,30754899,6150979.8,9931946,3.110673e+06,8192922,1638584.4,2500175,6.356160e+05,23,4.6,...,0.162,0.26,0.081976,0.044,0.07,0.018166,0.00008,0.0002,0.00012,0.0002
1,71351485,7135148.5,9817208,2.211750e+06,36199529,3619952.9,4532418,1.057430e+06,42,4.2,...,0.192,0.26,0.058271,0.097,0.12,0.030203,0.00008,0.0002,0.00008,0.0002
2,64247615,6424761.5,9529870,2.740136e+06,27173516,2717351.6,4367239,1.152618e+06,80,8.0,...,0.171,0.25,0.073250,0.071,0.12,0.032813,0.00014,0.0004,0.00014,0.0004
3,48401514,4840151.4,8404852,2.242085e+06,27088230,2708823.0,4432623,1.051585e+06,127,12.7,...,0.130,0.22,0.060000,0.072,0.12,0.028206,0.00037,0.0013,0.00019,0.0004
4,53376350,5337635.0,9242680,2.391761e+06,28003248,2800324.8,4208201,1.304548e+06,112,11.2,...,0.142,0.25,0.064429,0.073,0.11,0.035606,0.00035,0.0018,0.00023,0.0009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,67187404,6718740.4,9973326,2.266365e+06,28486632,2848663.2,4904309,1.412130e+06,78,7.8,...,0.178,0.27,0.061065,0.076,0.13,0.037771,0.00014,0.0003,0.00025,0.0006
497,46702587,4670258.7,8568538,2.827584e+06,26965041,2696504.1,4740843,1.509222e+06,119,11.9,...,0.125,0.23,0.075902,0.074,0.13,0.041150,0.00038,0.0008,0.00033,0.0010
498,58606840,5860684.0,8587973,2.234014e+06,32661895,3266189.5,4234341,9.683134e+05,110,11.0,...,0.156,0.23,0.060406,0.088,0.11,0.024855,0.00027,0.0011,0.00011,0.0004
499,51516486,5151648.6,9774821,2.899329e+06,31397520,3139752.0,4861909,1.543966e+06,114,11.4,...,0.138,0.26,0.078003,0.083,0.13,0.040565,0.00039,0.0013,0.00020,0.0005


In [55]:
y

0         Mala
1      Regular
2      Regular
3      Regular
4      Regular
        ...   
496    Regular
497    Regular
498       Mala
499    Regular
500       Mala
Name: CalidadSenal_dominante, Length: 501, dtype: object