<a href="https://colab.research.google.com/github/GaboLara998/ProyectoIntegrador/blob/main/ProyectoIntregrador.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Carga de Datos

In [None]:
# Dataset loading: data handling, plotting and preprocessing imports
import pandas as pd
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
# MinMax normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
# Create the normalizer
# NOTE(review): the helper functions below build their own MinMaxScaler/LabelEncoder
# instances, so these two module-level objects appear to be unused - confirm before removing.
scaler = MinMaxScaler()
# Create the encoder
encoder = LabelEncoder()

In [None]:
#Funcion de cargar datasets
def cargar_datos(url):
    """Load a CSV dataset into a DataFrame.

    The source is decoded as Latin-1 and whitespace that follows a delimiter
    is skipped. Decoding a UTF-8 file as Latin-1 turns its byte-order mark
    into the three characters ``ï»¿`` glued to the first header, so that
    prefix (or a literal U+FEFF) is removed from the first column name.

    Args:
        url: Path, URL or file-like object accepted by ``pandas.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame`` with a clean first column name.
    """
    datos = pd.read_csv(url, encoding='latin1', skipinitialspace=True)

    if len(datos.columns) > 0:
        primera_columna = datos.columns[0]
        if isinstance(primera_columna, str):
            # '\xef\xbb\xbf' is the UTF-8 BOM as seen through a Latin-1 decoder.
            for marca_bom in ('\xef\xbb\xbf', '\ufeff'):
                if primera_columna.startswith(marca_bom):
                    nombre_limpio = primera_columna[len(marca_bom):]
                    datos = datos.rename(columns={primera_columna: nombre_limpio})
                    break
    return datos

# Función para codificar variables categóricas
def codificar_variables_categoricas(datos, columnas_categoricas):
    """Replace each listed column with integer label codes, in place.

    One ``LabelEncoder`` instance is refitted on every column in turn, so
    the codes of a column depend only on the values present in that column
    of ``datos``.

    Args:
        datos: DataFrame whose columns are overwritten.
        columnas_categoricas: Iterable of column names to encode.

    Returns:
        The same ``datos`` object, after modification.
    """
    codificador = LabelEncoder()
    for nombre_columna in columnas_categoricas:
        valores_originales = datos[nombre_columna]
        datos[nombre_columna] = codificador.fit_transform(valores_originales)
    return datos

# Función para normalizar todas las características numéricas de un DataFrame
def normalizar_caracteristicas(datos):
    """Min-max scale every ``float64``/``int64`` column of a DataFrame, in place.

    The scaler is fitted on ``datos`` itself, so each selected column is
    mapped using its own minimum and maximum in this frame.

    Args:
        datos: DataFrame to scale; its numeric columns are overwritten.

    Returns:
        The same ``datos`` object. It is returned untouched when it has no
        rows or no ``float64``/``int64`` columns, because there is nothing
        to fit a scaler on.
    """
    columnas_numericas = datos.select_dtypes(include=['float64', 'int64']).columns

    # MinMaxScaler rejects inputs with zero features or zero samples,
    # so bail out before fitting instead of raising.
    if columnas_numericas.empty or datos.empty:
        return datos

    scaler = MinMaxScaler()
    datos[columnas_numericas] = scaler.fit_transform(datos[columnas_numericas])
    return datos

In [None]:
# Dataset URLs
# NOTE(review): both URLs carry a `token=` query parameter. Access tokens committed
# to source are a credential leak and typically expire; confirm whether the repository
# can be read without them or move the tokens out of the notebook.
url_training = "https://raw.githubusercontent.com/GaboLara998/ProyectoIntegrador/main/DataSet_UNSW_NB15/UNSW_NB15_training-set.csv?token=GHSAT0AAAAAACMBWAOFBU5CFZ67HIDC2XLOZO7JZMA"
url_testing = "https://raw.githubusercontent.com/GaboLara998/ProyectoIntegrador/main/DataSet_UNSW_NB15/UNSW_NB15_testing-set.csv?token=GHSAT0AAAAAACMBWAOFEFKXDJN42NYN2ZDAZO7JXNA"

# Load the data
datos_training = cargar_datos(url_training)
datos_testing = cargar_datos(url_testing)

# Categorical columns
columnas_categoricas = ['proto', 'service', 'state', 'attack_cat']

# Encode categorical variables with ONE vocabulary per column shared by both sets.
# Fitting a separate encoder on each set would give the same category different
# integer codes in training and testing. The vocabulary is the union of both sets
# so that a category present only in the testing set can still be transformed.
for columna in columnas_categoricas:
    codificador = LabelEncoder()
    codificador.fit(pd.concat([datos_training[columna], datos_testing[columna]], ignore_index=True))
    datos_training[columna] = codificador.transform(datos_training[columna])
    datos_testing[columna] = codificador.transform(datos_testing[columna])

# Normalize features: the scaler learns min/max from the training set only and the
# same mapping is applied to the testing set, so equal raw values stay equal after
# scaling. Testing values outside the training range fall outside [0, 1].
columnas_numericas = datos_training.select_dtypes(include=['float64', 'int64']).columns
normalizador = MinMaxScaler()
datos_training[columnas_numericas] = normalizador.fit_transform(datos_training[columnas_numericas])
datos_testing[columnas_numericas] = normalizador.transform(datos_testing[columnas_numericas])


In [None]:
# Summary statistics of the testing set.
print(datos_testing.describe())
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
datos_testing.info()

             ï»¿id           dur         proto       service         state  \
count  82332.00000  8.233200e+04  82332.000000  82332.000000  82332.000000   
mean       0.50000  1.677927e-02      0.841141      0.118475      0.562459   
std        0.28868  7.850742e-02      0.143364      0.175674      0.111729   
min        0.00000  0.000000e+00      0.000000      0.000000      0.000000   
25%        0.25000  1.333334e-07      0.853846      0.000000      0.500000   
50%        0.50000  2.356334e-04      0.853846      0.000000      0.500000   
75%        0.75000  1.198934e-02      0.900000      0.166667      0.666667   
max        1.00000  1.000000e+00      1.000000      1.000000      1.000000   

              spkts         dpkts        sbytes        dbytes          rate  \
count  82332.000000  82332.000000  82332.000000  82332.000000  82332.000000   
mean       0.001660      0.001592      0.000555      0.000903      0.082411   
std        0.012580      0.010490      0.011956      0.01033

In [None]:
# Summary statistics of the training set.
print(datos_training.describe())
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
datos_training.info()

               ï»¿id           dur          proto        service  \
count  175341.000000  1.753410e+05  175341.000000  175341.000000   
mean        0.500000  2.265648e-02       0.830354       0.134910   
std         0.288678  1.080042e-01       0.169338       0.192096   
min         0.000000  0.000000e+00       0.000000       0.000000   
25%         0.250000  1.333334e-07       0.856061       0.000000   
50%         0.500000  2.636667e-05       0.856061       0.000000   
75%         0.750000  1.113449e-02       0.901515       0.166667   
max         1.000000  1.000000e+00       1.000000       1.000000   

               state          spkts          dpkts         sbytes  \
count  175341.000000  175341.000000  175341.000000  175341.000000   
mean        0.294397       0.002007       0.001729       0.000680   
std         0.108493       0.014237       0.010047       0.013480   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.250000       0.000104       

# Implementación de Algoritmos

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def entrenar_autoencoder(features, dimensiones_intermedias=[32, 16], epochs=50, batch_size=256):
    """Train a dense autoencoder and return it together with its encoder half.

    The encoder stacks one ReLU ``Dense`` layer per entry of
    ``dimensiones_intermedias``; the last entry is the bottleneck size. The
    decoder mirrors the encoder (every size except the bottleneck, in reverse
    order) and ends in a sigmoid layer as wide as the input, so
    reconstructions lie in (0, 1).

    Args:
        features: 2-D array of shape (samples, input_dim) used as both input
            and reconstruction target.
        dimensiones_intermedias: Sequence of layer widths for the encoder.
            Must contain at least one value. It is only read, never mutated.
        epochs: Number of training epochs.
        batch_size: Mini-batch size used during training.

    Returns:
        A tuple ``(autoencoder, encoder)``: the trained full model and a model
        sharing its layers that maps inputs to the bottleneck representation.

    Raises:
        ValueError: If ``dimensiones_intermedias`` is empty.
    """
    # Work on a private copy so the shared default list can never be altered.
    dimensiones = list(dimensiones_intermedias)
    if not dimensiones:
        raise ValueError("dimensiones_intermedias must contain at least one layer size")

    input_dim = features.shape[1]
    input_layer = Input(shape=(input_dim,))

    # Encoder: input -> ... -> bottleneck
    encoder_layer = input_layer
    for dim in dimensiones:
        encoder_layer = Dense(dim, activation="relu")(encoder_layer)

    # Decoder: mirror of the encoder without the bottleneck. With a single
    # intermediate size this loop is empty and the bottleneck feeds the
    # output layer directly.
    decoder_layer = encoder_layer
    for dim in reversed(dimensiones[:-1]):
        decoder_layer = Dense(dim, activation="relu")(decoder_layer)
    decoder_layer = Dense(input_dim, activation="sigmoid")(decoder_layer)

    autoencoder = Model(input_layer, decoder_layer)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')
    # The input doubles as the target: the network learns to reconstruct it.
    autoencoder.fit(features, features, epochs=epochs, batch_size=batch_size, shuffle=True, validation_split=0.2)

    # Reuses the trained encoder layers; no additional training happens here.
    encoder = Model(input_layer, encoder_layer)
    return autoencoder, encoder

def transformar_con_autoencoder(encoder, features):
    """Project features through an already trained encoder.

    Args:
        encoder: Object exposing ``predict``; here, the encoder half of an
            autoencoder.
        features: Features to transform, passed to ``encoder.predict`` as is.

    Returns:
        Whatever ``encoder.predict(features)`` returns, i.e. the transformed
        features.
    """
    caracteristicas_codificadas = encoder.predict(features)
    return caracteristicas_codificadas

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM

# Exclude every column that is not a predictor before training the autoencoder:
#   - 'label' is the target,
#   - 'attack_cat' names the attack category, so keeping it leaks the target,
#   - 'id' is a row identifier.
# The first header may still carry a byte-order-mark prefix from the CSV, so names
# are compared after stripping it.
columnas_no_predictoras = {'label', 'attack_cat', 'id'}
columnas_excluidas = [
    columna for columna in datos_training.columns
    if str(columna).replace('\xef\xbb\xbf', '').replace('\ufeff', '') in columnas_no_predictoras
]
features_training = datos_training.drop(columns=columnas_excluidas, errors='ignore')
features_testing = datos_testing.drop(columns=columnas_excluidas, errors='ignore')

# Train the autoencoder and project both sets into its bottleneck space.
# The DataFrames are converted to NumPy arrays once and reused.
matriz_training = features_training.to_numpy()
matriz_testing = features_testing.to_numpy()
autoencoder, encoder = entrenar_autoencoder(matriz_training)
features_training_reduced = transformar_con_autoencoder(encoder, matriz_training)
features_testing_reduced = transformar_con_autoencoder(encoder, matriz_testing)

# Train Isolation Forest and One-Class SVM on the reduced features
isolation_forest = IsolationForest(n_estimators=100, contamination='auto', random_state=42)
isolation_forest.fit(features_training_reduced)

one_class_svm = OneClassSVM(kernel='rbf', gamma='auto')
one_class_svm.fit(features_training_reduced)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


# Entrenamiento y Evaluación de Modelos

In [None]:
# Isolation Forest
from sklearn.ensemble import IsolationForest

# Model instance
isolation_forest = IsolationForest(n_estimators=100, contamination='auto', random_state=42)

# Fit on the autoencoder-reduced training features. The evaluation code calls
# predict() on `features_testing_reduced`, so the model must be trained in that
# same feature space; `datos_training` has a different number of columns and
# still contains the 'label' column.
isolation_forest.fit(features_training_reduced)


# One-Class SVM
from sklearn.svm import OneClassSVM

# Model instance
one_class_svm = OneClassSVM(kernel='rbf', gamma='auto')

# Same feature space as above, for the same reason.
one_class_svm.fit(features_training_reduced)

In [None]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize

# Ground truth in binary form (0 = normal, 1 = anomaly)
labels_testing = datos_testing['label'].copy()
labels_testing_binarized = label_binarize(labels_testing, classes=[0, 1]).ravel()

# Raw model outputs: -1 marks an outlier, 1 an inlier
predictions_isolation_forest = isolation_forest.predict(features_testing_reduced)
predictions_one_class_svm = one_class_svm.predict(features_testing_reduced)

# Map outliers (negative output) to 1 and inliers to 0 to match the labels
predictions_isolation_forest_binarized = np.where(predictions_isolation_forest < 0, 1, 0)
predictions_one_class_svm_binarized = np.where(predictions_one_class_svm < 0, 1, 0)

# The same three metrics are computed for each detector, in this order
metricas = (precision_score, recall_score, f1_score)

# Precision, recall and F1-score for Isolation Forest
precision_if, recall_if, f1_score_if = [
    metrica(labels_testing_binarized, predictions_isolation_forest_binarized)
    for metrica in metricas
]

# Precision, recall and F1-score for One-Class SVM
precision_svm, recall_svm, f1_score_svm = [
    metrica(labels_testing_binarized, predictions_one_class_svm_binarized)
    for metrica in metricas
]

print(f"Isolation Forest - Precisión: {precision_if}, Recall: {recall_if}, F1-Score: {f1_score_if}")
print(f"One-Class SVM - Precisión: {precision_svm}, Recall: {recall_svm}, F1-Score: {f1_score_svm}")



Isolation Forest - Precisión: 0.4390050801609817, Recall: 0.1467837289332039, F1-Score: 0.22000694341119872
One-Class SVM - Precisión: 0.5857893427843064, Recall: 0.8313112150357364, F1-Score: 0.6872811496936096


In [None]:
# Revisar el balanceo de los datos
# Random Forest y normalización de datos
# Selección de las mejores características
# Extraer las características
# Comparar la efectividad