In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Column names, in file order, for the raw data files (which are read
# without a header row). Written as a whitespace-separated listing and split
# into a list of 43 strings.
column_names = """
    duration protocol_type service flag src_bytes dst_bytes
    land wrong_fragment urgent hot num_failed_logins logged_in
    num_compromised root_shell su_attempted num_root num_file_creations
    num_shells num_access_files num_outbound_cmds is_host_login
    is_guest_login count srv_count serror_rate srv_serror_rate
    rerror_rate srv_rerror_rate same_srv_rate diff_srv_rate
    srv_diff_host_rate dst_host_count dst_host_srv_count
    dst_host_same_srv_rate dst_host_diff_srv_rate dst_host_same_src_port_rate
    dst_host_srv_diff_host_rate dst_host_serror_rate dst_host_srv_serror_rate
    dst_host_rerror_rate dst_host_srv_rerror_rate label difficulty
""".split()


In [3]:
# Load the train and test files. Neither is parsed with a header row, so the
# labels come from column_names.
df_train, df_test = (
    pd.read_csv(raw_path, names=column_names, header=None)
    for raw_path in ('../data/raw/KDDTrain+.txt', '../data/raw/KDDTest+.txt')
)

# Report the (rows, columns) shape of each loaded frame.
for split_name, frame in (("Train", df_train), ("Test", df_test)):
    print(f"âœ“ {split_name}: {frame.shape}")

âœ“ Train: (125973, 43)
âœ“ Test: (22544, 43)


In [4]:
# Collapse the label into a binary target: 0 when the label is 'normal',
# 1 for every other label value.
for frame in (df_train, df_test):
    frame['target'] = frame['label'].ne('normal').astype(int)

# Show the class balance of the training target.
train_target = df_train['target']
print("DistribuciÃ³n de la variable objetivo:")
print(train_target.value_counts())
print(f"\nPorcentaje de ataques: {train_target.mean()*100:.2f}%")

DistribuciÃ³n de la variable objetivo:
target
0    67343
1    58630
Name: count, dtype: int64

Porcentaje de ataques: 46.54%


In [5]:
# Drop the columns that will not be used as features.
columns_to_drop = ['label', 'difficulty']
df_train_clean = df_train.drop(columns=columns_to_drop)
df_test_clean = df_test.drop(columns=columns_to_drop)

# Split each cleaned frame into a feature matrix and the target series.
# drop() returns a new frame, so the *_clean frames keep their 'target' column.
X_train, y_train = df_train_clean.drop(columns='target'), df_train_clean['target']
X_test, y_test = df_test_clean.drop(columns='target'), df_test_clean['target']

# Report the shape of every resulting piece.
for part_name, part in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"âœ“ {part_name}: {part.shape}")

âœ“ X_train: (125973, 41)
âœ“ y_train: (125973,)
âœ“ X_test: (22544, 41)
âœ“ y_test: (22544,)


In [6]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
categorical_cols = list(X_train.select_dtypes(include=['object']).columns)
numerical_cols = list(X_train.select_dtypes(include=['int64', 'float64']).columns)

n_categorical = len(categorical_cols)
n_numerical = len(numerical_cols)

# The numerical line prints only the count (twice), not the column names.
print(f"Variables categÃ³ricas ({n_categorical}): {categorical_cols}")
print(f"Variables numÃ©ricas ({n_numerical}): {n_numerical}")

Variables categÃ³ricas (3): ['protocol_type', 'service', 'flag']
Variables numÃ©ricas (38): 38


In [7]:
# Label-encode the categorical columns.
# The fitted encoders are collected in label_encoders (keyed by column name)
# so they can be persisted and reused later.

label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    # Learn the category -> integer mapping from the training split only.
    X_train[col] = le.fit_transform(X_train[col])

    # Replace test categories the encoder never saw with the sentinel
    # 'unknown'. isin()/where() do this in one vectorized pass instead of a
    # per-row Python lambda that scans le.classes_ for every value.
    is_known = X_test[col].isin(le.classes_)
    X_test[col] = X_test[col].where(is_known, 'unknown')

    # Register the sentinel as an extra class only when it is needed and not
    # already a fitted class, so classes_ never ends up with a duplicate entry.
    # Appending (rather than re-fitting) keeps the integer codes already
    # assigned to the training column unchanged.
    # NOTE(review): the sentinel is added only if this test split contained
    # unseen values; a consumer of the saved encoder must apply the same
    # unknown-handling itself - confirm against the serving code.
    if not is_known.all() and 'unknown' not in le.classes_:
        le.classes_ = np.array(list(le.classes_) + ['unknown'])

    X_test[col] = le.transform(X_test[col])

    # Keep the encoder for later reuse.
    label_encoders[col] = le

    print(f"âœ“ Encoded: {col} - {len(le.classes_)} categorÃ­as")

print(f"\nâœ“ Total encoders guardados: {len(label_encoders)}")

âœ“ Encoded: protocol_type - 3 categorÃ­as
âœ“ Encoded: service - 70 categorÃ­as
âœ“ Encoded: flag - 11 categorÃ­as

âœ“ Total encoders guardados: 3


In [8]:
# Standardize the numerical columns (mean 0, std 1).
# The scaler is fitted on the training split only; both splits are then
# transformed with those training statistics.
scaler = StandardScaler().fit(X_train[numerical_cols])

for features in (X_train, X_test):
    features[numerical_cols] = scaler.transform(features[numerical_cols])

print("âœ“ Variables numÃ©ricas normalizadas")
print("\nEjemplo - Primeras 5 filas despuÃ©s de normalizaciÃ³n:")
scaled_preview = X_train[numerical_cols].head()
print(scaled_preview)


âœ“ Variables numÃ©ricas normalizadas

Ejemplo - Primeras 5 filas despuÃ©s de normalizaciÃ³n:
   duration  src_bytes  dst_bytes      land  wrong_fragment    urgent  \
0 -0.110249  -0.007679  -0.004919 -0.014089       -0.089486 -0.007736   
1 -0.110249  -0.007737  -0.004919 -0.014089       -0.089486 -0.007736   
2 -0.110249  -0.007762  -0.004919 -0.014089       -0.089486 -0.007736   
3 -0.110249  -0.007723  -0.002891 -0.014089       -0.089486 -0.007736   
4 -0.110249  -0.007728  -0.004814 -0.014089       -0.089486 -0.007736   

        hot  num_failed_logins  logged_in  num_compromised  ...  \
0 -0.095076          -0.027023  -0.809262        -0.011664  ...   
1 -0.095076          -0.027023  -0.809262        -0.011664  ...   
2 -0.095076          -0.027023  -0.809262        -0.011664  ...   
3 -0.095076          -0.027023   1.235694        -0.011664  ...   
4 -0.095076          -0.027023   1.235694        -0.011664  ...   

   dst_host_count  dst_host_srv_count  dst_host_same_srv_rate  \

In [9]:
# Count missing values across every cell of each feature matrix.
for split_label, features in (("train", X_train), ("test", X_test)):
    print(f"Valores nulos en {split_label}:", features.isna().sum().sum())

# Tally the column dtypes to confirm everything is numeric.
dtype_counts = X_train.dtypes.value_counts()
print("\nTipos de datos en X_train:")
print(dtype_counts)

# Summary statistics of the processed training features.
print("\nEstadÃ­sticas de datos procesados:")
print(X_train.describe())

Valores nulos en train: 0
Valores nulos en test: 0

Tipos de datos en X_train:
float64    38
int32       3
Name: count, dtype: int64

EstadÃ­sticas de datos procesados:
           duration  protocol_type        service           flag  \
count  1.259730e+05  125973.000000  125973.000000  125973.000000   
mean   2.549477e-17       1.053202      31.226469       6.979996   
std    1.000004e+00       0.426620      16.346470       2.689365   
min   -1.102492e-01       0.000000       0.000000       0.000000   
25%   -1.102492e-01       1.000000      20.000000       5.000000   
50%   -1.102492e-01       1.000000      24.000000       9.000000   
75%   -1.102492e-01       1.000000      49.000000       9.000000   
max    1.636428e+01       2.000000      69.000000      10.000000   

          src_bytes     dst_bytes          land  wrong_fragment        urgent  \
count  1.259730e+05  1.259730e+05  1.259730e+05    1.259730e+05  1.259730e+05   
mean  -4.512349e-19  7.614590e-19 -4.794371e-18    4.230

In [10]:
# Persist the processed splits and the fitted preprocessing objects.
processed_dir = '../data/processed'

# Create the output folder if it does not exist yet.
os.makedirs(processed_dir, exist_ok=True)

# Write each split as CSV without the index column.
for file_name, data in (
    ('X_train.csv', X_train),
    ('y_train.csv', y_train),
    ('X_test.csv', X_test),
    ('y_test.csv', y_test),
):
    data.to_csv(f'{processed_dir}/{file_name}', index=False)

# Pickle the scaler and the encoders. Each file is opened in a `with` block so
# the handle is flushed and closed deterministically instead of being left to
# the garbage collector.
# NOTE: pickle files can execute arbitrary code when loaded; only load these
# artifacts from a trusted location.
for file_name, artifact in (
    ('scaler.pkl', scaler),
    ('label_encoders.pkl', label_encoders),
):
    with open(f'{processed_dir}/{file_name}', 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

print("âœ“ Datos procesados guardados en /data/processed/")
print("âœ“ Scaler y encoders guardados para uso futuro")

âœ“ Datos procesados guardados en /data/processed/
âœ“ Scaler y encoders guardados para uso futuro


In [11]:
# Final recap of the preprocessing run: dataset sizes, class counts,
# transformations applied, and files written.
banner = "=" * 70
print(banner)
print("RESUMEN DEL PREPROCESAMIENTO")
print(banner)

# Rows and feature counts per split (labels padded so the columns line up).
print("\nðŸ“Š Datasets procesados:")
for split_tag, features in (("Train:", X_train), ("Test: ", X_test)):
    n_rows, n_features = features.shape
    print(f"   {split_tag} {n_rows:,} filas Ã— {n_features} features")

# Class counts per split: target 0 is normal, target 1 is attack.
print("\nðŸŽ¯ Variable objetivo:")
for split_tag, target in (("Train", y_train), ("Test ", y_test)):
    n_normal = target.eq(0).sum()
    n_attack = target.eq(1).sum()
    print(f"   {split_tag} - Normal: {n_normal:,} | Ataques: {n_attack:,}")

# Transformations applied by the earlier cells.
applied_steps = [
    f"{len(categorical_cols)} variables categÃ³ricas â†’ Label Encoding",
    f"{len(numerical_cols)} variables numÃ©ricas â†’ NormalizaciÃ³n (StandardScaler)",
    "Variable objetivo binaria (Normal=0, Ataque=1)",
]
print("\nðŸ”§ Transformaciones aplicadas:")
for step in applied_steps:
    print(f"   âœ“ {step}")

# Files written to the processed-data folder.
saved_files = [
    "X_train.csv, y_train.csv",
    "X_test.csv, y_test.csv",
    "scaler.pkl (para normalizaciÃ³n)",
    "label_encoders.pkl (para encoding)",
]
print("\nðŸ’¾ Archivos guardados:")
for entry in saved_files:
    print(f"   âœ“ {entry}")

print("\nðŸš€ LISTO PARA MODELADO!")

RESUMEN DEL PREPROCESAMIENTO

ðŸ“Š Datasets procesados:
   Train: 125,973 filas Ã— 41 features
   Test:  22,544 filas Ã— 41 features

ðŸŽ¯ Variable objetivo:
   Train - Normal: 67,343 | Ataques: 58,630
   Test  - Normal: 9,711 | Ataques: 12,833

ðŸ”§ Transformaciones aplicadas:
   âœ“ 3 variables categÃ³ricas â†’ Label Encoding
   âœ“ 38 variables numÃ©ricas â†’ NormalizaciÃ³n (StandardScaler)
   âœ“ Variable objetivo binaria (Normal=0, Ataque=1)

ðŸ’¾ Archivos guardados:
   âœ“ X_train.csv, y_train.csv
   âœ“ X_test.csv, y_test.csv
   âœ“ scaler.pkl (para normalizaciÃ³n)
   âœ“ label_encoders.pkl (para encoding)

ðŸš€ LISTO PARA MODELADO!
