# Predicción de Vulnerabilidades en Commits de GitHub

**Descripción General** <br>
Este laboratorio utiliza técnicas de Machine Learning para analizar commits de GitHub y predecir:
- **Clasificación Binaria:** Si un código es seguro o vulnerable
- **Clasificación Multiclase:** El tipo específico de vulnerabilidad

***Para ejecutar el proyecto:***
1. Ejecutar celdas en orden: Ejecutar cada celda secuencialmente
2. Ver resultados intermedios: Cada celda muestra progreso y resultados
3. Modificar parámetros: Puede ajustar el tamaño del dataset en Celda 2, función ***generate_vulnerability_dataset***
4. Probar con nuevos datos: Use la función load_and_predict() con sus propios datos

## Importación de Librerías
Importamos todas las librerías necesarias para el proyecto:
- pandas y numpy para manipulación de datos
- sklearn para machine learning y preprocesamiento
- xgboost como algoritmo avanzado
- matplotlib y seaborn para visualizaciones

In [None]:
# Core data-handling, plotting and standard-library imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import random

# Machine Learning
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# XGBoost
import xgboost as xgb

# Utilities
import joblib
import warnings
# NOTE: this silences every warning category for the rest of the session,
# including deprecation warnings raised by the plotting and ML libraries.
warnings.filterwarnings('ignore')

# Plot styling applied globally to all figures created below
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## Generación del Dataset Sintético
Esta función crea un dataset sintético realista que simula commits de GitHub con:
- Características técnicas (líneas de código, complejidad, etc.)
- Patrones de seguridad específicos por lenguaje
- Historial de desarrolladores
- Etiquetas de vulnerabilidad basadas en OWASP Top 10

In [None]:
def generate_vulnerability_dataset(n_samples=2000, seed=42):
    """
    Generate a synthetic dataset of GitHub commits with security-related
    features, for binary and multiclass vulnerability classification.

    Each row draws a language and a vulnerability type uniformly at random
    and then samples the remaining features from distributions that depend
    on those two choices.

    Args:
        n_samples: Number of synthetic commits (rows) to generate.
        seed: Seed applied to both the global ``random`` and ``numpy.random``
            generators before sampling. The default (42) reproduces the
            historical output; pass ``None`` to reseed from OS entropy.

    Returns:
        pandas.DataFrame: One row per commit with the commit features, the
        binary label ``is_vulnerable`` and the multiclass label
        ``vulnerability_type``. The ``timestamp`` column is relative to the
        time of the call and is therefore not reproducible across runs.
    """

    # Reproducible configuration. NOTE: this reseeds the *global* generators,
    # which also affects any other code that uses them afterwards.
    np.random.seed(seed)
    random.seed(seed)

    # Programming languages sampled uniformly for each commit
    languages = ['JavaScript', 'Python', 'Java', 'C++', 'C', 'PHP', 'Ruby', 'Go', 'Rust']

    # Vulnerability categories; 'Secure' is listed twice so that it is drawn
    # with double weight (2 out of 11 entries).
    vulnerability_types = [
        'SQL_Injection', 'XSS', 'Buffer_Overflow', 'Insecure_Authentication',
        'Path_Traversal', 'Command_Injection', 'XXE', 'Deserialization',
        'Cryptographic_Weakness', 'Secure', 'Secure'
    ]

    # Commit-message keywords available for each vulnerability type
    vulnerability_keywords = {
        'SQL_Injection': ['sql', 'query', 'database', 'escape', 'parameterize', 'orm'],
        'XSS': ['html', 'script', 'escape', 'sanitize', 'dom', 'innerhtml'],
        'Buffer_Overflow': ['buffer', 'memory', 'alloc', 'size', 'length', 'boundary'],
        'Insecure_Authentication': ['auth', 'password', 'token', 'session', 'cookie', 'jwt'],
        'Path_Traversal': ['path', 'file', 'directory', 'traverse', 'upload'],
        'Command_Injection': ['command', 'exec', 'system', 'shell', 'subprocess'],
        'XXE': ['xml', 'external', 'entity', 'parser'],
        'Deserialization': ['serialize', 'deserialize', 'json', 'yaml', 'pickle'],
        'Cryptographic_Weakness': ['crypto', 'encrypt', 'hash', 'salt', 'iv', 'key'],
        'Secure': ['refactor', 'optimize', 'feature', 'documentation', 'test', 'style']
    }

    # The three lookup tables below never change between rows, so they are
    # built once here instead of on every loop iteration.

    # (mean, standard deviation) of cyclomatic complexity per language
    complexity_factors = {
        'JavaScript': (15, 8), 'Python': (12, 6), 'Java': (20, 10),
        'C++': (25, 12), 'C': (28, 15), 'PHP': (18, 9),
        'Ruby': (14, 7), 'Go': (16, 8), 'Rust': (22, 11)
    }

    # Probability that a commit in each language shows a defensive practice
    security_metrics = {
        'JavaScript': {'has_input_validation': 0.3, 'has_escape_functions': 0.4},
        'Python': {'has_input_validation': 0.5, 'has_escape_functions': 0.3},
        'Java': {'has_input_validation': 0.6, 'has_escape_functions': 0.5},
        'C++': {'has_input_validation': 0.4, 'has_escape_functions': 0.2},
        'C': {'has_input_validation': 0.2, 'has_escape_functions': 0.1},
        'PHP': {'has_input_validation': 0.3, 'has_escape_functions': 0.3},
        'Ruby': {'has_input_validation': 0.5, 'has_escape_functions': 0.4},
        'Go': {'has_input_validation': 0.7, 'has_escape_functions': 0.6},
        'Rust': {'has_input_validation': 0.8, 'has_escape_functions': 0.7}
    }

    # Probability of each risky code pattern per vulnerability type. Only
    # 'raw_queries', 'string_concatenation' and 'inner_html' are read below;
    # a pattern missing for a type falls back to probability 0.1.
    risky_patterns = {
        'SQL_Injection': {'raw_queries': 0.8, 'string_concatenation': 0.7},
        'XSS': {'inner_html': 0.9, 'eval_usage': 0.6},
        'Buffer_Overflow': {'fixed_size_buffers': 0.8, 'no_bounds_check': 0.9},
        'Insecure_Authentication': {'hardcoded_credentials': 0.7, 'weak_hashing': 0.6},
        'Path_Traversal': {'user_input_paths': 0.8, 'no_path_validation': 0.7},
        'Command_Injection': {'system_calls': 0.9, 'user_input_commands': 0.8},
        'XXE': {'xml_parsing': 0.9, 'external_entities': 0.7},
        'Deserialization': {'untrusted_deserialization': 0.8, 'no_validation': 0.6},
        'Cryptographic_Weakness': {'weak_crypto': 0.7, 'hardcoded_keys': 0.6},
        'Secure': {'raw_queries': 0.1, 'string_concatenation': 0.1}
    }

    # Single reference instant so every timestamp is an offset from the same
    # moment rather than drifting with the loop's execution time.
    reference_time = datetime.now()

    data = []

    # NOTE: the order of the random draws inside the loop is significant;
    # reordering them changes the dataset produced for a given seed.
    for i in range(n_samples):
        # Basic commit attributes
        language = random.choice(languages)
        vulnerability_type = random.choice(vulnerability_types)

        # Binary label derived from the multiclass label
        is_vulnerable = 1 if vulnerability_type != 'Secure' else 0

        # Size of the change
        lines_added = np.random.poisson(50)
        lines_deleted = np.random.poisson(20)
        files_changed = np.random.poisson(3)

        # Language-dependent complexity, floored at 1
        base_complexity, complexity_std = complexity_factors[language]
        cyclomatic_complexity = max(1, int(np.random.normal(base_complexity, complexity_std)))

        # Language-dependent defensive practices
        metrics = security_metrics[language]
        has_input_validation = 1 if random.random() < metrics['has_input_validation'] else 0
        has_escape_functions = 1 if random.random() < metrics['has_escape_functions'] else 0

        # Risky code patterns, dependent on the vulnerability type
        pattern_weights = risky_patterns[vulnerability_type]
        has_raw_queries = 1 if random.random() < pattern_weights.get('raw_queries', 0.1) else 0
        has_string_concatenation = 1 if random.random() < pattern_weights.get('string_concatenation', 0.1) else 0
        has_inner_html = 1 if random.random() < pattern_weights.get('inner_html', 0.1) else 0

        # Developer history; the prior-vulnerability rate depends on the label
        developer_experience = np.random.normal(3, 1)
        previous_vulnerabilities = np.random.poisson(0.5) if is_vulnerable else np.random.poisson(0.1)

        # Change size, and a risk proxy derived from it capped at 1.0
        change_size = lines_added + lines_deleted
        risk_factor = min(1.0, change_size / 200)

        # Commit message built from up to three keywords of the chosen type
        commit_message_keywords = random.sample(
            vulnerability_keywords[vulnerability_type],
            k=min(3, len(vulnerability_keywords[vulnerability_type]))
        )
        commit_message = f"Fix: {', '.join(commit_message_keywords)} issues"

        data.append({
            'commit_id': f"commit_{i:06d}",
            'language': language,
            'vulnerability_type': vulnerability_type,
            'is_vulnerable': is_vulnerable,
            'lines_added': lines_added,
            'lines_deleted': lines_deleted,
            'files_changed': files_changed,
            'cyclomatic_complexity': cyclomatic_complexity,
            'has_input_validation': has_input_validation,
            'has_escape_functions': has_escape_functions,
            'has_raw_queries': has_raw_queries,
            'has_string_concatenation': has_string_concatenation,
            'has_inner_html': has_inner_html,
            # Experience is drawn from a normal distribution; clip at 0.1
            'developer_experience': max(0.1, developer_experience),
            'previous_vulnerabilities': previous_vulnerabilities,
            'change_size': change_size,
            'risk_factor': risk_factor,
            'commit_message': commit_message,
            'timestamp': reference_time - timedelta(days=random.randint(0, 365))
        })

    return pd.DataFrame(data)

# Build the dataset (2000 synthetic commits) and report its dimensions
print("üîÑ Generando dataset de vulnerabilidades...")
df = generate_vulnerability_dataset(2000)
print(f"Dataset generado: {df.shape[0]} filas, {df.shape[1]} columnas")

---
## Análisis Exploratorio de Datos (EDA)
Explicación: El EDA nos ayuda a:
- Entender la distribución de los datos
- Identificar relaciones entre variables
- Detectar posibles problemas de balance en las clases
- Validar la calidad del dataset generado

### Análisis Inicial del Dataset

In [None]:
# Initial inspection of the dataset: shape, sample rows, column info,
# descriptive statistics and the distribution of the key label columns.
print("AN√ÅLISIS EXPLORATORIO DE DATOS")
print("=" * 50)

print("\nEstructura del Dataset:")
print(f"Dimensiones: {df.shape}")
print(f"\nPrimeras 5 filas:")
display(df.head())

print(f"\nInformaci√≥n de columnas:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# it must not be wrapped in print() (that would emit a stray "None" line).
df.info()

print(f"\nEstad√≠sticas descriptivas:")
print(df.describe())

print(f"\nDistribuci√≥n de variables clave:")
print(f"Lenguajes: \n{df['language'].value_counts()}")
print(f"\nTipos de vulnerabilidad: \n{df['vulnerability_type'].value_counts()}")
print(f"\nBalance vulnerable/seguro: \n{df['is_vulnerable'].value_counts()}")
print(f"Proporci√≥n vulnerable: {df['is_vulnerable'].mean():.2%}")

### Visualizaciones del Dataset

In [None]:
# Overview of the data distribution on a 2x3 grid of plots
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# 1. Number of commits per programming language
df['language'].value_counts().plot(kind='bar', ax=axes[0,0], color='skyblue')
axes[0,0].set_title('Distribuci√≥n de Lenguajes de Programaci√≥n')
axes[0,0].tick_params(axis='x', rotation=45)
axes[0,0].set_ylabel('N√∫mero de Commits')

# 2. Frequency of each vulnerability type
df['vulnerability_type'].value_counts().plot(kind='bar', ax=axes[0,1], color='lightcoral')
axes[0,1].set_title('Distribuci√≥n de Tipos de Vulnerabilidad')
axes[0,1].tick_params(axis='x', rotation=45)
axes[0,1].set_ylabel('Frecuencia')

# 3. Histogram of cyclomatic complexity
df['cyclomatic_complexity'].hist(bins=30, ax=axes[0,2], color='lightgreen', alpha=0.7)
axes[0,2].set_title('Distribuci√≥n de Complejidad Ciclom√°tica')
axes[0,2].set_xlabel('Complejidad Ciclom√°tica')
axes[0,2].set_ylabel('Frecuencia')

# 4. Cyclomatic complexity split by the binary vulnerability label
sns.boxplot(data=df, x='is_vulnerable', y='cyclomatic_complexity', ax=axes[1,0])
axes[1,0].set_title('Complejidad vs Vulnerabilidad')
axes[1,0].set_xlabel('Es Vulnerable (0=No, 1=S√≠)')
axes[1,0].set_ylabel('Complejidad Ciclom√°tica')

# 5. Share of vulnerable commits per language (mean of the 0/1 label)
vuln_by_lang = df.groupby('language')['is_vulnerable'].mean().sort_values(ascending=False)
vuln_by_lang.plot(kind='bar', ax=axes[1,1], color='orange')
axes[1,1].set_title('Proporci√≥n de Vulnerabilidades por Lenguaje')
axes[1,1].tick_params(axis='x', rotation=45)
axes[1,1].set_ylabel('Proporci√≥n de Vulnerable')

# 6. Change size against risk factor, coloured by the vulnerability label
sns.scatterplot(data=df, x='change_size', y='risk_factor', hue='is_vulnerable', 
                alpha=0.6, ax=axes[1,2])
axes[1,2].set_title('Tama√±o de Cambio vs Factor de Riesgo')
axes[1,2].set_xlabel('Tama√±o del Cambio (l√≠neas)')
axes[1,2].set_ylabel('Factor de Riesgo')

plt.tight_layout()
plt.show()

# Correlation heatmap over every numeric column of df (this includes the
# binary label is_vulnerable and the 0/1 flag columns)
print("\nüîó Matriz de Correlaciones:")
numeric_cols = df.select_dtypes(include=[np.number]).columns
plt.figure(figsize=(12, 8))
sns.heatmap(df[numeric_cols].corr(), annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Matriz de Correlaci√≥n de Caracter√≠sticas Num√©ricas')
plt.tight_layout()
plt.show()

---
## Preprocesamiento de Datos
**Preparación de Características**<br>
Explicación: Esta función:
- Separa características de etiquetas
- Codifica las etiquetas de texto a numéricas para modelos de ML
- Define el preprocesador para diferentes tipos de características
- Aplica escalado a numéricas y one-hot encoding a categóricas

In [None]:
def prepare_features(df):
    """
    Split the commit DataFrame into model inputs and targets, and build the
    (unfitted) column preprocessor used by the training pipelines.

    Args:
        df: DataFrame with the columns produced by the dataset generator.

    Returns:
        tuple: (X, y_binary, y_multiclass, preprocessor, label_encoder) where
        ``y_multiclass`` is the integer-encoded ``vulnerability_type`` and
        ``label_encoder`` is the fitted encoder that maps codes back to names.
    """
    print("üîÑ Preparando caracter√≠sticas para modelado...")

    # Identifiers, both targets and free-text/time columns are not model inputs
    non_feature_columns = [
        'commit_id', 'vulnerability_type', 'is_vulnerable', 'commit_message', 'timestamp'
    ]
    X = df.drop(columns=non_feature_columns)

    # Targets: binary label as-is, multiclass label encoded to integers
    y_binary = df['is_vulnerable']
    label_encoder = LabelEncoder()
    encoded_vulnerability_types = label_encoder.fit_transform(df['vulnerability_type'])

    print("Mapeo de clases de vulnerabilidad:")
    for code, label in enumerate(label_encoder.classes_):
        print(f"  {code}: {label}")

    # Column groups, each handled by its own transformer
    scaled_columns = [
        'lines_added', 'lines_deleted', 'files_changed', 'cyclomatic_complexity',
        'developer_experience', 'previous_vulnerabilities', 'change_size', 'risk_factor'
    ]
    one_hot_columns = ['language']
    flag_columns = [
        'has_input_validation', 'has_escape_functions', 'has_raw_queries',
        'has_string_concatenation', 'has_inner_html'
    ]

    # Numeric columns are standardised, the language is one-hot encoded with
    # its first category dropped, and the 0/1 flags pass through untouched.
    language_encoder = OneHotEncoder(drop='first', sparse_output=False)
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), scaled_columns),
            ('cat', language_encoder, one_hot_columns),
            ('bin', 'passthrough', flag_columns)
        ]
    )

    print("Caracter√≠sticas preparadas exitosamente")
    return X, y_binary, encoded_vulnerability_types, preprocessor, label_encoder

# Build the feature matrix, both targets, the preprocessor and the label encoder
X, y_binary, y_multiclass_encoded, preprocessor, label_encoder = prepare_features(df)

**División de Datos**

In [None]:
# Hold out 20% of the rows twice: once stratified on the binary label and
# once stratified on the encoded vulnerability type.
print("üîÑ Dividiendo datos en train y test...")

split_options = {'test_size': 0.2, 'random_state': 42}

# Binary task (secure vs. vulnerable)
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(
    X, y_binary, stratify=y_binary, **split_options
)

# Multiclass task (vulnerability type)
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X, y_multiclass_encoded, stratify=y_multiclass_encoded, **split_options
)

print("‚úÖ Datos divididos exitosamente:")
print(f"üìä Conjunto de entrenamiento binario: {X_train_bin.shape}")
print(f"üìä Conjunto de prueba binario: {X_test_bin.shape}")
print(f"üéØ Conjunto de entrenamiento multiclase: {X_train_multi.shape}")
print(f"üéØ Conjunto de prueba multiclase: {X_test_multi.shape}")

# Class balance of both training sets
print(f"\n‚öñÔ∏è Balance en entrenamiento binario:")
print(pd.Series(y_train_bin).value_counts(normalize=True))
print(f"\n‚öñÔ∏è Balance en entrenamiento multiclase:")
encoded_classes, class_counts = np.unique(y_train_multi, return_counts=True)
n_train_multi = len(y_train_multi)
# Decode all class codes in one call, then report each class share
decoded_classes = label_encoder.inverse_transform(encoded_classes)
for class_name, count in zip(decoded_classes, class_counts):
    print(f"  {class_name}: {count/n_train_multi:.2%}")

---
### Entrenamiento de Modelos
Explicación: Entrenamos múltiples algoritmos para:
- Binario: Random Forest, Logistic Regression, SVM, XGBoost
- Multiclase: Random Forest y XGBoost. <br>
  
Cada modelo se evalúa con métricas estándar de clasificación.
#### Clasificación Binaria (Seguro/Vulnerable)

In [None]:
# Train and evaluate the binary (secure vs. vulnerable) classifiers.
#
# Pipeline keeps the step objects it is given without copying them, so
# passing the shared `preprocessor` directly would make every stored pipeline
# reference the same transformer. Any later refit of that object on another
# split would then silently change the scaling used by the pipelines kept in
# `binary_results`. Each pipeline therefore gets its own unfitted copy.
from sklearn.base import clone

print("üîß Entrenando modelos de clasificaci√≥n binaria...")

binary_models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42, probability=True),
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss')
}

# name -> fitted pipeline, test accuracy, test predictions and probabilities
binary_results = {}

for name, model in binary_models.items():
    print(f"üîÑ Entrenando {name}...")

    # Independent preprocessing + classifier pipeline for this model
    pipeline = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('classifier', model)
    ])

    # Fit on the binary training split
    pipeline.fit(X_train_bin, y_train_bin)

    # Predict labels and the probability of the positive class (column 1)
    y_pred = pipeline.predict(X_test_bin)
    y_pred_proba = pipeline.predict_proba(X_test_bin)[:, 1]

    # Evaluate on the held-out binary test split
    accuracy = accuracy_score(y_test_bin, y_pred)
    binary_results[name] = {
        'model': pipeline,
        'accuracy': accuracy,
        'predictions': y_pred,
        'probabilities': y_pred_proba
    }

    print(f"‚úÖ {name} - Accuracy: {accuracy:.4f}")
    print(classification_report(y_test_bin, y_pred))
    print("‚îÄ" * 50)

# Rank the binary models by test accuracy (best first)
print("\nüèÜ COMPARACI√ìN DE MODELOS BINARIOS:")
binary_comparison = pd.DataFrame({
    'Modelo': list(binary_results.keys()),
    'Accuracy': [result['accuracy'] for result in binary_results.values()]
}).sort_values('Accuracy', ascending=False)

print(binary_comparison)

#### Clasificación Multiclase (Tipo de Vulnerabilidad)

In [None]:
# Train and evaluate the multiclass (vulnerability type) classifiers.
print("üîß Entrenando modelos de clasificaci√≥n multiclase...")

multi_models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    # 'mlogloss' is XGBoost's multiclass log-loss; plain 'logloss' is the
    # binary metric and does not match this multiclass target.
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='mlogloss')
}

# name -> fitted pipeline, test accuracy, predictions, class probabilities
multi_results = {}

for name, model in multi_models.items():
    print(f"üîÑ Entrenando {name}...")

    # The shared `preprocessor` object is passed as-is, so after this loop
    # it is left fitted on the multiclass training split; later cells read
    # its fitted state when building feature names.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    # Fit on the multiclass training split
    pipeline.fit(X_train_multi, y_train_multi)

    # Predict encoded labels and the full per-class probability matrix
    y_pred = pipeline.predict(X_test_multi)
    y_pred_proba = pipeline.predict_proba(X_test_multi)

    # Evaluate on the held-out multiclass test split
    accuracy = accuracy_score(y_test_multi, y_pred)
    multi_results[name] = {
        'model': pipeline,
        'accuracy': accuracy,
        'predictions': y_pred,
        'probabilities': y_pred_proba,
        'label_encoder': label_encoder
    }

    print(f"‚úÖ {name} - Accuracy: {accuracy:.4f}")

    # Decode integer labels back to class names for a readable report
    y_test_names = label_encoder.inverse_transform(y_test_multi)
    y_pred_names = label_encoder.inverse_transform(y_pred)

    print(classification_report(y_test_names, y_pred_names))
    print("‚îÄ" * 50)

# Rank the multiclass models by test accuracy (best first)
print("\nüèÜ COMPARACI√ìN DE MODELOS MULTICLASE:")
multi_comparison = pd.DataFrame({
    'Modelo': list(multi_results.keys()),
    'Accuracy': [result['accuracy'] for result in multi_results.values()]
}).sort_values('Accuracy', ascending=False)

print(multi_comparison)

---
## Evaluación y Visualización
#### Matrices de Confusión

In [None]:
# Evaluation dashboard on a 2x2 grid: binary confusion matrix, multiclass
# confusion matrix, accuracy comparison and top feature importances.
print("üìà Visualizando matrices de confusi√≥n...")

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Confusion matrix of the best binary model (first row of the ranking)
best_binary_name = binary_comparison.iloc[0]['Modelo']
best_binary_result = binary_results[best_binary_name]

cm_binary = confusion_matrix(y_test_bin, best_binary_result['predictions'])

sns.heatmap(cm_binary, annot=True, fmt='d', cmap='Blues', ax=axes[0,0],
            xticklabels=['Seguro (0)', 'Vulnerable (1)'],
            yticklabels=['Seguro (0)', 'Vulnerable (1)'])
axes[0,0].set_title(f'Matriz de Confusi√≥n - {best_binary_name} (Binario)')
axes[0,0].set_xlabel('Predicci√≥n')
axes[0,0].set_ylabel('Real')

# 2. Confusion matrix of the best multiclass model
best_multi_name = multi_comparison.iloc[0]['Modelo']
best_multi_result = multi_results[best_multi_name]

# Decode integer labels to class names so the axes are readable
y_test_multi_names = label_encoder.inverse_transform(y_test_multi)
y_pred_multi_names = label_encoder.inverse_transform(best_multi_result['predictions'])
class_names = label_encoder.classes_

# Passing labels= fixes the row/column order to the encoder's class order
cm_multi = confusion_matrix(y_test_multi_names, y_pred_multi_names, labels=class_names)

sns.heatmap(cm_multi, annot=True, fmt='d', cmap='Blues', ax=axes[0,1],
            xticklabels=class_names,
            yticklabels=class_names)
axes[0,1].set_title(f'Matriz de Confusi√≥n - {best_multi_name} (Multiclase)')
axes[0,1].set_xlabel('Predicci√≥n')
axes[0,1].set_ylabel('Real')
axes[0,1].tick_params(axis='x', rotation=45)
axes[0,1].tick_params(axis='y', rotation=0)

# 3. Accuracy of every binary and multiclass model side by side
models_combined = {
    **{f"Binario - {k}": v['accuracy'] for k, v in binary_results.items()},
    **{f"Multiclase - {k}": v['accuracy'] for k, v in multi_results.items()}
}

models_df = pd.DataFrame({
    'Modelo': list(models_combined.keys()),
    'Accuracy': list(models_combined.values())
}).sort_values('Accuracy', ascending=False)

sns.barplot(data=models_df, x='Accuracy', y='Modelo', ax=axes[1,0], palette='viridis')
axes[1,0].set_title('Comparaci√≥n de Accuracy entre Modelos')
axes[1,0].set_xlim(0, 1)

# 4. Feature importances of the best binary model
try:
    best_model = binary_results[best_binary_name]['model']

    # Read the one-hot column names from the preprocessor that belongs to
    # this pipeline, so the names always describe the columns the classifier
    # was actually trained on.
    fitted_preprocessor = best_model.named_steps['preprocessor']

    # Names are assembled in the ColumnTransformer's output order:
    # scaled numeric columns, one-hot language columns, passthrough flags.
    numeric_features = [
        'lines_added', 'lines_deleted', 'files_changed', 'cyclomatic_complexity',
        'developer_experience', 'previous_vulnerabilities', 'change_size', 'risk_factor'
    ]
    cat_features = fitted_preprocessor.named_transformers_['cat'].get_feature_names_out(['language'])
    binary_features = [
        'has_input_validation', 'has_escape_functions', 'has_raw_queries',
        'has_string_concatenation', 'has_inner_html'
    ]
    feature_names = [*numeric_features, *cat_features, *binary_features]

    # Only tree-based classifiers expose feature_importances_
    if hasattr(best_model.named_steps['classifier'], 'feature_importances_'):
        importances = best_model.named_steps['classifier'].feature_importances_

        # Defensive alignment: keep only positions present in both sequences
        min_length = min(len(importances), len(feature_names))
        importances = importances[:min_length]
        feature_names_used = feature_names[:min_length]

        feature_importance_df = pd.DataFrame({
            'feature': feature_names_used,
            'importance': importances
        }).sort_values('importance', ascending=False).head(10)

        sns.barplot(data=feature_importance_df, x='importance', y='feature', ax=axes[1,1], palette='rocket')
        axes[1,1].set_title('Top 10 Caracter√≠sticas M√°s Importantes')
    else:
        axes[1,1].text(0.5, 0.5, 'Importancias no disponibles\npara este modelo',
                      ha='center', va='center', transform=axes[1,1].transAxes)
        axes[1,1].set_title('Importancia de Caracter√≠sticas')

except Exception as e:
    # Best effort: a failure here must not prevent the other three panels
    # from being shown, so the error is reported inside the fourth panel.
    print(f"‚ö†Ô∏è Error al generar importancia de caracter√≠sticas: {e}")
    axes[1,1].text(0.5, 0.5, f'Error: {str(e)}',
                  ha='center', va='center', transform=axes[1,1].transAxes)
    axes[1,1].set_title('Importancia de Caracter√≠sticas')

plt.tight_layout()
plt.show()

print("‚úÖ Visualizaciones completadas")

#### Análisis de Características Importantes

In [None]:
# Feature importance analysis: section banner
print("üîç AN√ÅLISIS DE CARACTER√çSTICAS IMPORTANTES")
print("=" * 50)

def get_feature_importance_analysis(model, preprocessor, feature_names, top_n=15):
    """
    Build a ranked table of feature importances for a fitted pipeline.

    Args:
        model: Fitted pipeline exposing ``named_steps['classifier']``.
        preprocessor: Fitted column preprocessor exposing
            ``named_transformers_['cat']`` (the one-hot language encoder).
        feature_names: Original input column names. Accepted for interface
            compatibility; the output names are rebuilt from the known
            column groups instead.
        top_n: Maximum number of rows to return.

    Returns:
        DataFrame with columns ``feature`` and ``importance``, sorted by
        descending importance with a 0-based rank index, or ``None`` when
        the classifier exposes no ``feature_importances_`` or the analysis
        fails (the reason is printed).
    """
    try:
        classifier = model.named_steps['classifier']

        # Only some estimators (e.g. tree ensembles) expose importances
        if not hasattr(classifier, 'feature_importances_'):
            print("‚ö†Ô∏è Este modelo no tiene importancias de caracter√≠sticas")
            return None

        importances = classifier.feature_importances_

        # Names are assembled in the preprocessor's output order.

        # 1. Numeric features keep their names
        numeric_features = [
            'lines_added', 'lines_deleted', 'files_changed', 'cyclomatic_complexity',
            'developer_experience', 'previous_vulnerabilities', 'change_size', 'risk_factor'
        ]

        # 2. One-hot encoded language columns
        categorical_transformer = preprocessor.named_transformers_['cat']
        if hasattr(categorical_transformer, 'get_feature_names_out'):
            cat_features = list(categorical_transformer.get_feature_names_out(['language']))
        else:
            # Manual fallback. The encoder orders categories alphabetically
            # and drop='first' removes the first of them, so the list must be
            # sorted before skipping its first entry.
            known_languages = sorted(
                ['JavaScript', 'Python', 'Java', 'C++', 'C', 'PHP', 'Ruby', 'Go', 'Rust']
            )
            cat_features = [f'language_{lang}' for lang in known_languages[1:]]

        # 3. Passthrough binary flags
        binary_features = [
            'has_input_validation', 'has_escape_functions', 'has_raw_queries',
            'has_string_concatenation', 'has_inner_html'
        ]

        final_feature_names = numeric_features + cat_features + binary_features

        # A length mismatch means names and importances may be misaligned;
        # keep the best-effort truncation but make it visible.
        min_length = min(len(importances), len(final_feature_names))
        if len(importances) != len(final_feature_names):
            print(
                f"Aviso: el modelo reporta {len(importances)} importancias pero se "
                f"construyeron {len(final_feature_names)} nombres; "
                f"se usan solo los primeros {min_length}"
            )

        importance_df = pd.DataFrame({
            'feature': final_feature_names[:min_length],
            'importance': importances[:min_length]
        }).sort_values('importance', ascending=False).head(top_n)

        # Re-index so that the index reflects the rank (0 = most important)
        return importance_df.reset_index(drop=True)

    except Exception as e:
        # Deliberate best effort: report the problem and let the caller skip
        print(f"‚ùå Error en an√°lisis de caracter√≠sticas: {e}")
        return None

# Collect feature importances for every trained model that exposes them,
# then plot and list the importances of the most accurate binary model.

def _collect_importance_tables(results):
    """Return {model name: importance table} for the models in `results`
    that expose importances, printing the top 5 features of each."""
    tables = {}
    for model_name, result in results.items():
        print(f"\nüîπ Analizando {model_name}...")

        importance_df = get_feature_importance_analysis(
            result['model'],
            preprocessor,
            list(X.columns),
            top_n=10
        )

        if importance_df is None:
            print(f"‚ö†Ô∏è No se pudieron obtener importancias para {model_name}")
            continue

        tables[model_name] = importance_df
        print(f"‚úÖ Top 5 caracter√≠sticas m√°s importantes:")
        for _, row in importance_df.head().iterrows():
            print(f"   {row['feature']}: {row['importance']:.4f}")
    return tables


print("üìä IMPORTANCIA DE CARACTER√çSTICAS POR MODELO")

# Binary models
print("\nüîí MODELOS BINARIOS:")
binary_importance_results = _collect_importance_tables(binary_results)

# Multiclass models
print("\nüéØ MODELOS MULTICLASE:")
multi_importance_results = _collect_importance_tables(multi_results)

# Importance plots
print("\nüìà VISUALIZANDO IMPORTANCIAS...")

# The detailed view uses the binary model with the highest test accuracy
if binary_importance_results:
    best_binary_model_name = max(binary_results.items(),
                                key=lambda x: x[1]['accuracy'])[0]

    if best_binary_model_name in binary_importance_results:
        best_importance_df = binary_importance_results[best_binary_model_name]

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

        # Horizontal bar chart of the retained importances
        sns.barplot(data=best_importance_df, x='importance', y='feature',
                   ax=ax1, palette='viridis')
        ax1.set_title(f'Importancia de Caracter√≠sticas - {best_binary_model_name}\n(Clasificaci√≥n Binaria)')
        ax1.set_xlabel('Importancia')
        ax1.set_ylabel('Caracter√≠stica')

        # Pie chart of the five most important features
        top_5 = best_importance_df.head(5)
        colors = plt.cm.Set3(np.linspace(0, 1, len(top_5)))
        ax2.pie(top_5['importance'], labels=top_5['feature'], autopct='%1.1f%%',
               colors=colors, startangle=90)
        ax2.set_title('Distribuci√≥n de Importancia\n(Top 5 Caracter√≠sticas)')

        plt.tight_layout()
        plt.show()

        # Detailed ranked listing
        print(f"\nüìã AN√ÅLISIS DETALLADO - {best_binary_model_name}:")
        print("=" * 50)

        # Percentages are relative to the rows kept in the table (top 10),
        # not to the importance of all features.
        importance_total = best_importance_df['importance'].sum()

        # Number the rows by their position in the sorted table; the
        # DataFrame index is not guaranteed to be the rank.
        for rank, (_, row) in enumerate(best_importance_df.iterrows(), start=1):
            importance_percent = (row['importance'] / importance_total) * 100
            print(f"#{rank:2d} {row['feature']:30} {row['importance']:.4f} ({importance_percent:5.1f}%)")

    else:
        print("‚ö†Ô∏è No hay datos de importancia para el mejor modelo binario")
else:
    print("‚ö†Ô∏è No hay resultados de importancia disponibles")

In [None]:
# Analysis by feature category: section banner
print("üéØ AN√ÅLISIS POR TIPO DE CARACTER√çSTICA")
print("=" * 40)

def analyze_feature_categories(importance_df):
    """Analiza las caracter√≠sticas por categor√≠as"""
    if importance_df is None:
        return
    
    # Definir categor√≠as
    categories = {
        'Complejidad y Tama√±o': ['cyclomatic_complexity', 'change_size', 'files_changed', 
                                'lines_added', 'lines_deleted'],
        'Historial Desarrollador': ['developer_experience', 'previous_vulnerabilities', 'risk_factor'],
        'Pr√°cticas Seguras': ['has_input_validation', 'has_escape_functions'],
        'Patrones Riesgosos': ['has_raw_queries', 'has_string_concatenation', 'has_inner_html'],
        'Lenguaje': [col for col in importance_df['feature'] if col.startswith('language_')]
    }
    
    category_importance = {}
    
    for category, features in categories.items():
        category_features = importance_df[importance_df['feature'].isin(features)]
        total_importance = category_features['importance'].sum()
        category_importance[category] = {
            'total_importance': total_importance,
            'feature_count': len(category_features),
            'features': category_features.to_dict('records')
        }
    
    return category_importance

# Report and plot the per-category importance of the best binary model.
# The textual report and the charts share the same guard and the same
# analysis result, so they are produced inside a single conditional.
if binary_importance_results and best_binary_model_name in binary_importance_results:
    best_importance_df = binary_importance_results[best_binary_model_name]
    category_analysis = analyze_feature_categories(best_importance_df)

    if category_analysis:
        # ---- Text report ----
        print(f"\nüìä DISTRIBUCI√ìN POR CATEGOR√çAS - {best_binary_model_name}:")

        # Denominator for every percentage below: importance of all features.
        total_importance = best_importance_df['importance'].sum()

        for category, data in category_analysis.items():
            percentage = (data['total_importance'] / total_importance) * 100
            print(f"\nüîπ {category}:")
            print(f"   Importancia total: {data['total_importance']:.4f} ({percentage:.1f}%)")
            print(f"   N√∫mero de caracter√≠sticas: {data['feature_count']}")

            if not data['features']:
                continue
            print("   Caracter√≠sticas incluidas:")
            for feature in data['features']:
                feat_percentage = (feature['importance'] / total_importance) * 100
                print(f"     - {feature['feature']}: {feature['importance']:.4f} ({feat_percentage:.1f}%)")

        # ---- Charts ----
        categories = list(category_analysis.keys())
        importances = [entry['total_importance'] for entry in category_analysis.values()]
        percentages = [(imp / total_importance) * 100 for imp in importances]

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Bar chart: one bar per category.
        bar_colors = plt.cm.Pastel1(range(len(categories)))
        bars = ax1.bar(categories, importances, color=bar_colors)
        ax1.set_title('Importancia por Categor√≠a de Caracter√≠stica')
        ax1.set_ylabel('Importancia Total')
        ax1.tick_params(axis='x', rotation=45)

        # Write each category's percentage just above its bar.
        for bar, percentage in zip(bars, percentages):
            height = bar.get_height()
            ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                     f'{percentage:.1f}%', ha='center', va='bottom')

        # Pie chart: share of importance per category.
        pie_colors = plt.cm.Set3(np.linspace(0, 1, len(categories)))
        ax2.pie(importances, labels=categories, autopct='%1.1f%%', startangle=90,
                colors=pie_colors)
        ax2.set_title('Distribuci√≥n de Importancia por Categor√≠a')

        plt.tight_layout()
        plt.show()

In [None]:
# Feature-category analysis
print("üéØ AN√ÅLISIS POR TIPO DE CARACTER√çSTICA")
print("=" * 40)

def analyze_feature_categories(importance_df):
    """Aggregate feature importances into named feature categories.

    Args:
        importance_df: DataFrame with a 'feature' column and an 'importance'
            column, or None.

    Returns:
        None when ``importance_df`` is None. Otherwise a dict keyed by
        category name whose values are dicts with 'total_importance' (sum of
        the matching rows), 'feature_count' (number of matching rows) and
        'features' (the matching rows as a list of records).
    """
    if importance_df is None:
        return None

    feature_names = importance_df['feature']

    # Language features are discovered dynamically by their name prefix;
    # every other category is a fixed list of feature names.
    language_features = [name for name in feature_names if name.startswith('language_')]
    groups = {
        'Complejidad y Tama√±o': ['cyclomatic_complexity', 'change_size', 'files_changed',
                                'lines_added', 'lines_deleted'],
        'Historial Desarrollador': ['developer_experience', 'previous_vulnerabilities', 'risk_factor'],
        'Pr√°cticas Seguras': ['has_input_validation', 'has_escape_functions'],
        'Patrones Riesgosos': ['has_raw_queries', 'has_string_concatenation', 'has_inner_html'],
        'Lenguaje': language_features,
    }

    summary = {}
    for label, members in groups.items():
        # Rows of the importance table that belong to this category.
        subset = importance_df[feature_names.isin(members)]
        summary[label] = dict(
            total_importance=subset['importance'].sum(),
            feature_count=len(subset),
            features=subset.to_dict('records'),
        )

    return summary

# Report and plot the per-category importance of the best binary model.
# The textual report and the charts share the same guard and the same
# analysis result, so they are produced inside a single conditional.
if binary_importance_results and best_binary_model_name in binary_importance_results:
    best_importance_df = binary_importance_results[best_binary_model_name]
    category_analysis = analyze_feature_categories(best_importance_df)

    if category_analysis:
        # ---- Text report ----
        print(f"\nüìä DISTRIBUCI√ìN POR CATEGOR√çAS - {best_binary_model_name}:")

        # Denominator for every percentage below: importance of all features.
        total_importance = best_importance_df['importance'].sum()

        for category, data in category_analysis.items():
            percentage = (data['total_importance'] / total_importance) * 100
            print(f"\nüîπ {category}:")
            print(f"   Importancia total: {data['total_importance']:.4f} ({percentage:.1f}%)")
            print(f"   N√∫mero de caracter√≠sticas: {data['feature_count']}")

            if not data['features']:
                continue
            print("   Caracter√≠sticas incluidas:")
            for feature in data['features']:
                feat_percentage = (feature['importance'] / total_importance) * 100
                print(f"     - {feature['feature']}: {feature['importance']:.4f} ({feat_percentage:.1f}%)")

        # ---- Charts ----
        categories = list(category_analysis.keys())
        importances = [entry['total_importance'] for entry in category_analysis.values()]
        percentages = [(imp / total_importance) * 100 for imp in importances]

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Bar chart: one bar per category.
        bar_colors = plt.cm.Pastel1(range(len(categories)))
        bars = ax1.bar(categories, importances, color=bar_colors)
        ax1.set_title('Importancia por Categor√≠a de Caracter√≠stica')
        ax1.set_ylabel('Importancia Total')
        ax1.tick_params(axis='x', rotation=45)

        # Write each category's percentage just above its bar.
        for bar, percentage in zip(bars, percentages):
            height = bar.get_height()
            ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                     f'{percentage:.1f}%', ha='center', va='bottom')

        # Pie chart: share of importance per category.
        pie_colors = plt.cm.Set3(np.linspace(0, 1, len(categories)))
        ax2.pie(importances, labels=categories, autopct='%1.1f%%', startangle=90,
                colors=pie_colors)
        ax2.set_title('Distribuci√≥n de Importancia por Categor√≠a')

        plt.tight_layout()
        plt.show()

---
# Sistema de Predicción
#### Función de Predicción para Nuevos Commits

In [None]:
def predict_commit_security(binary_model, multi_model, label_encoder, commit_data):
    """Predict whether a commit is vulnerable and which vulnerability type it has.

    Args:
        binary_model: Fitted binary classifier exposing ``predict`` and
            ``predict_proba``; column 1 of ``predict_proba`` is read as the
            probability of the vulnerable class.
        multi_model: Fitted multiclass classifier exposing ``predict`` and
            ``predict_proba``. If it exposes ``classes_``, that attribute is
            used to map probability columns to encoded labels.
        label_encoder: Encoder whose ``inverse_transform`` maps encoded
            labels back to vulnerability-type names.
        commit_data: Dict of commit features; it becomes a one-row DataFrame.

    Returns:
        dict with keys 'is_vulnerable' (bool), 'vulnerability_probability'
        (float), 'vulnerability_type' (decoded label),
        'vulnerability_type_confidence' (float probability of the predicted
        type), 'risk_level' ('ALTO' above 0.7, 'MEDIO' above 0.3, otherwise
        'BAJO') and 'all_probabilities' (decoded label -> float).
    """
    # The models are fed a single-row DataFrame built from the feature dict.
    new_commit_df = pd.DataFrame([commit_data])

    # Binary prediction.
    is_vulnerable_pred = binary_model.predict(new_commit_df)[0]
    vulnerability_prob = binary_model.predict_proba(new_commit_df)[0, 1]

    # Multiclass prediction.
    vulnerability_type_encoded = multi_model.predict(new_commit_df)[0]
    vulnerability_type_proba = multi_model.predict_proba(new_commit_df)[0]
    vulnerability_type_name = label_encoder.inverse_transform([vulnerability_type_encoded])[0]

    # scikit-learn orders predict_proba columns like ``classes_``, which is
    # not necessarily 0..n-1 (e.g. a class missing from the training split).
    # Map columns through ``classes_`` instead of assuming column == label;
    # fall back to positional labels when the model does not expose it.
    class_codes = getattr(multi_model, 'classes_', None)
    if class_codes is None:
        class_codes = range(len(vulnerability_type_proba))
    class_codes = list(class_codes)
    column_of = {code: column for column, code in enumerate(class_codes)}

    # Probability of the predicted class (positional fallback keeps the
    # previous behaviour if the label is somehow absent from ``classes_``).
    predicted_column = column_of.get(vulnerability_type_encoded, vulnerability_type_encoded)
    max_prob = vulnerability_type_proba[predicted_column]

    # Decode every class label in one call instead of one call per class.
    class_names = label_encoder.inverse_transform(class_codes)

    if vulnerability_prob > 0.7:
        risk_level = 'ALTO'
    elif vulnerability_prob > 0.3:
        risk_level = 'MEDIO'
    else:
        risk_level = 'BAJO'

    return {
        'is_vulnerable': bool(is_vulnerable_pred),
        'vulnerability_probability': float(vulnerability_prob),
        'vulnerability_type': vulnerability_type_name,
        'vulnerability_type_confidence': float(max_prob),
        'risk_level': risk_level,
        'all_probabilities': {
            name: float(prob)
            for name, prob in zip(class_names, vulnerability_type_proba)
        },
    }

# Usage example: score one hand-written commit with the best models.
print("üéØ Ejemplo de predicci√≥n para nuevo commit:")

example_commit = dict(
    language='Python',
    lines_added=45,
    lines_deleted=12,
    files_changed=2,
    cyclomatic_complexity=18,
    has_input_validation=0,
    has_escape_functions=0,
    has_raw_queries=1,
    has_string_concatenation=1,
    has_inner_html=0,
    developer_experience=1.5,
    previous_vulnerabilities=2,
    change_size=57,
    risk_factor=0.4,
)

# Look up the best models; the multiclass entry also carries its encoder.
best_multi_entry = multi_results[best_multi_name]
best_binary_model = binary_results[best_binary_name]['model']
best_multi_model = best_multi_entry['model']
best_label_encoder = best_multi_entry['label_encoder']

prediction = predict_commit_security(
    best_binary_model, best_multi_model, best_label_encoder, example_commit
)

# Headline results.
print("üîç Predicci√≥n Completa para Nuevo Commit:")
for report_line in (
    f"  ‚ö†Ô∏è  Es vulnerable: {prediction['is_vulnerable']}",
    f"  üìä Probabilidad de vulnerabilidad: {prediction['vulnerability_probability']:.2%}",
    f"  üéØ Tipo de vulnerabilidad: {prediction['vulnerability_type']}",
    f"  ‚úÖ Confianza en el tipo: {prediction['vulnerability_type_confidence']:.2%}",
    f"  üö¶ Nivel de riesgo: {prediction['risk_level']}",
):
    print(report_line)

# Five most probable vulnerability types, highest probability first.
print("\nüìà Probabilidades por tipo de vulnerabilidad:")
ranked_probabilities = sorted(
    prediction['all_probabilities'].items(), key=lambda x: x[1], reverse=True
)
for vuln_type, prob in ranked_probabilities[:5]:
    print(f"  {vuln_type}: {prob:.2%}")

# Guardado de Modelos y Resultados
#### Persistencia de Modelos

In [None]:
# Persist the dataset, every trained model and an evaluation summary.
print("üíæ Guardando modelos y resultados...")

import os
# Make sure the output directory exists before writing into it.
os.makedirs('models', exist_ok=True)

# Dataset
df.to_csv('models/github_commits_vulnerability_dataset.csv', index=False)

# Binary models: one pickle per model, named after the model.
for name, result in binary_results.items():
    filename = f'models/binary_{name.lower().replace(" ", "_")}.pkl'
    joblib.dump(result['model'], filename)

# Multiclass models are stored together with their own label encoder.
for name, result in multi_results.items():
    filename = f'models/multi_{name.lower().replace(" ", "_")}.pkl'
    model_data = {key: result[key] for key in ('model', 'label_encoder')}
    joblib.dump(model_data, filename)

# Shared preprocessor and label encoder.
joblib.dump(preprocessor, 'models/feature_preprocessor.pkl')
joblib.dump(label_encoder, 'models/label_encoder.pkl')

# Build the evaluation summary.
print("üìä Preparando resumen de resultados...")

dataset_info = {
    'shape': df.shape,
    'vulnerable_ratio': float(df['is_vulnerable'].mean()),
    'languages': int(df['language'].nunique()),
    'vulnerability_types': int(df['vulnerability_type'].nunique()),
}
results_summary = {
    'binary_results': {model_name: res['accuracy'] for model_name, res in binary_results.items()},
    'multi_results': {model_name: res['accuracy'] for model_name, res in multi_results.items()},
    'dataset_info': dataset_info,
}

# Feature importances are optional: the analysis cell may not have been run,
# so any failure here falls back to an empty entry instead of aborting.
try:
    if not ('binary_importance_results' in globals() and binary_importance_results):
        results_summary['feature_importance'] = {}
        print("‚ö†Ô∏è An√°lisis de importancia no disponible")
    else:
        best_binary_model_name = max(binary_results.items(), key=lambda x: x[1]['accuracy'])[0]
        if best_binary_model_name not in binary_importance_results:
            results_summary['feature_importance'] = {}
            print("‚ö†Ô∏è No se pudo obtener importancia del mejor modelo binario")
        else:
            feature_importance_data = binary_importance_results[best_binary_model_name].to_dict()
            results_summary['feature_importance'] = feature_importance_data
            print("‚úÖ Importancia de caracter√≠sticas incluida en el resumen")

except Exception as e:
    print(f"‚ö†Ô∏è Error al guardar importancia de caracter√≠sticas: {e}")
    results_summary['feature_importance'] = {}

# Record which model won each task, again on a best-effort basis.
try:
    best_binary_name = max(binary_results, key=lambda name: binary_results[name]['accuracy'])
    best_multi_name = max(multi_results, key=lambda name: multi_results[name]['accuracy'])

    results_summary['best_models'] = dict(
        binary=best_binary_name,
        multiclass=best_multi_name,
        binary_accuracy=float(binary_results[best_binary_name]['accuracy']),
        multiclass_accuracy=float(multi_results[best_multi_name]['accuracy']),
    )

    print("‚úÖ Informaci√≥n de mejores modelos incluida")

except Exception as e:
    print(f"‚ö†Ô∏è Error al guardar informaci√≥n de mejores modelos: {e}")
    results_summary['best_models'] = {}

# Save the summary as a pickle.
joblib.dump(results_summary, 'models/results_summary.pkl')

# Tambi√©n guardar como JSON para f√°cil lectura
import json

def convert_to_serializable(obj):
    """Recursively convert ``obj`` into a structure ``json.dump`` can write.

    Args:
        obj: Any value. Native JSON scalars (None, bool, int, float, str)
            pass through unchanged; NumPy scalars become the matching Python
            scalar; NumPy arrays become lists; pandas DataFrame/Series become
            dicts; dicts, lists and tuples are converted element by element.

    Returns:
        The converted value. Anything not covered above is returned as
        ``str(obj)``.
    """
    # Native scalars are already valid JSON; returning them untouched keeps
    # numbers as numbers instead of stringifying them.
    if obj is None or isinstance(obj, (bool, str)):
        return obj
    if isinstance(obj, np.bool_):
        return bool(obj)
    # Abstract NumPy scalar types cover every width and avoid aliases such as
    # ``np.float_`` that no longer exist in NumPy 2.0.
    if isinstance(obj, (int, np.integer)):
        return int(obj)
    if isinstance(obj, (float, np.floating)):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return convert_to_serializable(obj.tolist())
    if isinstance(obj, (pd.DataFrame, pd.Series)):
        # Recurse so that nested values and keys are converted as well.
        return convert_to_serializable(obj.to_dict())
    if isinstance(obj, dict):
        return {
            # json only accepts str/int/float/bool/None keys; stringify the rest.
            (key if key is None or isinstance(key, (str, int, float, bool)) else str(key)):
                convert_to_serializable(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [convert_to_serializable(item) for item in obj]
    return str(obj)

# Write the JSON copy of the summary; a failure is reported, not raised.
try:
    with open('models/results_summary.json', 'w') as json_file:
        json.dump(convert_to_serializable(results_summary), json_file, indent=2)
    print("‚úÖ Resumen guardado en formato JSON")
except Exception as e:
    print(f"‚ö†Ô∏è Error al guardar JSON: {e}")

# List what was written.
for saved_line in (
    "\n‚úÖ Modelos y resultados guardados exitosamente:",
    "   üìÅ Todos los archivos guardados en la carpeta 'models/'",
    "   üîí Modelos binarios: binary_*.pkl",
    "   üéØ Modelos multiclase: multi_*.pkl",
    "   ‚öôÔ∏è  Preprocesador: feature_preprocessor.pkl",
    "   üè∑Ô∏è  Label encoder: label_encoder.pkl",
    "   üìä Resumen: results_summary.pkl y results_summary.json",
    "   üìà Dataset: github_commits_vulnerability_dataset.csv",
):
    print(saved_line)

# Short recap of the saved models; 'N/A' when no best model was recorded.
saved_best_models = results_summary.get('best_models', {})
print(f"\nüìã RESUMEN DE MODELOS GUARDADOS:")
print(f"   Modelos binarios: {len(binary_results)}")
print(f"   Modelos multiclase: {len(multi_results)}")
print(f"   Mejor modelo binario: {saved_best_models.get('binary', 'N/A')}")
print(f"   Mejor modelo multiclase: {saved_best_models.get('multiclass', 'N/A')}")

In [None]:
# Check that every expected artifact was written to disk.
print("üîç VERIFICANDO ARCHIVOS GUARDADOS...")
print("=" * 40)

import glob

def verify_saved_files():
    """Report which of the expected files under ``models/`` exist.

    The expected set is five fixed files plus one pickle per entry of the
    module-level ``binary_results`` and ``multi_results`` dicts. A status
    line is printed per file, followed by overall counts.

    Returns:
        tuple: ``(existing_files, missing_files)`` where ``existing_files``
        is a list of ``(path, size_in_bytes)`` tuples and ``missing_files``
        is a list of paths, both in the order the files were checked.
    """
    def _slug(model_name):
        # Same naming rule used when the models were saved.
        return model_name.lower().replace(" ", "_")

    expected_files = [
        'models/github_commits_vulnerability_dataset.csv',
        'models/feature_preprocessor.pkl',
        'models/label_encoder.pkl',
        'models/results_summary.pkl',
        'models/results_summary.json',
    ]
    expected_files += [f'models/binary_{_slug(name)}.pkl' for name in binary_results]
    expected_files += [f'models/multi_{_slug(name)}.pkl' for name in multi_results]

    print("üìã Archivos esperados:")
    existing_files, missing_files = [], []

    for file_path in expected_files:
        if not os.path.exists(file_path):
            missing_files.append(file_path)
            print(f"   ‚ùå {file_path} - NO ENCONTRADO")
            continue
        file_size = os.path.getsize(file_path)
        existing_files.append((file_path, file_size))
        print(f"   ‚úÖ {file_path} ({file_size} bytes)")

    print(f"\nüìä ESTAD√çSTICAS:")
    print(f"   Total de archivos esperados: {len(expected_files)}")
    print(f"   Archivos encontrados: {len(existing_files)}")
    print(f"   Archivos faltantes: {len(missing_files)}")

    if missing_files:
        print(f"\n‚ö†Ô∏è ARCHIVOS FALTANTES:")
        for missing in missing_files:
            print(f"   - {missing}")

    return existing_files, missing_files

# Run the verification and print a one-line verdict.
existing_files, missing_files = verify_saved_files()

if missing_files:
    print(f"\n‚ö†Ô∏è Hay {len(missing_files)} archivos faltantes. Considera regenerarlos.")
else:
    print("\nüéâ ¬°Todos los archivos se guardaron correctamente!")

#### Función para Cargar y Usar Modelos

In [None]:
# Load the saved artifacts back and run a smoke-test prediction.
print("üîÑ CARGANDO Y VERIFICANDO MODELOS GUARDADOS...")
print("=" * 50)

def load_and_test_models():
    """Reload the saved artifacts from ``models/`` and run one test prediction.

    Loads the results summary, the preprocessor, the label encoder and the
    best binary and multiclass models named in the summary, then calls
    ``predict_commit_security`` on a hard-coded example commit and prints the
    outcome.

    Returns:
        bool: True when every step succeeded; False when any exception was
        raised (the error message is printed).
    """
    def _artifact_path(prefix, model_name):
        # Mirrors the file-naming rule used when the models were saved.
        return f'models/{prefix}_{model_name.lower().replace(" ", "_")}.pkl'

    try:
        summary = joblib.load('models/results_summary.pkl')
        print("‚úÖ Resumen de resultados cargado")

        # The next two artifacts are loaded only to confirm they can be
        # read back; their contents are not used further in this function.
        joblib.load('models/feature_preprocessor.pkl')
        print("‚úÖ Preprocesador cargado")

        joblib.load('models/label_encoder.pkl')
        print("‚úÖ Label encoder cargado")

        # Best binary model, as recorded in the summary.
        binary_name = summary['best_models']['binary']
        binary_model = joblib.load(_artifact_path('binary', binary_name))
        print(f"‚úÖ Mejor modelo binario cargado: {binary_name}")

        # Best multiclass model; its pickle bundles model and label encoder.
        multi_name = summary['best_models']['multiclass']
        multi_bundle = joblib.load(_artifact_path('multi', multi_name))
        multi_model = multi_bundle['model']
        multi_encoder = multi_bundle['label_encoder']
        print(f"‚úÖ Mejor modelo multiclase cargado: {multi_name}")

        # Hard-coded sample commit for the smoke test.
        sample_commit = dict(
            language='Python',
            lines_added=45,
            lines_deleted=12,
            files_changed=2,
            cyclomatic_complexity=18,
            has_input_validation=0,
            has_escape_functions=0,
            has_raw_queries=1,
            has_string_concatenation=1,
            has_inner_html=0,
            developer_experience=1.5,
            previous_vulnerabilities=2,
            change_size=57,
            risk_factor=0.4,
        )

        outcome = predict_commit_security(
            binary_model, multi_model, multi_encoder, sample_commit
        )

        print(f"\nüéØ PREDICCI√ìN DE PRUEBA:")
        print(f"   Es vulnerable: {outcome['is_vulnerable']}")
        print(f"   Tipo: {outcome['vulnerability_type']}")
        print(f"   Confianza: {outcome['vulnerability_type_confidence']:.2%}")
        print(f"   Nivel de riesgo: {outcome['risk_level']}")

        return True

    except Exception as e:
        print(f"‚ùå Error al cargar modelos: {e}")
        return False

# Run the load test and print a one-line verdict.
load_success = load_and_test_models()

if not load_success:
    print("\n‚ö†Ô∏è Hubo problemas al cargar los modelos")
else:
    print("\nüéâ ¬°Todos los modelos se cargaron y probaron exitosamente!")

***Estructura de Entrada para Predicciones***

In [None]:
# Template of the feature dict used as prediction input.
nuevo_commit = dict(
    language='Python',  # JavaScript, Java, C++, etc.
    # Size of the change
    lines_added=50,
    lines_deleted=10,
    files_changed=2,
    cyclomatic_complexity=15,
    # Binary flags: 1 = yes, 0 = no
    has_input_validation=1,
    has_escape_functions=1,
    has_raw_queries=0,
    has_string_concatenation=0,
    has_inner_html=0,
    # Developer history and derived features
    developer_experience=2.5,
    previous_vulnerabilities=1,
    change_size=60,
    risk_factor=0.3,
)

# RESUMEN FINAL DEL PROYECTO

In [None]:
# Final project summary
print("üéâ PROYECTO COMPLETADO EXITOSAMENTE!")
print("=" * 50)

def print_project_summary():
    """Print an executive summary of the project to stdout.

    Reads ``models/results_summary.pkl`` and prints dataset statistics, the
    best models and the accuracy of each binary model. If that fails for any
    reason, the error is printed and basic statistics are taken from the
    module-level ``df`` instead. Afterwards the files found under ``models/``
    (.pkl, .csv, .json) are listed with their sizes, followed by fixed
    "next steps" and "production usage" notes.
    """
    try:
        summary = joblib.load('models/results_summary.pkl')

        print("\nüìä RESUMEN EJECUTIVO:")
        info = summary['dataset_info']
        print(f"üìç Dataset: {info['shape'][0]:,} commits")
        print(f"üîí Proporci√≥n de vulnerabilidades: {info['vulnerable_ratio']:.2%}")
        print(f"üåç Lenguajes analizados: {info['languages']}")
        print(f"üéØ Tipos de vulnerabilidad: {info['vulnerability_types']}")

        print(f"\nüèÜ MEJORES MODELOS:")
        winners = summary.get('best_models', {})
        if winners:
            print(f"üîí Clasificaci√≥n Binaria: {winners.get('binary', 'N/A')} - {winners.get('binary_accuracy', 0):.2%}")
            print(f"üéØ Clasificaci√≥n Multiclase: {winners.get('multiclass', 'N/A')} - {winners.get('multiclass_accuracy', 0):.2%}")

        print(f"\nüìà RENDIMIENTO DE MODELOS BINARIOS:")
        for model_name, score in summary['binary_results'].items():
            print(f"   {model_name}: {score:.4f}")

    except Exception as e:
        print(f"‚ö†Ô∏è No se pudo cargar el resumen: {e}")
        # Fall back to basic figures computed from the in-memory dataset.
        print(f"\nüìä DATASET:")
        print(f"   Filas: {df.shape[0]:,}")
        print(f"   Columnas: {df.shape[1]}")
        print(f"   Vulnerabilidades: {df['is_vulnerable'].mean():.2%}")

    # Generated artifacts, grouped by extension in the order pkl, csv, json.
    print(f"\nüíæ ARCHIVOS GENERADOS:")
    artifact_paths = [
        path
        for pattern in ('models/*.pkl', 'models/*.csv', 'models/*.json')
        for path in glob.glob(pattern)
    ]
    for path in artifact_paths:
        print(f"   {os.path.basename(path)} ({os.path.getsize(path)} bytes)")

    print(f"\nüöÄ PR√ìXIMOS PASOS RECOMENDADOS:")
    for step in (
        "   1. Integrar con APIs reales de GitHub",
        "   2. Implementar en pipeline CI/CD",
        "   3. Expandir a m√°s tipos de vulnerabilidades",
        "   4. Recopilar dataset con commits reales",
        "   5. Implementar sistema de alertas tempranas",
    ):
        print(step)

    print(f"\nüéØ USO EN PRODUCCI√ìN:")
    for usage_line in (
        "   from models import load_and_predict",
        "   resultado = load_and_predict(nuevo_commit)",
        "   if resultado['risk_level'] == 'ALTO':",
        "       print('üö® Revisi√≥n de seguridad requerida')",
    ):
        print(usage_line)

# Run the summary
print_project_summary()