## Proyecto: Predicción de atraso de vuelos ##

### Entrenamiento, evaluación y exportación ###

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

# 1. Load the dataset that already contains the engineered features.
df = pd.read_csv("flights_features.csv")

# Split into target and predictors: 'delayed' is the label to predict, and
# 'date' is removed from the predictors together with it.
y = df['delayed']
X = df.drop(columns=['delayed', 'date'])

# 2. Columns to be one-hot encoded; every remaining predictor column is
#    treated as numeric (assumed, not checked) and forwarded unchanged.
categorical_cols = ['UniqueCarrier', 'origin', 'dest', 'day_of_week', 'part_of_day']
numeric_cols = [name for name in X.columns if name not in categorical_cols]

# 3. Preprocessing: one-hot encode the categorical columns and pass the rest
#    through. handle_unknown='ignore' keeps transform from failing on
#    categories that were not present when the encoder was fitted.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', 'passthrough', numeric_cols),
])

# 4. Preprocessing + random forest in a single estimator, so the same
#    encoding is applied at fit and predict time.
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=50,    # fewer trees -> faster training
        max_depth=10,       # cap tree depth to limit overfitting
        n_jobs=-1,          # use all available cores
        random_state=42,    # reproducible forest
    )),
])

# 5. Hold out 30% of the rows for testing; stratify on the target so both
#    splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# 6. Fit the whole pipeline (encoder + forest) on the training split.
rf_pipeline.fit(X_train, y_train)

# 7. Score the held-out split with the standard classification metrics.
y_pred = rf_pipeline.predict(X_test)

for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(label, metric(y_test, y_pred))

# 8. Cross-validation on a random subsample only, to keep it fast.
#    The sample size is capped at the number of rows available:
#    DataFrame.sample raises ValueError when asked for more rows than the
#    frame holds (sampling is without replacement), which would crash this
#    step on datasets smaller than 20000 rows.
df_sample = df.sample(n=min(20000, len(df)), random_state=42)
X_sample = df_sample.drop(['delayed','date'], axis=1)
y_sample = df_sample['delayed']

# 3-fold cross-validation scored with F1; report the mean across folds.
scores = cross_val_score(rf_pipeline, X_sample, y_sample, cv=3, scoring='f1')
print("F1 promedio (CV, muestra):", scores.mean())

# 9. Persist the complete pipeline (preprocessing + classifier) as a single
#    artifact, written with compression level 3.
joblib.dump(value=rf_pipeline, filename="modelo_rf_pipeline.pkl", compress=3)

Accuracy: 0.8528871391076116
Precision: 0.9094508171759212
Recall: 0.7838145231846019
F1-score: 0.8419717118556459
F1 promedio (CV, muestra): 0.8389752712693989


['modelo_rf_pipeline.pkl']