In [44]:
pip install matplotlib seaborn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

In [62]:
df_model = pd.read_csv("../data/df_reducido.csv.gz")

In [63]:
TARGET = "Delayed"

CAT_COLS = [
    "Marketing_Airline_Network",
    "OriginCityName",
    "DestCityName"
]

CYCLIC_COLS = {
    "DayofWeek": 7,
    "Month": 12,
    "CRSDepTime": 1440
}

NUM_COLS = ["Distance"]
BOOL_COLS = ["Holidays"]

In [64]:
def add_cyclic_features(df: pd.DataFrame, cyclic_cols: dict) -> pd.DataFrame:
    df = df.copy()

    for col, period in cyclic_cols.items():
        df[f"{col}_sin"] = np.sin(2 * np.pi * df[col] / period)
        df[f"{col}_cos"] = np.cos(2 * np.pi * df[col] / period)

    df = df.drop(columns=list(cyclic_cols.keys()))
    return df

In [65]:
from sklearn.base import BaseEstimator, TransformerMixin

class CyclicEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cyclic_cols: dict):
        self.cyclic_cols = cyclic_cols

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return add_cyclic_features(X, self.cyclic_cols)

In [6]:
"""
df_work = df_model.copy()

df_work = add_cyclic_features(df_work, CYCLIC_COLS)
"""

In [66]:
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7603890 entries, 0 to 7603889
Data columns (total 9 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   Marketing_Airline_Network  object 
 1   OriginCityName             object 
 2   DestCityName               object 
 3   Distance                   float64
 4   Month                      int64  
 5   DayofWeek                  int64  
 6   Holidays                   bool   
 7   Delayed                    int64  
 8   CRSDepTime                 int64  
dtypes: bool(1), float64(1), int64(4), object(3)
memory usage: 471.4+ MB


In [67]:
from sklearn.model_selection import train_test_split

df_work, _ = train_test_split(
    df_model,
    train_size=500_000,
    stratify=df_model[TARGET],
    random_state=42
)

In [68]:
X = df_work.drop(columns=[TARGET])
y = df_work[TARGET]

In [69]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

In [70]:
pip install category-encoders

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [71]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

In [72]:
class TargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols, smoothing=20):
        self.cols = cols
        self.smoothing = smoothing

    def fit(self, X, y):
        X = X.copy()
        y = y.copy()

        self.global_mean_ = y.mean()
        self.encoding_ = {}

        for col in self.cols:
            stats = (
                pd.concat([X[col], y], axis=1)
                .groupby(col)[y.name]
                .agg(["mean", "count"])
            )

            smooth = (
                (stats["count"] * stats["mean"] +
                 self.smoothing * self.global_mean_)
                / (stats["count"] + self.smoothing)
            )

            self.encoding_[col] = smooth

        return self

    def transform(self, X):
        X = X.copy()

        for col in self.cols:
            X[col] = (
                X[col]
                .map(self.encoding_[col])
                .fillna(self.global_mean_)
            )

        return X

In [73]:
te = TargetEncoder(cols=CAT_COLS, smoothing=20)
X_train_te = te.fit_transform(X_train, y_train)
X_test_te = te.transform(X_test)

X_train_te.head()

Unnamed: 0,Marketing_Airline_Network,OriginCityName,DestCityName,Distance,Month,DayofWeek,Holidays,CRSDepTime
944685,0.374763,0.210402,0.310216,1747.0,10,3,False,1168
5420614,0.237713,0.253779,0.239953,447.0,9,0,False,677
2827385,0.277714,0.251582,0.310216,2717.0,4,5,False,450
399507,0.222706,0.341195,0.310163,937.0,1,3,False,525
533431,0.237713,0.284075,0.247255,868.0,10,6,False,995


In [74]:
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report

dummy_pipeline = Pipeline(
    steps=[
        ("cyclic", CyclicEncoder(CYCLIC_COLS)),
        ("target_enc", TargetEncoder(cols=CAT_COLS, smoothing=20)),
        ("model", DummyClassifier(strategy="most_frequent"))
    ]
)

dummy_pipeline.fit(X_train, y_train)

y_pred_dummy = dummy_pipeline.predict(X_test)

print("=== Dummy Classifier (Pipeline) ===")
print(classification_report(y_test, y_pred_dummy))

=== Dummy Classifier (Pipeline) ===
              precision    recall  f1-score   support

           0       0.76      1.00      0.86     75521
           1       0.00      0.00      0.00     24479

    accuracy                           0.76    100000
   macro avg       0.38      0.50      0.43    100000
weighted avg       0.57      0.76      0.65    100000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [75]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import numpy as np

rf_pipeline = Pipeline(
    steps=[
        ("cyclic", CyclicEncoder(CYCLIC_COLS)),
        ("target_enc", TargetEncoder(cols=CAT_COLS, smoothing=20)),
        ("model", RandomForestClassifier(
            n_estimators=300,
            max_depth=20,
            min_samples_leaf=50,
            class_weight={0: 1, 1: 3},
            n_jobs=-1,
            random_state=42
        ))
    ]
)

rf_pipeline.fit(X_train, y_train)

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('cyclic', ...), ('target_enc', ...), ...]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,cyclic_cols,"{'CRSDepTime': 1440, 'DayofWeek': 7, 'Month': 12}"

0,1,2
,cols,"['Marketing_Airline_Network', 'OriginCityName', ...]"
,smoothing,20

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",300
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",20
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",50
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [76]:
y_prob_rf = rf_pipeline.predict_proba(X_test)[:, 1]

threshold = 0.40
y_pred_rf = (y_prob_rf >= threshold).astype(int)

print(f"=== RandomForest (Pipeline) | threshold={threshold} ===")
print(classification_report(y_test, y_pred_rf))

=== RandomForest (Pipeline) | threshold=0.4 ===
              precision    recall  f1-score   support

           0       0.87      0.46      0.60     75521
           1       0.32      0.78      0.45     24479

    accuracy                           0.54    100000
   macro avg       0.59      0.62      0.53    100000
weighted avg       0.73      0.54      0.56    100000



In [77]:
import joblib

MODEL_PATH = "flight_delay_pipeline.joblib"

joblib.dump(rf_pipeline, MODEL_PATH)

['flight_delay_pipeline.joblib']

In [78]:
loaded_pipeline = joblib.load(MODEL_PATH)

y_prob_loaded = loaded_pipeline.predict_proba(X_test)[:, 1]
y_pred_loaded = (y_prob_loaded >= 0.4).astype(int)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_loaded))

              precision    recall  f1-score   support

           0       0.87      0.46      0.60     75521
           1       0.32      0.78      0.45     24479

    accuracy                           0.54    100000
   macro avg       0.59      0.62      0.53    100000
weighted avg       0.73      0.54      0.56    100000



In [79]:
import pandas as pd
import holidays

# Define país una sola vez (esto es configuración, no lógica)
HOLIDAYS_CALENDAR = holidays.US()


def extract_features(payload: dict) -> pd.DataFrame:
    """
    Convierte un JSON de entrada en un DataFrame
    compatible con el pipeline entrenado.
    """

    # --- Validaciones mínimas ---
    required_keys = {
        "aerolinea",
        "origen",
        "destino",
        "fecha_partida",
        "distancia_km",
    }

    missing = required_keys - payload.keys()
    if missing:
        raise ValueError(f"Faltan campos obligatorios: {missing}")

    # --- Parseo de fecha ---
    dt = pd.to_datetime(payload["fecha_partida"], errors="raise")

    features = {
        "Marketing_Airline_Network": payload["aerolinea"],
        "OriginCityName": payload["origen"],
        "DestCityName": payload["destino"],
        "Distance": float(payload["distancia_km"]),
        "Month": dt.month,
        "DayofWeek": dt.dayofweek,          # 0 = lunes
        "CRSDepTime": dt.hour * 60 + dt.minute,  # minutos desde medianoche
        "Holidays": dt.date() in HOLIDAYS_CALENDAR,
    }

    return pd.DataFrame([features])

In [80]:
payload_test = {
    "aerolinea": "AA",
    "origen": "New York, NY",
    "destino": "Los Angeles, CA",
    "fecha_partida": "2025-11-10T14:30:00",
    "distancia_km": 3983
}

X_api = extract_features(payload_test)
print(X_api)
print(X_api.dtypes)

  Marketing_Airline_Network OriginCityName     DestCityName  Distance  Month  \
0                        AA   New York, NY  Los Angeles, CA    3983.0     11   

   DayofWeek  CRSDepTime  Holidays  
0          0         870     False  
Marketing_Airline_Network     object
OriginCityName                object
DestCityName                  object
Distance                     float64
Month                          int64
DayofWeek                      int64
CRSDepTime                     int64
Holidays                        bool
dtype: object


In [81]:
X_api

Unnamed: 0,Marketing_Airline_Network,OriginCityName,DestCityName,Distance,Month,DayofWeek,CRSDepTime,Holidays
0,AA,"New York, NY","Los Angeles, CA",3983.0,11,0,870,False


In [82]:
pipeline = joblib.load(MODEL_PATH)

In [83]:
THRESHOLD = 0.40  # decisión de negocio


def predict_delay(payload: dict) -> dict:
    """
    Recibe un JSON de vuelo y devuelve
    la predicción de retraso y su probabilidad.
    """

    # 1️⃣ JSON → DataFrame
    X = extract_features(payload)

    # 2️⃣ Probabilidad de retraso (clase 1)
    prob_delay = pipeline.predict_proba(X)[0, 1]

    # 3️⃣ Decisión final
    prediction = int(prob_delay >= THRESHOLD)

    # 4️⃣ Respuesta amigable
    return {
        "prevision": "Retrasado" if prediction == 1 else "Puntual",
        "probabilidad": round(float(prob_delay), 2),
    }

In [84]:
payload_test = {
    "aerolinea": "AA",
    "origen": "New York, NY",
    "destino": "Los Angeles, CA",
    "fecha_partida": "2025-11-10T14:30:00",
    "distancia_km": 3983
}

result = predict_delay(payload_test)
print(result)

{'prevision': 'Retrasado', 'probabilidad': 0.49}
