In [1]:
import sweetviz as sv
import pandas as pd

# Cargo los datos
df = pd.read_csv('FlightDelays_Data_3.0.csv')

report = sv.analyze(df.dropna(subset=['Canceled']),target_feat='Canceled')
report.show_html('my_report.html')

                                             |                                             | [  0%]   00:00 ->…

Report my_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [4]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Cargo los datos
df = pd.read_csv('FlightDelays_Data_3.0.csv')

# Manejo de variables categoricas y valores faltantes
categorical_cols = ["UniqueCarrier"]
df = pd.get_dummies(df, columns=categorical_cols)
df.dropna(inplace=True)

# Separo target y features
X = df.drop("Canceled", axis=1)
y = df["Canceled"]

# Defino los modelos y los hiperparámetros a buscar
models = {
    'BaggingClassifier': (BaggingClassifier(), {'n_estimators': [10, 50, 100]}),
    'DecisionTreeClassifier': (DecisionTreeClassifier(), {'max_depth': [3, 5, 7]}),
    'HistGradientBoostingClassifier': (HistGradientBoostingClassifier(), {'learning_rate': [0.001, 0.01, 0.1], 'max_depth': [3, 5, 7]}),
    'RandomForestClassifier': (RandomForestClassifier(), {'n_estimators': [10, 50, 100]}),
    'StackingClassifier': (StackingClassifier(estimators=[('rf', RandomForestClassifier()), ('svc', SVC())]), {}),
    'VotingClassifier': (VotingClassifier(estimators=[('xgb', XGBClassifier()), ('lgbm', LGBMClassifier())]), {}),
    'LGBMClassifier': (LGBMClassifier(), {'num_leaves': [5, 10, 15], 'learning_rate': [0.001, 0.01, 0.1]}),
    'XGBClassifier': (XGBClassifier(), {'learning_rate': [0.001, 0.01, 0.1], 'max_depth': [3, 5, 7]})
}

# Itero sobre los modelos
for name, (model, param_grid) in models.items():
    # Aplico GridSearchCV
    grid_search = GridSearchCV(model, param_grid, cv=5)
    grid_search.fit(X, y)
    
    # Obtengo el mejor modelo
    best_model = grid_search.best_estimator_
    
    # Imprimo los hiperparámetros del mejor modelo
    print(f"Mejores hiperparámetros para {name}: {grid_search.best_params_}")
    
    # Imprimo el score del mejor modelo
    print(f"Mejor score para {name}: {grid_search.best_score_}")


Mejores hiperparámetros para BaggingClassifier: {'n_estimators': 10}
Mejor score para BaggingClassifier: 0.9958298582151792
Mejores hiperparámetros para DecisionTreeClassifier: {'max_depth': 5}
Mejor score para DecisionTreeClassifier: 0.9956630525437864
Mejores hiperparámetros para HistGradientBoostingClassifier: {'learning_rate': 0.01, 'max_depth': 3}
Mejor score para HistGradientBoostingClassifier: 0.9963302752293577
Mejores hiperparámetros para RandomForestClassifier: {'n_estimators': 50}
Mejor score para RandomForestClassifier: 0.9963302752293577
Mejores hiperparámetros para StackingClassifier: {}
Mejor score para StackingClassifier: 0.9961634695579649
[LightGBM] [Info] Number of positive: 799, number of negative: 3997
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000319 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1085
[LightGBM] [Info] Number of data points in the train set: 4796, number 

[LightGBM] [Info] Number of positive: 800, number of negative: 3996
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000187 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1074
[LightGBM] [Info] Number of data points in the train set: 4796, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166806 -> initscore=-1.608437
[LightGBM] [Info] Start training from score -1.608437
[LightGBM] [Info] Number of positive: 799, number of negative: 3997
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1085
[LightGBM] [Info] Number of data points in the train set: 4796, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166597 -> initscore=-1.609938
[LightGBM] [Info] Start training from score -1.609938
[LightGBM] [Info] Numb

[LightGBM] [Info] Number of positive: 799, number of negative: 3997
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1085
[LightGBM] [Info] Number of data points in the train set: 4796, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166597 -> initscore=-1.609938
[LightGBM] [Info] Start training from score -1.609938
[LightGBM] [Info] Number of positive: 799, number of negative: 3997
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000218 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1085
[LightGBM] [Info] Number of data points in the train set: 4796, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166597 -> initscore=-1.609938
[LightGBM] [Info] Start training from score -1.609938
[LightGBM] [Info] Numb

In [15]:
# Obtener la importancia de las características
feature_importance = best_model.feature_importances_

# Crear un DataFrame para mostrar las características y su importancia
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})

# Ordenar las características por importancia
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Imprimir la tabla de importancia de características
print(feature_importance_df)

            Feature  Importance
4          DepDelay     0.56691
3          ArrDelay     0.43309
0             Month     0.00000
1     DepartureTime     0.00000
2  SchedElapsedTime     0.00000
5          Distance     0.00000
6  UniqueCarrier_AA     0.00000
7  UniqueCarrier_DL     0.00000
8  UniqueCarrier_UA     0.00000


In [39]:
# Crear un DataFrame con los datos de entrada
input_data = pd.DataFrame([[12, 814, 134, 0, 0, 679, 0, 0, 1]], columns=X.columns)

# Realizar la predicción
prediction = best_model.predict(input_data)

# Imprimir la predicción
print(prediction)

[1]
