In [1]:
# 1. Imports básicos
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans

# Silence every warning for the rest of the session. Note that this is a blanket
# filter: deprecation and dtype warnings from the libraries are hidden as well.
warnings.filterwarnings("ignore")

# Plot styling: reset matplotlib to its default style first, then apply the
# seaborn theme on top of it (the later call wins where both set the same option).
plt.style.use("default")
sns.set_theme()


ModuleNotFoundError: No module named 'numpy'

In [None]:
# 2. Load the input files
path_flights = "flights.csv"      # adjust the path if needed
path_airlines = "airlines.csv"
path_airports = "airports.csv"

# The airport-code columns are used further down as merge keys against
# airports["IATA_CODE"] and as one-hot encoded categories, so they need a single
# uniform dtype. Reading them explicitly as text prevents an object column that
# mixes int and str values.
# NOTE(review): that mix appears when some rows carry purely numeric codes and
# pandas infers the dtype chunk by chunk - confirm against this file.
flights = pd.read_csv(
    path_flights,
    dtype={"ORIGIN_AIRPORT": str, "DESTINATION_AIRPORT": str},
    low_memory=False,  # infer the remaining dtypes from the whole file, not per chunk
)
airlines = pd.read_csv(path_airlines)
airports = pd.read_csv(path_airports)

flights.shape, airlines.shape, airports.shape


In [None]:
# Peek at the first rows of the flights table
flights.head()


In [None]:
# General overview: column dtypes and non-null counts
flights.info()


In [None]:
# Descriptive statistics, transposed so that each column becomes one row
flights.describe().T


In [None]:
# Fraction of missing values per column (0-1), highest first
null_pct = flights.isna().mean().sort_values(ascending=False)
null_pct


In [None]:
# Distribution of the arrival delay (rows without a recorded delay are left out)
arrival_delay = flights["ARRIVAL_DELAY"].dropna()

fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(arrival_delay, bins=80, kde=True, ax=ax)
ax.set_title("Distribuição do atraso na chegada (ARRIVAL_DELAY)")
ax.set_xlabel("Minutos de atraso (+) ou adiantado (-)")
ax.set_ylabel("Frequência")
plt.show()


In [None]:
# Mean arrival delay per airline, labelled with a readable carrier name.
#
# Both frames carry an "AIRLINE" column (the carrier code in `flights`, the
# carrier name in `airlines`). Merging them as-is makes pandas rename the two
# columns to AIRLINE_x / AIRLINE_y, after which no plain "AIRLINE" column
# exists any more. The name column is therefore renamed before the merge so the
# code column keeps its original name.
airline_names = airlines[["IATA_CODE", "AIRLINE"]].rename(
    columns={"AIRLINE": "AIRLINE_NAME"}
)

delay_by_airline = (
    flights.groupby("AIRLINE")["ARRIVAL_DELAY"]
    .mean()
    .reset_index()
    .merge(airline_names, left_on="AIRLINE", right_on="IATA_CODE", how="left")
    .sort_values("ARRIVAL_DELAY", ascending=False)
)

# "<code> - <name>" label used on the y axis; building it as a column keeps each
# label tied to its own row instead of relying on tick positions.
delay_by_airline["AIRLINE_LABEL"] = (
    delay_by_airline["AIRLINE"].astype(str)
    + " - "
    + delay_by_airline["AIRLINE_NAME"].astype(str)
)

plt.figure(figsize=(10,6))
sns.barplot(
    data=delay_by_airline,
    y="AIRLINE_LABEL",
    x="ARRIVAL_DELAY"
)
plt.title("Atraso médio na chegada por companhia aérea")
plt.xlabel("Atraso médio de chegada (min)")
plt.ylabel("Companhia aérea")
plt.tight_layout()
plt.show()


In [None]:
# Mean arrival delay per origin airport, restricted to the 20 busiest origins
top_origens = flights["ORIGIN_AIRPORT"].value_counts().head(20).index

# Step 1: average delay for flights leaving one of those airports
from_top_origin = flights["ORIGIN_AIRPORT"].isin(top_origens)
mean_delay_top_origins = (
    flights[from_top_origin]
    .groupby("ORIGIN_AIRPORT")["ARRIVAL_DELAY"]
    .mean()
    .reset_index()
)

# Step 2: attach airport metadata and order from worst to best
airport_info = airports[["IATA_CODE", "AIRPORT", "CITY", "STATE"]]
delay_by_origin = mean_delay_top_origins.merge(
    airport_info,
    left_on="ORIGIN_AIRPORT",
    right_on="IATA_CODE",
    how="left",
).sort_values("ARRIVAL_DELAY", ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=delay_by_origin, y="ORIGIN_AIRPORT", x="ARRIVAL_DELAY", ax=ax)
ax.set_title("Atraso médio na chegada por aeroporto de origem (Top 20)")
ax.set_xlabel("Atraso médio de chegada (min)")
ax.set_ylabel("Aeroporto de origem")
fig.tight_layout()
plt.show()


In [None]:
# Keep only the flights whose arrival delay is known (working on a copy)
has_arrival_delay = flights["ARRIVAL_DELAY"].notna()
flights_model = flights[has_arrival_delay].copy()

# Binary target: 1 when the flight arrived more than 15 minutes late, else 0
flights_model["IS_DELAYED"] = flights_model["ARRIVAL_DELAY"].gt(15).astype(int)

# Class balance as proportions
flights_model["IS_DELAYED"].value_counts(normalize=True)


In [None]:
# Calendar date assembled from the YEAR / MONTH / DAY columns
date_parts = flights_model[["YEAR", "MONTH", "DAY"]]
flights_model["FLIGHT_DATE"] = pd.to_datetime(date_parts)

# Scheduled departure hour: SCHEDULED_DEPARTURE is encoded as HHMM, so integer
# division by 100 keeps the hour. Missing values are treated as 0 beforehand.
sched_dep_hhmm = flights_model["SCHEDULED_DEPARTURE"].fillna(0).astype(int)
flights_model["SCHED_DEP_HOUR"] = sched_dep_hhmm // 100

# Período do dia
def period_of_day(hour):
    """Map an hour of the day to a coarse period label.

    Returns "dawn" for hours below 6, "morning" below 12, "afternoon" below 18
    and "night" for everything else (including values that compare false
    against every bound, such as NaN).
    """
    upper_bounds = ((6, "dawn"), (12, "morning"), (18, "afternoon"))
    for bound, label in upper_bounds:
        if hour < bound:
            return label
    return "night"

# Label every flight with the period of day of its scheduled departure hour
flights_model["DEP_PERIOD"] = flights_model["SCHED_DEP_HOUR"].map(period_of_day)


In [None]:
target = "IS_DELAYED"

# Columns handled as numeric inputs
numeric_features = ["SCHED_DEP_HOUR", "DISTANCE", "DAY_OF_WEEK", "MONTH"]

# Columns handled as categorical inputs
categorical_features = ["AIRLINE", "ORIGIN_AIRPORT", "DESTINATION_AIRPORT", "DEP_PERIOD"]

# Optional down-sampling for when the full dataset is too heavy to work with;
# uncomment the next line to enable it.
# flights_model = flights_model.sample(300_000, random_state=42)

# Feature matrix (numeric columns first, then categorical) and target vector
feature_columns = [*numeric_features, *categorical_features]
X = flights_model[feature_columns]
y = flights_model[target]


In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y so both
# parts keep the same class proportions, and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train.shape, X_test.shape


In [None]:
# Numeric columns: standardisation
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical columns: one-hot encoding; categories not seen during fit are ignored
categorical_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

# Route each group of columns to its own transformer
column_steps = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)


In [None]:
def evaluate_classifier(name, model, X_train, X_test, y_train, y_test):
    """Print and plot evaluation results of an already fitted classifier.

    Parameters
    ----------
    name : str
        Label used in the printed header and in the plot titles.
    model : estimator
        Fitted object exposing ``predict``; if it also exposes
        ``predict_proba`` the ROC-AUC and ROC curve are reported too.
    X_train, y_train :
        Accepted for call-site symmetry; not used by this function.
    X_test, y_test :
        Evaluation features and true labels.

    Returns
    -------
    None
        Everything is reported through stdout and matplotlib figures.
    """
    print(f"\n========== {name} ==========")
    predictions = model.predict(X_test)

    print("\nClassification report:")
    print(classification_report(y_test, predictions, digits=4))

    # Confusion matrix drawn as an annotated heatmap
    _, cm_ax = plt.subplots(figsize=(4, 4))
    sns.heatmap(
        confusion_matrix(y_test, predictions),
        annot=True,
        fmt="d",
        cmap="Blues",
        ax=cm_ax,
    )
    cm_ax.set_title(f"Matriz de confusão - {name}")
    cm_ax.set_xlabel("Predito")
    cm_ax.set_ylabel("Real")
    plt.show()

    # The ROC analysis needs probabilities; stop here if the model has none
    if not hasattr(model, "predict_proba"):
        return

    # Column 1 of predict_proba is used as the positive-class score
    positive_scores = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, positive_scores)
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    print(f"ROC-AUC: {auc:.4f}")

    _, roc_ax = plt.subplots(figsize=(5, 4))
    roc_ax.plot(fpr, tpr, label=f"{name} (AUC = {auc:.3f})")
    roc_ax.plot([0, 1], [0, 1], "--", color="gray")  # chance-level diagonal
    roc_ax.set_xlabel("Falso positivo")
    roc_ax.set_ylabel("Verdadeiro positivo")
    roc_ax.set_title(f"Curva ROC - {name}")
    roc_ax.legend()
    plt.show()


In [None]:
# Logistic regression: preprocessing followed by the linear classifier
log_reg_steps = [
    ("preprocess", preprocessor),
    ("clf", LogisticRegression(max_iter=1000, n_jobs=-1)),
]
log_reg_clf = Pipeline(steps=log_reg_steps)

log_reg_clf.fit(X_train, y_train)
evaluate_classifier(
    "Logistic Regression", log_reg_clf, X_train, X_test, y_train, y_test
)


In [None]:
# Random forest: preprocessing followed by the ensemble classifier.
#
# Pipeline.fit fits its steps in place, so passing the very same `preprocessor`
# object that another pipeline already holds would make both pipelines share one
# fitted transformer: refitting either one silently changes the other. clone()
# gives this pipeline its own unfitted copy with identical parameters.
rf_clf = Pipeline(steps=[
    ("preprocess", clone(preprocessor)),
    ("clf", RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        n_jobs=-1,
        random_state=42,
        class_weight="balanced_subsample"
    ))
])

rf_clf.fit(X_train, y_train)
evaluate_classifier("Random Forest", rf_clf, X_train, X_test, y_train, y_test)


In [None]:
# Per-origin-airport aggregates: mean arrival delay, mean departure delay and
# number of flights with a recorded arrival delay
airport_stats = flights_model.groupby("ORIGIN_AIRPORT", as_index=False).agg(
    mean_arr_delay=("ARRIVAL_DELAY", "mean"),
    mean_dep_delay=("DEPARTURE_DELAY", "mean"),
    total_flights=("ARRIVAL_DELAY", "count"),
)

airport_stats.head()


In [None]:
# Attach airport metadata (name, city, state, coordinates) to the per-airport stats.
airport_info_cols = ["IATA_CODE", "AIRPORT", "CITY", "STATE", "LATITUDE", "LONGITUDE"]

# The result is written back to `airport_stats`, so running this cell a second
# time would merge onto an already-merged frame and produce AIRPORT_x / AIRPORT_y
# style duplicates instead of a plain "AIRPORT" column. Dropping any metadata
# columns left from a previous run first makes the cell safe to re-run.
airport_stats = (
    airport_stats
    .drop(columns=airport_info_cols, errors="ignore")
    .merge(
        airports[airport_info_cols],
        left_on="ORIGIN_AIRPORT",
        right_on="IATA_CODE",
        how="left"
    )
)

airport_stats.head()


In [None]:
# Numeric features used for clustering
cluster_features = ["mean_arr_delay", "mean_dep_delay", "total_flights"]

# Missing aggregates are replaced by 0 before scaling
X_cluster = airport_stats[cluster_features].fillna(0).copy()

# Standardise the features before clustering
scaler_cluster = StandardScaler()
X_cluster_scaled = scaler_cluster.fit_transform(X_cluster)


In [None]:
# KMeans with 4 clusters (other values of k can be tried).
# n_init is set explicitly: its default differs between scikit-learn releases,
# and because warnings are silenced in this notebook the change would go
# unnoticed, giving different clusters depending on the installed version.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
airport_stats["CLUSTER"] = kmeans.fit_predict(X_cluster_scaled)

airport_stats[["ORIGIN_AIRPORT", "AIRPORT", "STATE", "mean_arr_delay", "mean_dep_delay", "total_flights", "CLUSTER"]].head(20)


In [None]:
# Airports in the (departure delay, arrival delay) plane: colour encodes the
# cluster, marker size the number of flights
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=airport_stats,
    x="mean_dep_delay",
    y="mean_arr_delay",
    hue="CLUSTER",
    size="total_flights",
    sizes=(20, 200),
    alpha=0.8,
    ax=ax,
)
ax.set_title("Clusters de aeroportos por perfil de atraso")
ax.set_xlabel("Atraso médio na partida (min)")
ax.set_ylabel("Atraso médio na chegada (min)")
# Legend placed outside the axes, to the right of the plot
ax.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc="upper left")
fig.tight_layout()
plt.show()


In [None]:
# Mean arrival delay per period of the day
delay_by_period = (
    flights_model
    .groupby("DEP_PERIOD")["ARRIVAL_DELAY"]
    .mean()
    .reset_index()
)

# Bars follow the chronological order of the periods
period_order = ["dawn", "morning", "afternoon", "night"]

fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(
    data=delay_by_period,
    x="DEP_PERIOD",
    y="ARRIVAL_DELAY",
    order=period_order,
    ax=ax,
)
ax.set_title("Atraso médio por período do dia")
ax.set_xlabel("Período do dia")
ax.set_ylabel("Atraso médio de chegada (min)")
plt.show()


In [None]:
# Mean arrival delay per month (seasonality)
delay_by_month = (
    flights_model
    .groupby("MONTH")["ARRIVAL_DELAY"]
    .mean()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(8, 4))
sns.lineplot(data=delay_by_month, x="MONTH", y="ARRIVAL_DELAY", marker="o", ax=ax)
ax.set_title("Atraso médio por mês")
ax.set_xlabel("Mês")
ax.set_ylabel("Atraso médio de chegada (min)")
ax.set_xticks(range(1, 13))  # one tick per calendar month
plt.show()
