## Proyecto: Predicción de atraso de vuelos ##

## Creación de variables y balanceo ##

In [1]:

import pandas as pd
import numpy as np
from sklearn.utils import resample

# 1. Load the cleaned dataset
df = pd.read_csv("flights_clean.csv")

# 2. Create the target variable (delayed)
# Rows without a recorded arr_delay have no known label. They are dropped first
# because `NaN > 15` evaluates to False, which would silently label them as
# on-time (0) and add label noise.
# A flight is labelled delayed (1) when arr_delay is strictly greater than 15
# (presumably minutes - confirm against the data dictionary).
# arr_delay is only used to build the label and is removed afterwards to avoid
# information leakage.
# NOTE(review): dep_delay, dep_time and air_time also look like values that are
# only known once the flight has departed/landed; whether keeping them leaks
# information depends on when predictions are meant to be made - confirm.
df = (
    df.dropna(subset=['arr_delay'])
      .assign(delayed=lambda frame: np.where(frame['arr_delay'] > 15, 1, 0))
      .drop(columns=['arr_delay'])
)

# 3. Feature engineering
# Assemble a calendar date from the year/month/day columns, then derive the
# weekday name from it (day_name() returns English names by default).
flight_date = pd.to_datetime(df[['year', 'month', 'day']])
df['date'] = flight_date
df['day_of_week'] = flight_date.dt.day_name()

# Bucket the `hour` column into four parts of the day.
# pd.cut uses right-closed intervals by default and include_lowest=True closes
# the first one on the left, so the buckets are [0, 6], (6, 12], (12, 18] and
# (18, 24].
# NOTE(review): with integer hours this places 6 in 'Madrugada', 12 in 'Mañana'
# and 18 in 'Tarde' - confirm this is the intended boundary convention.
hour_edges = [0, 6, 12, 18, 24]
part_labels = ['Madrugada', 'Mañana', 'Tarde', 'Noche']
df['part_of_day'] = pd.cut(
    df['hour'],
    bins=hour_edges,
    labels=part_labels,
    include_lowest=True,
)

# 4. Class balancing (on a reduced sample so experiments run faster)
SAMPLE_SIZE = 50000   # number of rows drawn for quick experiments
RANDOM_STATE = 42     # fixed seed so sampling and upsampling are reproducible

# Cap the request at the rows available: DataFrame.sample raises ValueError
# when asked for more rows than the frame holds (sampling without replacement).
df_sample = df.sample(min(SAMPLE_SIZE, len(df)), random_state=RANDOM_STATE)

df_on_time = df_sample[df_sample.delayed == 0]
df_late = df_sample[df_sample.delayed == 1]

# Choose majority/minority from the observed counts instead of assuming that
# class 0 is always the larger one; on a tie class 0 is treated as the majority.
if len(df_late) > len(df_on_time):
    df_majority, df_minority = df_late, df_on_time
else:
    df_majority, df_minority = df_on_time, df_late

# Upsampling needs at least one minority row to draw from; fail with a clear
# message instead of an obscure error raised inside resample().
if df_minority.empty:
    raise ValueError(
        "Cannot balance classes: the sample contains only one value of 'delayed'."
    )

# Draw minority rows with replacement until both classes have the same size.
# NOTE(review): the upsampled rows are exact duplicates. If the exported data
# is split into train/test afterwards, copies of the same flight can end up on
# both sides and inflate evaluation metrics - consider splitting first and
# upsampling only the training portion.
df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=RANDOM_STATE)

# Majority rows come first, followed by the upsampled minority rows (not shuffled).
df_features = pd.concat([df_majority, df_minority_upsampled])

# 5. Export the balanced dataset with the engineered features
# index=False keeps the DataFrame index out of the CSV file.
features_path = "flights_features.csv"
df_features.to_csv(features_path, index=False)

# Quick check of what was written: final dimensions and the first rows.
final_shape = df_features.shape
print("Dataset final con features:", final_shape)
print(df_features.head())

Dataset final con features: (76200, 18)
        year  month  day  dep_time  sched_dep_time  dep_delay UniqueCarrier  \
76295   2013     11   24    1025.0            1030       -5.0            AA   
114510  2013      2    7    1321.0            1329       -8.0            EV   
188885  2013      5    1    1854.0            1900       -6.0            UA   
264170  2013      7   24     758.0             805       -7.0            9E   
300403  2013      9    1     735.0             740       -5.0            WN   

        flight origin dest  air_time  distance  hour  minute  delayed  \
76295       19    JFK  LAX     331.0      2475    10      30        0   
114510    4280    EWR  BWI      39.0       169    13      29        0   
188885    1112    LGA  ORD     100.0       733    19       0        0   
264170    3611    JFK  PIT      64.0       340     8       5        0   
300403     744    EWR  MDW     106.0       711     7      40        0   

             date day_of_week part_of_day  
76