In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

# pd.set_option('display.max_columns', 500)

# Load the data
flights_with_products = pd.read_parquet("../data/flights_with_products.parquet")
# flights_with_products.head()

In [2]:
flights_data = flights_with_products.dropna()
flights_data = flights_data.reset_index(drop=True)


In [3]:
flights_data

Unnamed: 0,Flight_ID,Aeronave,DepartureStation,ArrivalStation,Destination_Type,Origin_Type,STD,STA,Capacity,Passengers,...,Zona-2 San Nicolas 10pax,Zona-2 San Nicolas 4pax,Zona-3 Centro Mty 10pax,Zona-3 Centro Mty 4pax,Zona-4 Tec-Mty 10pax,Zona-4 Tec-Mty 4pax,Zona-5 Sanpedro 10pax,Zona-5 Sanpedro 4pax,Zona-6 Sancatarina 10pax,Zona-6 Sancatarina 4pax
0,ab954014077430bd842cfa305a55c0f8,XA-VBY,AT,AZ,Ciudad Fronteriza,Ciudad Principal,2023-10-19 11:40:00,2023-10-19 14:25:00,240,229.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,efd86c996035dacdca7a0ccb2560dda1,XA-VIX,BM,AV,MX Amigos y Familia,Ciudad Fronteriza,2023-07-03 00:55:00,2023-07-03 04:55:00,186,197.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,d0987ee648eea254063bfe2b39571b67,XA-VAP,BA,AB,Playa,Ciudad Principal,2023-02-10 08:40:00,2023-02-10 09:50:00,186,162.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,d0987ee648eea254063bfe2b39571b67,XA-VAP,AW,BA,Ciudad Principal,Ciudad Principal,2023-02-10 06:30:00,2023-02-10 08:00:00,186,157.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3b5df8805161ea827d2f2e4298c38e06,XA-VBY,AJ,AR,Playa,MX Amigos y Familia,2023-09-07 17:10:00,2023-09-07 18:05:00,240,183.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106086,9bbd9f3a2bbcaeae864d17e3ee544d2c,XA-VAE,AT,BT,Playa,Ciudad Principal,2023-12-06 12:50:00,2023-12-06 16:10:00,186,191.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
106087,d9ef769565064b1ff1ad90d6d18f69cb,XA-VYF,AO,BT,Playa,Ciudad Principal,2023-12-26 06:20:00,2023-12-26 10:10:00,180,166.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
106088,598035669a4f0fa2ce1207977932df9e,XA-VAI,AW,BT,Playa,Ciudad Principal,2023-12-28 10:25:00,2023-12-28 13:50:00,180,142.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
106089,cc7c1c5e6fd132fd0bdab3a35aac33c0,XA-VBK,BM,BT,Playa,Ciudad Fronteriza,2023-12-29 07:30:00,2023-12-29 14:50:00,240,189.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Convertir las columnas de fecha y hora
flights_data['STD'] = pd.to_datetime(flights_data['STD'])
flights_data['STA'] = pd.to_datetime(flights_data['STA'])

# Extraer características de fecha y hora
flights_data['STD_hour'] = flights_data['STD'].dt.hour
flights_data['STA_hour'] = flights_data['STA'].dt.hour
flights_data['STD_day_of_week'] = flights_data['STD'].dt.dayofweek
flights_data['STA_day_of_week'] = flights_data['STA'].dt.dayofweek

In [5]:
x_columns = ['Aeronave', 'DepartureStation', 'ArrivalStation', 'Destination_Type', 'Origin_Type', 'Capacity', 'Passengers', 'Bookings', 'STD_hour', 'STA_hour', 'STD_day_of_week', 'STA_day_of_week']
y_columns = [col for col in flights_data.columns if col not in x_columns + ['Flight_ID'] + ['STD'] + ['STA']]


In [9]:
X = flights_data[x_columns]
y = flights_data[y_columns]

In [10]:
y = y[['Agua Natural 600 Ml',
 'Amstel Ultra',
 'Arandano',
 'Arandano Mango Mix',
 'Arcoiris',
 'Baileys',
 'Baileys ',
 'Botana Sabritas Con Dip De Queso',
 'Cafe 19 Cafe Clasico',
 'Cafe 19 Capuchino',
 'Cafe 19 Chiapas',
 'Cafe Costa',
 'Cafe De Olla',
 'Capitan Morning',
 'Capitan Morning Con Pan Dulce',
 'Carne Seca Habanero',
 'Carne Seca Original',
 'Cerveza Charter',
 'Charter Cheve Doble',
 'Charter Licor Doble',
 'Cheetos',
 'Cheetos Flamin Hot',
 'Chokis',
 'Ciel Mineralizada',
 'Club Sandwich',
 'Coca Cola Dieta',
 'Coca Cola Regular',
 'Coca Sin Azucar',
 'Corajillo',
 'Corajillo Baileys ',
 'Cuerno Clasico De Pavo',
 'Cuerno Individual Charter',
 'Dip De Queso',
 'Doritos Nacho',
 'Emperador Chocolate',
 'Emperador Vainilla',
 'Fanta De Naranja',
 'Fritos Limon Y Sal',
 'Frutos Secos Enchilados',
 'Galleta De Arandano Relleno De Q/Crema',
 'Galleta De Chispas De Chocolate',
 'Galleta De Chocolate',
 'Go Nuts',
 'Gomita Enchilada La Cueva',
 'Heineken 0',
 'Heineken Original',
 'Heineken Silver',
 'Jack And Coke',
 'Jugo De Mango',
 'Jugo De Manzana',
 'Jw Red Label',
 'Jw Red Label ',
 'Kacang Flaming Hot',
 'Leche De Chocolate Sc',
 'Leche De Fresa Sc',
 'Licor + Refresco',
 'Licor Charter',
 'Luxury Nut Mix',
 'Mafer Sin Sal',
 'Mega Cuerno Clasico',
 'Mega Cuerno Tripulacion',
 'Muffin Integral',
 'Nishikawa Japones',
 'Nishikawa Salado',
 'Nissin Dark Dragon',
 'Nissin Fuego',
 'Nissin Limon Y Habanero',
 'Nissin Picante',
 'Nissin Res',
 'Nueces De Arbol Mix',
 'Nutty Berry Mix',
 'Panini Clasico',
 'Panini Integral',
 'Protein Adventure',
 'Quaker Avena Frutos Rojos',
 'Quaker Avena Moras',
 'Quaker Granola',
 'Quaker Natural Balance',
 'Rancheritos',
 'Ron Bacardi',
 'Ruffles Queso',
 'Sabritas Flamin Hot',
 'Sabritas Originales',
 'Salsa Botanera',
 'Sidral Mundet',
 'Sol Clamato',
 'Sprite',
 'Te Frutos Rojos',
 'Te Manzanilla Jengibre',
 'Te Relax',
 'Te Vainilla',
 'Tecate Light',
 'Tequila + Mezclador',
 'Tequila 7 Leguas Blanco',
 'Tequila 7 Leguas Reposado',
 'Tinto',
 'Topochico Seltzer Fresa-Guayaba',
 'Topochico Seltzer Mango',
 'Tostitos',
 'Tostitos Nachos Con Dip',
 'Ultra Seltzer Frambuesa',
 'Vino Blanco Cria Cuervos ',
 'Vino Tinto Cria Cuervos',
 'Vino Tinto Sangre De Toro',
 'Xx Lager',
 'Xx Ultra']]

In [8]:
X = X.to_numpy()
y = y.to_numpy()

In [14]:
X

Unnamed: 0,Aeronave,DepartureStation,ArrivalStation,Destination_Type,Origin_Type,Capacity,Passengers,Bookings,STD_hour,STA_hour,STD_day_of_week,STA_day_of_week
0,XA-VBY,AT,AZ,Ciudad Fronteriza,Ciudad Principal,240,229.0,157.0,11,14,3,3
1,XA-VIX,BM,AV,MX Amigos y Familia,Ciudad Fronteriza,186,197.0,109.0,0,4,0,0
2,XA-VAP,BA,AB,Playa,Ciudad Principal,186,162.0,90.0,8,9,4,4
3,XA-VAP,AW,BA,Ciudad Principal,Ciudad Principal,186,157.0,110.0,6,8,4,4
4,XA-VBY,AJ,AR,Playa,MX Amigos y Familia,240,183.0,125.0,17,18,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...
106086,XA-VAE,AT,BT,Playa,Ciudad Principal,186,191.0,119.0,12,16,2,2
106087,XA-VYF,AO,BT,Playa,Ciudad Principal,180,166.0,89.0,6,10,1,1
106088,XA-VAI,AW,BT,Playa,Ciudad Principal,180,142.0,54.0,10,13,3,3
106089,XA-VBK,BM,BT,Playa,Ciudad Fronteriza,240,189.0,86.0,7,14,4,4


In [11]:
categorical_features = ['Aeronave', 'DepartureStation', 'ArrivalStation', 'Destination_Type', 'Origin_Type']
numeric_features = ['Capacity', 'Passengers', 'Bookings', 'STD_hour', 'STA_hour', 'STD_day_of_week', 'STA_day_of_week']


In [10]:
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

In [11]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [12]:
# Modelo de regresión para múltiples salidas
# model = MultiOutputRegressor(RandomForestRegressor(n_estimators=3, verbose=2, n_jobs=-1))

from sklearn.experimental import enable_hist_gradient_boosting  # Requerido para HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor

model = MultiOutputRegressor(HistGradientBoostingRegressor(verbose=1))




In [13]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

In [14]:
# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [18]:
# Then you can fit the model
pipeline.fit(pd.DataFrame(X_train), y_train)


ValueError: A given column is not a column of the dataframe