In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
import xgboost as xg
from xgboost import XGBRegressor


def one_hot_encode(
    df: pd.DataFrame,
    column: str,
    prefix: str,
) -> pd.DataFrame:

    df_ = df.copy()
    df_encoded = pd.get_dummies(df_[column], dtype="int")
    for col in df_encoded:
        df_[f"{prefix}_{col}"] = df_encoded[col]

    return df_.drop(columns=[column])

In [41]:
vuelos = pd.read_parquet("../resources/flights_clean.parquet")
ventas = pd.read_parquet("../resources/sales_clean.parquet")

In [42]:
# Merge a ambos dataframes
vuelos_ventas = pd.merge(ventas, vuelos, on="Flight_ID")
vuelos_ventas.drop(columns=["index", "Aeronave"], inplace=True)
vuelos_ventas

Unnamed: 0,Flight_ID,ProductType,ProductName,Quantity,TotalSales,DepartureStation,ArrivalStation,Destination_Type,Origin_Type,STD,STA,Capacity,Passengers,Bookings,Semana,Mes
0,a05290288259526edd3601160b10e1de,Botanas,Carne Seca Habanero,1,55.0,AW,AO,Ciudad Principal,Ciudad Principal,2023-08-28 19:20:00,2023-08-28 20:45:00,186,131.0,100.0,35,8
1,a05290288259526edd3601160b10e1de,Botanas,Cheetos,2,98.0,AW,AO,Ciudad Principal,Ciudad Principal,2023-08-28 19:20:00,2023-08-28 20:45:00,186,131.0,100.0,35,8
2,a05290288259526edd3601160b10e1de,Botanas,Ruffles Queso,6,294.0,AW,AO,Ciudad Principal,Ciudad Principal,2023-08-28 19:20:00,2023-08-28 20:45:00,186,131.0,100.0,35,8
3,a05290288259526edd3601160b10e1de,Refrescos,Coca Sin Azucar,2,96.0,AW,AO,Ciudad Principal,Ciudad Principal,2023-08-28 19:20:00,2023-08-28 20:45:00,186,131.0,100.0,35,8
4,a05290288259526edd3601160b10e1de,Licores,Jack And Coke,1,72.0,AW,AO,Ciudad Principal,Ciudad Principal,2023-08-28 19:20:00,2023-08-28 20:45:00,186,131.0,100.0,35,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2095947,6113fc0448cd548cd7e7d3f76e7872c5,Perecederos,Cuerno Clasico De Pavo,5,575.0,AF,AW,Ciudad Principal,Ciudad Fronteriza,2023-11-02 23:10:00,2023-11-03 00:50:00,240,154.0,119.0,44,11
2095948,78b84bc241b924d6c89e171a34b0f60c,Perecederos,Cuerno Clasico De Pavo,1,115.0,AT,BP,MX Amigos y Familia,Ciudad Principal,2023-10-05 06:00:00,2023-10-05 07:05:00,240,188.0,150.0,40,10
2095949,2d7d30a16334ee2db84e5bc027a4a186,Licores,Vino Tinto Sangre De Toro,1,155.0,AO,BD,Playa,Ciudad Principal,2023-05-16 18:45:00,2023-05-16 19:35:00,180,149.0,112.0,20,5
2095950,7a56b895719a91ca6a98004cff956b7d,Licores,Vino Tinto Sangre De Toro,1,155.0,AL,AW,Ciudad Principal,MX Amigos y Familia,2023-02-15 07:00:00,2023-02-15 08:25:00,220,210.0,170.0,7,2


In [43]:
vuelos_ventas = vuelos_ventas.groupby("Flight_ID").agg(
    {
        "TotalSales": "sum",
        "Bookings": "mean",
        "Passengers": "mean",
        "Origin_Type": "first",
        "Destination_Type": "first",
        "STD": "first",
        "STA": "first",
        "DepartureStation": "first",
        "ArrivalStation": "first",
        "Semana": "first",
        "Mes": "first",
        "Capacity": "first",
    }
)
vuelos_ventas["STD"] = pd.to_datetime(vuelos_ventas["STD"])

vuelos_ventas["STA"] = pd.to_datetime(vuelos_ventas["STA"])

vuelos_ventas["Duracion"] = vuelos_ventas["STA"] - vuelos_ventas["STD"]

vuelos_ventas["Duracion"] = vuelos_ventas["Duracion"].dt.total_seconds() / 60

vuelos_ventas["Hora"] = vuelos_ventas["STD"].dt.hour

vuelos_ventas["Dia"] = vuelos_ventas["STD"].dt.day

# Cambiar tipos de destinos a one-hot encoding

destination_type = vuelos_ventas["Destination_Type"].map(

    {

        "Ciudad Principal": "destino_ciudadprincipal",

        "Playa": "destino_playa",

        "MX Amigos y Familia": "destino_amigosfamilia",

        "Ciudad Fronteriza": "destino_ciudadfronteriza",

        "Ecoturismo": "destino_ecoturismo",

    }

)

destination_type = pd.get_dummies(destination_type, dtype="int")

vuelos_ventas = (

    pd.concat([vuelos_ventas, destination_type], axis=1)

    if "Destination_Type" in vuelos_ventas.columns
    else vuelos_ventas

)

vuelos_ventas = (

    vuelos_ventas.drop(columns=["Destination_Type"])

    if "Destination_Type" in vuelos_ventas.columns
    else vuelos_ventas

)

origen_type = vuelos_ventas["Origin_Type"].map(

    {

        "Ciudad Principal": "origen_ciudadprincipal",

        "Playa": "origen_playa",

        "MX Amigos y Familia": "origen_amigosfamilia",

        "Ciudad Fronteriza": "origen_ciudadfronteriza",

        "Ecoturismo": "origen_ecoturismo",

    }

)

origen_type = pd.get_dummies(origen_type, dtype="int")

vuelos_ventas = (

    pd.concat([vuelos_ventas, origen_type], axis=1)

    if "Origin_Type" in vuelos_ventas.columns
    else vuelos_ventas

)

vuelos_ventas = (

    vuelos_ventas.drop(columns=["Origin_Type"])
    if "Origin_Type" in vuelos_ventas.columns
    else vuelos_ventas

)

vuelos_ventas["Book_Pass"] = vuelos_ventas["Bookings"] / vuelos_ventas["Passengers"]

vuelos_ventas.drop(columns=["STD", "STA"], inplace=True)

vuelos_ventas = one_hot_encode(vuelos_ventas, "DepartureStation", "departure")

vuelos_ventas = one_hot_encode(vuelos_ventas, "ArrivalStation", "arrival")

In [44]:
# Hacemos el modelo de regesion
X = vuelos_ventas.drop(columns=["TotalSales"])
y = vuelos_ventas["TotalSales"]

In [45]:
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

eval_set = [(x_test, y_test)]

train_dmatrix = xg.DMatrix(data=x_train, label=y_train)
test_dmatrix = xg.DMatrix(data=x_test, label=y_test)

# xgb_r = XGBRegressor(n_estimators=10, seed=123)
xgb_r = XGBRegressor(learning_rate=0.3, n_estimators=250)
xgb_r.fit(x_train, y_train, eval_set=eval_set, verbose=True)
y_pred = xgb_r.predict(x_test)

[0]	validation_0-rmse:2481.74894
[1]	validation_0-rmse:2198.97013
[2]	validation_0-rmse:2033.32798
[3]	validation_0-rmse:1907.75976
[4]	validation_0-rmse:1834.64862
[5]	validation_0-rmse:1773.85720
[6]	validation_0-rmse:1717.03975
[7]	validation_0-rmse:1690.80258
[8]	validation_0-rmse:1672.18555
[9]	validation_0-rmse:1654.92034
[10]	validation_0-rmse:1639.77394
[11]	validation_0-rmse:1630.80127
[12]	validation_0-rmse:1618.72050
[13]	validation_0-rmse:1610.92381
[14]	validation_0-rmse:1605.80254
[15]	validation_0-rmse:1602.88202
[16]	validation_0-rmse:1596.54765
[17]	validation_0-rmse:1590.12355
[18]	validation_0-rmse:1583.64605
[19]	validation_0-rmse:1581.56075
[20]	validation_0-rmse:1579.30407
[21]	validation_0-rmse:1577.54724
[22]	validation_0-rmse:1576.87235
[23]	validation_0-rmse:1571.83939
[24]	validation_0-rmse:1567.25605
[25]	validation_0-rmse:1565.76184
[26]	validation_0-rmse:1564.85253
[27]	validation_0-rmse:1562.18313
[28]	validation_0-rmse:1561.84044
[29]	validation_0-rmse:1

In [46]:
# Evaluamos el modelo
print("R2 Score:", metrics.r2_score(y_test, y_pred))
print("Mean Absolute Error:", metrics.mean_absolute_error(y_test, y_pred))
print("Mean Squared Error:", metrics.mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

R2 Score: 0.7157283740835436
Mean Absolute Error: 976.2706119747071
Mean Squared Error: 2366687.9791630553
Root Mean Squared Error: 1538.4043613962667


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


## **Modelo cantidad**

In [47]:
# Merge a ambos dataframes
vuelos_ventas = pd.merge(ventas, vuelos, on="Flight_ID")
vuelos_ventas.drop(columns=["index", "Aeronave"], inplace=True)

In [48]:
vuelos_ventas["STD"] = pd.to_datetime(vuelos_ventas["STD"])
vuelos_ventas["STA"] = pd.to_datetime(vuelos_ventas["STA"])
vuelos_ventas["Duracion"] = vuelos_ventas["STA"] - vuelos_ventas["STD"]
vuelos_ventas["Duracion"] = vuelos_ventas["Duracion"].dt.total_seconds() / 60
vuelos_ventas["Hora"] = vuelos_ventas["STD"].dt.hour
vuelos_ventas["Dia"] = vuelos_ventas["STD"].dt.day
# Cambiar tipos de destinos a one-hot encoding
destination_type = vuelos_ventas["Destination_Type"].map(
    {
        "Ciudad Principal": "destino_ciudadprincipal",
        "Playa": "destino_playa",
        "MX Amigos y Familia": "destino_amigosfamilia",
        "Ciudad Fronteriza": "destino_ciudadfronteriza",
        "Ecoturismo": "destino_ecoturismo",
    }
)
destination_type = pd.get_dummies(destination_type, dtype="int")
vuelos_ventas = (
    pd.concat([vuelos_ventas, destination_type], axis=1)
    if "Destination_Type" in vuelos_ventas.columns
    else vuelos_ventas
)
vuelos_ventas = (
    vuelos_ventas.drop(columns=["Destination_Type"])
    if "Destination_Type" in vuelos_ventas.columns
    else vuelos_ventas
)
origen_type = vuelos_ventas["Origin_Type"].map(
    {
        "Ciudad Principal": "origen_ciudadprincipal",
        "Playa": "origen_playa",
        "MX Amigos y Familia": "origen_amigosfamilia",
        "Ciudad Fronteriza": "origen_ciudadfronteriza",
        "Ecoturismo": "origen_ecoturismo",
    }
)
origen_type = pd.get_dummies(origen_type, dtype="int")
vuelos_ventas = (
    pd.concat([vuelos_ventas, origen_type], axis=1)
    if "Origin_Type" in vuelos_ventas.columns
    else vuelos_ventas
)
vuelos_ventas = (
    vuelos_ventas.drop(columns=["Origin_Type"])
    if "Origin_Type" in vuelos_ventas.columns
    else vuelos_ventas
)
vuelos_ventas["Book_Pass"] = vuelos_ventas["Bookings"] / vuelos_ventas["Passengers"]
vuelos_ventas.drop(columns=["STD", "STA"], inplace=True)
vuelos_ventas = one_hot_encode(vuelos_ventas, "DepartureStation", "departure")
vuelos_ventas = one_hot_encode(vuelos_ventas, "ArrivalStation", "arrival")

In [49]:
vuelos_ventas.drop(columns=["Flight_ID", "ProductType", "TotalSales"], inplace=True)

In [50]:
product_dict = {}
for i in vuelos_ventas["ProductName"].unique():
    product_dict[i] = len(product_dict)
vuelos_ventas["ProductName"] = vuelos_ventas["ProductName"].map(product_dict)

In [51]:
modelo2 = vuelos_ventas[vuelos_ventas["ProductName"] == 0]

In [52]:
X = modelo2.drop(columns=["Quantity"])
y = modelo2["Quantity"]

In [53]:
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

eval_set = [(x_test, y_test)]

train_dmatrix = xg.DMatrix(data=x_train, label=y_train)
test_dmatrix = xg.DMatrix(data=x_test, label=y_test)

# xgb_r = XGBRegressor(n_estimators=10, seed=123)
xgb_r = XGBRegressor(learning_rate=0.3, n_estimators=250)
xgb_r.fit(x_train, y_train, eval_set=eval_set, verbose=True)
y_pred = xgb_r.predict(x_test)

[0]	validation_0-rmse:0.77143
[1]	validation_0-rmse:0.77220
[2]	validation_0-rmse:0.77227
[3]	validation_0-rmse:0.77110
[4]	validation_0-rmse:0.77402
[5]	validation_0-rmse:0.77702
[6]	validation_0-rmse:0.77648
[7]	validation_0-rmse:0.78009
[8]	validation_0-rmse:0.78139
[9]	validation_0-rmse:0.78207
[10]	validation_0-rmse:0.78320
[11]	validation_0-rmse:0.78437
[12]	validation_0-rmse:0.78493
[13]	validation_0-rmse:0.78782
[14]	validation_0-rmse:0.78636
[15]	validation_0-rmse:0.78753
[16]	validation_0-rmse:0.78765
[17]	validation_0-rmse:0.78886
[18]	validation_0-rmse:0.79045
[19]	validation_0-rmse:0.79078
[20]	validation_0-rmse:0.78984
[21]	validation_0-rmse:0.79164
[22]	validation_0-rmse:0.79280
[23]	validation_0-rmse:0.79248
[24]	validation_0-rmse:0.79195
[25]	validation_0-rmse:0.79265
[26]	validation_0-rmse:0.79451
[27]	validation_0-rmse:0.79519
[28]	validation_0-rmse:0.79500
[29]	validation_0-rmse:0.79571
[30]	validation_0-rmse:0.79759
[31]	validation_0-rmse:0.79950
[32]	validation_0-

In [54]:
# Evaluamos el modelo
print("R2 Score:", metrics.r2_score(y_test, y_pred))
print("Mean Absolute Error:", metrics.mean_absolute_error(y_test, y_pred))
print("Mean Squared Error:", metrics.mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

R2 Score: -0.16427295425310406
Mean Absolute Error: 0.5320833884361313
Mean Squared Error: 0.6917398586424164
Root Mean Squared Error: 0.831708998774461


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
