In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import holidays

In [148]:
# Load the raw 2023 sales records that the model is trained on
sales = pd.read_csv(filepath_or_buffer='data/sales_2023.csv')

In [106]:
# Holiday calendar for Mexico, provided by the `holidays` package
# (use a different country class to target another calendar)
mx_holidays = holidays.Mexico()


def is_holiday(date):
    """Return True when `date` is found in the `mx_holidays` calendar, else False."""
    found = date in mx_holidays
    return found


def data_preprocessing(data):
    """Build the feature matrix and target for the sales-quantity model.

    Parses ``STD`` as a datetime, derives calendar features from it
    (``holidays``, ``month``, ``day_of_week``, ``hour``, ``minute``) and
    separates the ``Quantity`` target from the predictors.

    The caller's frame is not modified: all work happens on a copy.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw sales records. Must contain ``STD``, ``Quantity`` and the other
        columns that are dropped below.

    Returns
    -------
    X : pandas.DataFrame
        Predictor columns, including the derived calendar features.
    y : pandas.Series
        The ``Quantity`` column.

    Raises
    ------
    KeyError
        If ``STD`` or any of the columns to drop is missing from ``data``.
    """
    # Work on a copy so the caller's DataFrame is not mutated as a side effect
    data = data.copy()

    # Make sure the STD column is in datetime format before using the .dt accessor
    std = pd.to_datetime(data['STD'])
    data['STD'] = std

    # 1 when the STD date is in the holiday calendar, 0 otherwise
    data['holidays'] = std.apply(is_holiday).astype(int)
    data['month'] = std.dt.month
    data['day_of_week'] = std.dt.dayofweek
    data['hour'] = std.dt.hour
    data['minute'] = std.dt.minute

    # Columns that are not predictors: the target itself plus fields excluded from the model
    non_feature_columns = [
        'Quantity', 'Perecedero', 'STA', 'STD',
        'Flight_ID', 'Bookings', 'TotalSales', 'Aeronave',
    ]
    X = data.drop(non_feature_columns, axis=1)
    y = data['Quantity']
    return X, y



In [107]:
# Columns handed to CatBoost as `cat_features`: the product, station and
# flight descriptors plus the calendar-derived columns
categorical_features = [
    'ProductType',
    'ProductName',
    'DepartureStation',
    'ArrivalStation',
    'Destination_Type',
    'Origin_Type',
    'tipo_vuelo',
    'month',
    'day_of_week',
    'hour',
    'minute',
    'holidays',
]


In [108]:
# Derive the model inputs (X) and the Quantity target (y) from the sales frame
X, y = data_preprocessing(data=sales)

In [111]:
# Lookup table ProductName -> ProductType built from the feature rows;
# when a name occurs more than once, the type from its last row is kept
product_type = {
    name: kind
    for name, kind in zip(X['ProductName'], X['ProductType'])
}

In [9]:
# Hold out 20% of the rows as the test set
X_train_valid, X_test, y_train_valid, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2023, shuffle=True
)

# Split the remaining 80% into train / validation at 75 / 25,
# which is 60% / 20% of the full data
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid, y_train_valid, test_size=0.25, random_state=2023, shuffle=True
)

# Re-wrap every split as a DataFrame: the feature splits keep X's column
# layout, the target splits become single-column 'Quantity' frames
X_train, X_valid, X_test = (
    pd.DataFrame(part, columns=X.columns) for part in (X_train, X_valid, X_test)
)
y_train, y_valid, y_test = (
    pd.DataFrame(part, columns=['Quantity']) for part in (y_train, y_valid, y_test)
)


In [113]:
# Baseline CatBoost regressor: 1000 boosting iterations, progress logged every 500
model = CatBoostRegressor(cat_features=categorical_features, verbose=500, iterations=1000)

# Fit on the training split; the validation split is supplied as eval_set
model.fit(X_train, y_train, eval_set=(X_valid, y_valid))

# Evaluate the model on the held-out test set
predictions = model.predict(X_test)
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`:
# scikit-learn deprecated that argument in 1.4 and removed it in 1.6,
# while this form works on every version
rmse = np.sqrt(mean_squared_error(y_test, predictions))
mae = mean_absolute_error(y_test, predictions)

print(f'RMSE: {rmse}')
print(f'MAE: {mae}')

Learning rate set to 0.155849
0:	learn: 2.4773177	test: 2.4700935	best: 2.4700935 (0)	total: 584ms	remaining: 9m 43s
500:	learn: 1.7375885	test: 1.7389409	best: 1.7389409 (500)	total: 3m	remaining: 2m 59s
999:	learn: 1.7180260	test: 1.7298595	best: 1.7298369 (997)	total: 6m 13s	remaining: 0us

bestTest = 1.729836901
bestIteration = 997

Shrink model to first 998 iterations.
RMSE: 1.7330563298283155
MAE: 1.0762447970257485


In [114]:
# Per-feature importance table of the baseline model (prettified=True returns a table)
feature_importances = model.get_feature_importance(prettified=True)
print("Feature Importances:", feature_importances, sep="\n")

Feature Importances:
          Feature Id  Importances
0        ProductName    44.110582
1     ArrivalStation    14.832251
2   DepartureStation    12.944794
3               hour     9.404913
4         Passengers     6.874343
5              month     3.147216
6         tipo_vuelo     2.597981
7   Destination_Type     1.529699
8        ProductType     1.520505
9        Origin_Type     1.070854
10       day_of_week     1.070159
11            minute     0.733484
12          Capacity     0.144626
13          holidays     0.018590


In [13]:
from catboost import CatBoostRegressor, Pool, cv

# Wrap the train / validation splits in CatBoost Pools with the categorical
# columns declared. Note: valid_pool is built here but is not passed to the
# grid search below.
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)
valid_pool = Pool(data=X_valid, label=y_valid, cat_features=categorical_features)

# Separate regressor instance used only for hyper-parameter tuning
tuning_model = CatBoostRegressor(loss_function='RMSE', verbose=500)

# Candidate values explored by the search (3 x 3 x 3 = 27 combinations)
param_grid = dict(
    depth=[6, 8, 10],
    learning_rate=[0.01, 0.05, 0.1],
    l2_leaf_reg=[1, 3, 5],
)

# Run the grid search on the training pool
results = tuning_model.grid_search(
    param_grid,
    train_pool,
    cv=3,
    partition_random_seed=42,
    stratified=False,
    verbose=True,
    plot=True,
)

# Report the winning combination
print("Best Parameters:", results['params'])

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 3.5709011	test: 3.5799370	best: 3.5799370 (0)	total: 437ms	remaining: 7m 16s
500:	learn: 1.8408385	test: 1.8321095	best: 1.8321095 (500)	total: 2m 17s	remaining: 2m 16s
999:	learn: 1.8029078	test: 1.7907249	best: 1.7907249 (999)	total: 5m 42s	remaining: 0us

bestTest = 1.790724892
bestIteration = 999

0:	loss: 1.7907249	best: 1.7907249 (0)	total: 5m 42s	remaining: 2h 28m 31s
0:	learn: 3.4818063	test: 3.4903317	best: 3.4903317 (0)	total: 980ms	remaining: 16m 19s
500:	learn: 1.7724311	test: 1.7621077	best: 1.7621077 (500)	total: 3m 14s	remaining: 3m 13s
999:	learn: 1.7517413	test: 1.7468925	best: 1.7468925 (999)	total: 5m 52s	remaining: 0us

bestTest = 1.746892522
bestIteration = 999

1:	loss: 1.7468925	best: 1.7468925 (1)	total: 11m 35s	remaining: 2h 24m 51s
0:	learn: 3.3725306	test: 3.3804259	best: 3.3804259 (0)	total: 586ms	remaining: 9m 45s
500:	learn: 1.7529916	test: 1.7478288	best: 1.7478288 (500)	total: 2m 41s	remaining: 2m 40s
999:	learn: 1.7308238	test: 1.7362405	best:

In [None]:
# Hyper-parameters used for the cross-validation run
params = dict(
    iterations=1000,
    depth=10,
    learning_rate=0.1,
    l2_leaf_reg=5,
    loss_function='RMSE',
    verbose=100,
    random_seed=42,
)

# Pool over the complete dataset, with the categorical columns declared
full_data = Pool(X, y, cat_features=categorical_features)

# 5-fold cross-validation with shuffled, non-stratified folds
cv_results = cv(
    pool=full_data,
    params=params,
    fold_count=5,
    type='Classical',  # 'Classical' or 'TimeSeries', depending on the nature of the data
    shuffle=True,
    partition_random_seed=42,
    stratified=False,
    plot=True,
    verbose=True,
)

# Lowest mean test RMSE across the iterations
best_cv_score = np.min(cv_results['test-RMSE-mean'])
print(f'Best CV RMSE: {best_cv_score}')


In [123]:
# Hyper-parameters chosen for the final model
final_model_params = dict(
    iterations=1000,
    depth=10,
    learning_rate=0.1,
    l2_leaf_reg=5,
    loss_function='RMSE',
    verbose=100,
    random_seed=42,
)
final_model = CatBoostRegressor(**final_model_params)

# Fit on every available row (no hold-out split is used here)
final_model.fit(X, y, cat_features=categorical_features)

0:	learn: 2.5161162	total: 1.06s	remaining: 17m 39s
100:	learn: 1.7520803	total: 2m 15s	remaining: 20m 7s
200:	learn: 1.7274859	total: 5m	remaining: 19m 56s
300:	learn: 1.7100155	total: 7m 59s	remaining: 18m 32s
400:	learn: 1.6961172	total: 10m 53s	remaining: 16m 15s
500:	learn: 1.6864413	total: 13m 52s	remaining: 13m 49s
600:	learn: 1.6771669	total: 16m 55s	remaining: 11m 13s
700:	learn: 1.6681314	total: 20m 20s	remaining: 8m 40s
800:	learn: 1.6591236	total: 23m 6s	remaining: 5m 44s
900:	learn: 1.6523933	total: 26m 28s	remaining: 2m 54s
999:	learn: 1.6455847	total: 29m 45s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x328e213d0>

In [15]:
import pickle

# Persist the fitted final model as a pickle file under app/
model_path = 'app/sales_catboost-0.1.0.pkl'
with open(model_path, mode='wb') as model_file:
    pickle.dump(final_model, model_file)

In [166]:
# Load the 2024 rows to score and keep an untouched copy of the raw frame
data_2024 = pd.read_csv('data/data_2024_pred.csv')
data_2024_fechas = data_2024.copy()

# Use the predicted passenger counts as the 'Passengers' feature, then
# discard the source column together with 'Bookings'
drop = ['Predicted_Passengers', 'Bookings']
data_2024 = data_2024.assign(Passengers=data_2024['Predicted_Passengers']).drop(columns=drop)

In [167]:
def data_preprocessing_pred(data):
    """Build the feature frame for prediction (no target column involved).

    Parses ``STD`` as a datetime and derives the calendar features
    ``holidays``, ``month``, ``day_of_week``, ``hour`` and ``minute`` from it,
    then drops ``STA``, ``Flight_ID`` and ``Aeronave``. ``STD`` itself is kept
    in the result.

    The caller's frame is not modified: all work happens on a copy.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to score. Must contain ``STD``, ``STA``, ``Flight_ID`` and
        ``Aeronave``.

    Returns
    -------
    pandas.DataFrame
        The input columns minus the dropped ones, plus the derived calendar
        features.

    Raises
    ------
    KeyError
        If ``STD`` or any of the columns to drop is missing from ``data``.
    """
    # Work on a copy so the caller's DataFrame is not mutated as a side effect
    data = data.copy()

    # Make sure the STD column is in datetime format before using the .dt accessor
    std = pd.to_datetime(data['STD'])
    data['STD'] = std

    # 1 when the STD date is in the holiday calendar, 0 otherwise
    data['holidays'] = std.apply(is_holiday).astype(int)
    data['month'] = std.dt.month
    data['day_of_week'] = std.dt.dayofweek
    data['hour'] = std.dt.hour
    data['minute'] = std.dt.minute

    # Remove the fields that are not fed to the model
    X = data.drop(['STA', 'Flight_ID', 'Aeronave'], axis=1)
    return X

In [168]:
# Replace the 2024 frame with its preprocessed version (calendar features added)
data_2024 = data_preprocessing_pred(data=data_2024)

In [169]:
# Product names to generate predictions for.
# Several entries differ from another entry only by trailing whitespace
# (e.g. 'Jw Red Label ' and 'Jw Red Label'); they are kept exactly as written.
products = [
    'Carne Seca Habanero',
    'Cheetos',
    'Ruffles Queso',
    'Coca Sin Azucar',
    'Jack And Coke',
    'Sprite',
    'Nissin Res',
    'Tecate Light',
    'Mafer Sin Sal',
    'Coca Cola Regular',
    'Ron Bacardi',
    'Arcoiris',
    'Cafe 19 Chiapas',
    'Sabritas Originales',
    'Xx Lager',
    'Jugo De Manzana',
    'Agua Natural 600 Ml',
    'Cafe Costa',
    'Amstel Ultra',
    'Panini Clasico',
    'Fanta De Naranja',
    'Nishikawa Japones',
    'Sabritas Flamin Hot',
    'Jw Red Label ',
    'Ciel Mineralizada',
    'Jugo De Mango',
    'Sidral Mundet',
    'Coca Cola Dieta',
    'Chokis',
    'Tostitos',
    'Mega Cuerno Clasico',
    'Doritos Nacho',
    'Fritos Limon Y Sal',
    'Corajillo Baileys ',
    'Quaker Avena Frutos Rojos',
    'Nutty Berry Mix',
    'Heineken Original',
    'Vino Tinto Sangre De Toro',
    'Luxury Nut Mix',
    'Salsa Botanera',
    'Jw Red Label',
    'Nissin Picante',
    'Heineken Silver',
    'Leche De Fresa Sc',
    'Cheetos Flamin Hot',
    'Emperador Chocolate',
    'Cuerno Clasico De Pavo',
    'Nissin Dark Dragon',
    'Nissin Fuego',
    'Panini Integral',
    'Cafe 19 Capuchino',
    'Te Manzanilla Jengibre',
    'Xx Ultra',
    'Sol Clamato',
    'Go Nuts',
    'Muffin Integral',
    'Dip De Queso',
    'Hazme Doble',
    'Baileys',
    'Nishikawa Salado',
    'Corajillo',
    'Quaker Granola',
    'Tequila 7 Leguas Reposado',
    'Emperador Vainilla',
    'Leche De Chocolate Sc',
    'Arandano Mango Mix',
    'Topochico Seltzer Mango',
    'Rancheritos',
    'Baileys ',
    'Protein Adventure',
    'Tequila 7 Leguas Blanco',
    'Nueces De Arbol Mix',
    'Cafe De Olla',
    'Te Vainilla',
    'Tostitos Nachos Con Dip',
    'Frutos Secos Enchilados',
    'Hsbc-Viva',
    'Ultra Seltzer Frambuesa',
    'Arandano',
    'Te Frutos Rojos',
    'Vino Tinto Cria Cuervos',
    'Carne Seca Original',
    'Te Relax',
    'Vino Blanco Cria Cuervos ',
    'Topochico Seltzer Fresa-Guayaba',
    'Galleta De Arandano Relleno De Q/Crema',
    'Galleta De Chispas De Chocolate',
    'Promo Hsbc 1 Bebida Gratis',
    'Galleta De Chocolate',
    'Cerveza Charter',
    'Eco Holder',
    'Cafe 19 Cafe Clasico',
    'Gomita Enchilada La Cueva',
    'Maxi Combo',
    'Heineken 0',
    'Combo Stl',
    'Kacang Flaming Hot',
    'Licor Charter',
    'Quaker Avena Moras',
    'Quaker Natural Balance',
    'Nissin Limon Y Habanero',
]

In [170]:
# Repeat every 2024 row once per product (len(products) copies of each row)
product_count = len(products)
repeated_df = data_2024.loc[data_2024.index.repeat(product_count)]
repeated_df = repeated_df.reset_index(drop=True)

# Cycle through the product list so each repeated row gets one product,
# then look up the matching product type
repeated_df['ProductName'] = products * len(data_2024)
repeated_df['ProductType'] = repeated_df['ProductName'].map(product_type)


In [171]:
# Set the STD column aside, then remove it from the frame
date_col = repeated_df['STD']
repeated_df = repeated_df.drop(columns=['STD'])

In [172]:
# Cast the listed feature columns to pandas 'category' dtype
cat_features = ['DepartureStation', 'ArrivalStation', 'Destination_Type', 'Origin_Type', 'tipo_vuelo', 'ProductName', 'ProductType']
repeated_df = repeated_df.astype({column: 'category' for column in cat_features})

# Select the model input columns in a fixed order
model_columns = [
    'ProductType', 'ProductName', 'Capacity', 'DepartureStation',
    'ArrivalStation', 'Destination_Type', 'Origin_Type', 'Passengers',
    'tipo_vuelo', 'holidays', 'month', 'day_of_week', 'hour', 'minute',
]
repeated_df = repeated_df[model_columns]

# Score every (row, product) combination with the final model
predictions = final_model.predict(repeated_df)


In [173]:
import numpy as np

# Round each prediction down to a whole number, then clamp at zero:
# the regressor's output is unbounded, so a slightly negative prediction
# would otherwise floor to a negative quantity (e.g. -0.3 -> -1)
predictions = np.maximum(np.floor(predictions), 0)

In [177]:
# Append the predicted quantities and put the saved STD column back
repeated_df = repeated_df.assign(Quantity=predictions, STD=date_col)

In [178]:
# Inspect the dimensions (rows, columns) of the prediction frame
repeated_df.shape

(1025049, 16)

In [180]:
# Write the 2024 predictions to CSV, leaving out the index column
repeated_df.to_csv(path_or_buf='data/sales_predictions_2024.csv', index=False)