In [60]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import holidays
import pickle


In [61]:
data = pd.read_csv('data/flights_2023_clean.csv')

In [None]:
# For United States holidays (change the country accordingly)
mx_holidays = holidays.Mexico()

# Function to determine if a date is a holiday
def is_holiday(date):
    return date in mx_holidays


def data_preprocessing(data):
    data['STD'] = pd.to_datetime(data['STD'])  # Make sure the STD column is in datetime format
    # Create a new column 'holidays' based on whether the date in 'STD' is a holiday
    data['holidays'] = data['STD'].apply(is_holiday).astype(int)  # Convert boolean to int (1 for True, 0 for False)
    data['STD'] = pd.to_datetime(data['STD'])
    data['month'] = data['STD'].dt.month
    data['day_of_week'] = data['STD'].dt.dayofweek
    data['hour'] = data['STD'].dt.hour
    data['minute'] = data['STD'].dt.minute  # Extracting minute
    # Define categorical features for CatBoost
    # Splitting the data
    X = data.drop(['Passengers', 'Flight_ID', 'STD', 'STA', 'Bookings'], axis=1)
    y = data['Passengers']
    return X, y



In [64]:
categorical_features = ['Aeronave', 'DepartureStation', 'ArrivalStation', 'Destination_Type', 'Origin_Type', 'tipo_vuelo', 'month', 'day_of_week', 'hour', 'minute', 'holidays']


X, y = data_preprocessing(data)

# Train, validation, test split
X_train_valid, X_test, y_train_valid, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    random_state=2023, 
    shuffle=True
)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid, 
    y_train_valid, 
    test_size=0.25, 
    random_state=2023, 
    shuffle=True
)

# Convert to pandas DataFrame
X_train = pd.DataFrame(X_train, columns=X.columns)
X_valid = pd.DataFrame(X_valid, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)
y_train = pd.DataFrame(y_train, columns=['Passengers'])
y_valid = pd.DataFrame(y_valid, columns=['Passengers'])
y_test = pd.DataFrame(y_test, columns=['Passengers'])

# Initialize CatBoostRegressor
model = CatBoostRegressor(cat_features=categorical_features, verbose=500, iterations=1000)

# Train the model
model.fit(X_train, y_train, eval_set=(X_valid, y_valid))

# Evaluate the model on the test set
predictions = model.predict(X_test)
rmse = mean_squared_error(y_test, predictions, squared=False)
mae = mean_absolute_error(y_test, predictions)

print(f'RMSE: {rmse}')
print(f'MAE: {mae}')


Learning rate set to 0.099517
0:	learn: 33.6400726	test: 33.7666889	best: 33.7666889 (0)	total: 25.4ms	remaining: 25.3s
500:	learn: 21.1668984	test: 21.0915517	best: 21.0915517 (500)	total: 11.2s	remaining: 11.1s
999:	learn: 20.4066118	test: 20.7792852	best: 20.7792852 (999)	total: 19.9s	remaining: 0us

bestTest = 20.77928521
bestIteration = 999

RMSE: 20.675954724306376
MAE: 15.156876521236503


In [65]:
# Get feature importances
feature_importances = model.get_feature_importance(prettified=True)
print("Feature Importances:")
print(feature_importances)


Feature Importances:
          Feature Id  Importances
0           Capacity    30.213672
1   DepartureStation    14.694154
2     ArrivalStation    12.944274
3        Origin_Type    10.208220
4              month     9.558894
5        day_of_week     6.753837
6               hour     6.034929
7   Destination_Type     4.228986
8           Aeronave     2.092721
9             minute     2.059588
10        tipo_vuelo     0.891429
11          holidays     0.319296


In [66]:
from catboost import CatBoostRegressor, Pool, cv

# Prepare the Pool object
train_pool = Pool(X_train, y_train, cat_features=categorical_features)
valid_pool = Pool(X_valid, y_valid, cat_features=categorical_features)

# Initialize a new CatBoostRegressor instance specifically for tuning
tuning_model = CatBoostRegressor(loss_function='RMSE', verbose=500)

# Define the parameter grid
param_grid = {
    'depth': [6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'l2_leaf_reg': [1, 3, 5]
}

# Perform grid search
results = tuning_model.grid_search(param_grid, train_pool, cv=3, partition_random_seed=42, stratified=False, verbose=True, plot=True)

# Best parameters
print("Best Parameters:", results['params'])


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 184.5516686	test: 184.3021378	best: 184.3021378 (0)	total: 14.2ms	remaining: 14.2s
500:	learn: 24.1945329	test: 23.7069293	best: 23.7069293 (500)	total: 5.5s	remaining: 5.48s
999:	learn: 22.9399568	test: 22.3864605	best: 22.3864605 (999)	total: 13.3s	remaining: 0us

bestTest = 22.38646051
bestIteration = 999

0:	loss: 22.3864605	best: 22.3864605 (0)	total: 13.3s	remaining: 5m 46s
0:	learn: 177.2836493	test: 177.0372854	best: 177.0372854 (0)	total: 11.9ms	remaining: 11.9s
500:	learn: 21.8744935	test: 21.6158867	best: 21.6158867 (500)	total: 7.87s	remaining: 7.84s
999:	learn: 21.1509962	test: 21.2843462	best: 21.2843462 (999)	total: 17.5s	remaining: 0us

bestTest = 21.2843462
bestIteration = 999

1:	loss: 21.2843462	best: 21.2843462 (1)	total: 30.9s	remaining: 6m 25s
0:	learn: 168.2102864	test: 167.9677499	best: 167.9677499 (0)	total: 24.3ms	remaining: 24.2s
500:	learn: 21.2027547	test: 21.3151937	best: 21.3150793 (499)	total: 8.7s	remaining: 8.66s
999:	learn: 20.2742720	test: 

In [127]:
# Define your optimal parameters
params = {
    'iterations': 1000,
    'depth': 8,
    'learning_rate': 0.1,
    'l2_leaf_reg': 1,
    'loss_function': 'RMSE',
    'verbose': 100,
    'random_seed': 42
}

# Prepare the complete dataset as a Pool, including categorical features if any
full_data = Pool(data=X, label=y, cat_features=categorical_features)

# Execute cross-validation
cv_results = cv(
    params=params,
    pool=full_data,
    fold_count=5,
    type='Classical',  # Choose 'Classical' or 'TimeSeries' based on your data nature
    shuffle=True,
    partition_random_seed=42,
    plot=True,
    stratified=False,
    verbose=True
)

# Extract and print results
best_cv_score = np.min(cv_results['test-RMSE-mean'])
print(f'Best CV RMSE: {best_cv_score}')


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]
0:	learn: 168.2291873	test: 168.1040226	best: 168.1040226 (0)	total: 71.8ms	remaining: 1m 11s
1:	learn: 152.1345457	test: 152.0111807	best: 152.0111807 (1)	total: 104ms	remaining: 52s
2:	learn: 137.5030094	test: 137.3876625	best: 137.3876625 (2)	total: 139ms	remaining: 46.1s
3:	learn: 124.3957272	test: 124.2885163	best: 124.2885163 (3)	total: 169ms	remaining: 42.1s
4:	learn: 112.6680135	test: 112.5676870	best: 112.5676870 (4)	total: 185ms	remaining: 36.9s
5:	learn: 102.1844979	test: 102.0911901	best: 102.0911901 (5)	total: 213ms	remaining: 35.2s
6:	learn: 92.8290727	test: 92.7415757	best: 92.7415757 (6)	total: 223ms	remaining: 31.6s
7:	learn: 84.4955536	test: 84.4139037	best: 84.4139037 (7)	total: 243ms	remaining: 30.2s
8:	learn: 77.0868159	test: 77.0110005	best: 77.0110005 (8)	total: 269ms	remaining: 29.7s
9:	learn: 70.4738232	test: 70.3539956	best: 70.3539956 (9)	total: 313ms	remaining: 30.9s
10:	learn: 64.6949213	test: 64.5804782	best: 64.5804782 (10)	total: 3

In [128]:
# Initialize the model with the best parameters
final_model = CatBoostRegressor(
    iterations=1000,
    depth=10,
    learning_rate=0.1,
    l2_leaf_reg=3,
    loss_function='RMSE',
    verbose=100,
    random_seed=42
)

# Train the model on the full dataset
final_model.fit(X, y, cat_features=categorical_features)


0:	learn: 33.4037983	total: 99.4ms	remaining: 1m 39s
100:	learn: 20.7744111	total: 6.69s	remaining: 59.6s
200:	learn: 19.7325378	total: 13.3s	remaining: 53s
300:	learn: 19.0336617	total: 20.7s	remaining: 48.1s
400:	learn: 18.4469709	total: 27.4s	remaining: 41s
500:	learn: 17.9726154	total: 34s	remaining: 33.9s
600:	learn: 17.5534546	total: 41.3s	remaining: 27.4s
700:	learn: 17.1739509	total: 49.6s	remaining: 21.2s
800:	learn: 16.8028533	total: 56.6s	remaining: 14.1s
900:	learn: 16.4115538	total: 1m 3s	remaining: 7.01s
999:	learn: 16.0428745	total: 1m 10s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x17b7edd50>

In [129]:
data_2024 = pd.read_csv('data/data_2024.csv')

def data_preprocessing_pred(data):
    data['STD'] = pd.to_datetime(data['STD'])  # Make sure the STD column is in datetime format
    # Create a new column 'holidays' based on whether the date in 'STD' is a holiday
    data['holidays'] = data['STD'].apply(is_holiday).astype(int)  # Convert boolean to int (1 for True, 0 for False)
    data['STD'] = pd.to_datetime(data['STD'])
    data['month'] = data['STD'].dt.month
    data['day_of_week'] = data['STD'].dt.dayofweek
    data['hour'] = data['STD'].dt.hour
    data['minute'] = data['STD'].dt.minute  # Extracting minute
    # Define categorical features for CatBoost
    # Splitting the data
    X = data.drop(['Passengers', 'Flight_ID', 'STD', 'STA', 'Bookings'], axis=1)
    return X


In [130]:
def imput_aeronave(data):
    # Dropping rows with any missing values except in 'Aeronave' column
    data_cleaned = data.dropna(subset=['DepartureStation', 'ArrivalStation', 'Destination_Type', 'Origin_Type'])

    # Analyze the distribution of 'Aeronave'
    aeronave_counts = data_cleaned['Aeronave'].value_counts()

    # Check if certain aircraft types are more likely to be used on specific routes or at specific times
    route_aeronave = data_cleaned.groupby(['DepartureStation', 'ArrivalStation', 'tipo_vuelo'])['Aeronave'].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else "Unknown")
    time_aeronave = data_cleaned.groupby(['STD', 'tipo_vuelo'])['Aeronave'].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else "Unknown")

    def impute_aeronave(row, route_aeronave, time_aeronave):
        # Try to impute based on route and flight type first
        if pd.isna(row['Aeronave']):
            route_key = (row['DepartureStation'], row['ArrivalStation'], row['tipo_vuelo'])
            if route_key in route_aeronave:
                return route_aeronave[route_key]
            
            # If route and type-based imputation fails, try time and type-based
            time_key = (row['STD'], row['tipo_vuelo'])
            if time_key in time_aeronave:
                return time_aeronave[time_key]
            
            # If both fail, return 'Unknown' (or could use the overall most common aircraft)
            return 'Unknown'
        else:
            return row['Aeronave']
    # Apply the imputation function to the data
    data_cleaned['Aeronave'] = data_cleaned.apply(impute_aeronave, axis=1, args=(route_aeronave, time_aeronave))
    
    return data_cleaned

In [131]:
data_2024_imputed = imput_aeronave(data_2024)

In [132]:
new_data = data_preprocessing_pred(data_2024_imputed)

In [133]:
new_data = new_data[new_data.month == 1]

In [134]:
new_data

Unnamed: 0,Aeronave,Capacity,DepartureStation,ArrivalStation,Destination_Type,Origin_Type,tipo_vuelo,holidays,month,day_of_week,hour,minute
17,XA-VXG,240,BM,AT,Ciudad Principal,Ciudad Fronteriza,Largo,0,1,6,18,25
30,XA-VYD,180,AO,AW,Ciudad Principal,Ciudad Principal,Corto,0,1,1,18,55
52,XA-VBW,220,AW,AD,MX Amigos y Familia,Ciudad Principal,Corto,0,1,1,15,5
64,XA-VAK,186,AO,BP,MX Amigos y Familia,Ciudad Principal,Corto,0,1,4,6,15
71,XA-VBJ,220,AK,BM,Ciudad Fronteriza,Playa,Corto,0,1,5,15,45
...,...,...,...,...,...,...,...,...,...,...,...,...
116671,XA-VXC,240,BB,AW,Ciudad Principal,Ecoturismo,Corto,0,1,6,22,15
116677,XA-VBB,240,BT,AW,Ciudad Principal,Playa,Corto,0,1,5,18,45
116678,XA-VAP,186,AD,BM,Ciudad Fronteriza,MX Amigos y Familia,Corto,0,1,2,6,55
116689,XA-VXJ,240,AO,BM,Ciudad Fronteriza,Ciudad Principal,Corto,0,1,5,15,5


In [135]:
# Load the model
with open('final_catboost_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Now you can use loaded_model to make predictions or further analysis
predictions = loaded_model.predict(new_data)

In [136]:
len(predictions)

10149

In [137]:
data_2024['STD'] = pd.to_datetime(data_2024['STD'])  # Make sure the STD column is in datetime format

data_2024_pred = data_2024[data_2024['STD'].dt.month == 1]


In [138]:
data_2024_pred['Predicted_Passengers'] = predictions




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [139]:
data_2024_pred

Unnamed: 0,Flight_ID,Aeronave,Capacity,DepartureStation,ArrivalStation,Destination_Type,Origin_Type,STD,STA,Passengers,Bookings,tipo_vuelo,Predicted_Passengers
17,000770f5475f74331c5f47970bae3e5e,XA-VXG,240,BM,AT,Ciudad Principal,Ciudad Fronteriza,2024-01-07 18:25:00,2024-01-07 23:50:00,,,Largo,212.417909
30,000ccecdaefd0bb3dcf9e3bfedd28416,XA-VYD,180,AO,AW,Ciudad Principal,Ciudad Principal,2024-01-30 18:55:00,2024-01-30 20:20:00,,,Corto,164.693556
52,0018fe498b5b61e2b320e880618de010,XA-VBW,220,AW,AD,MX Amigos y Familia,Ciudad Principal,2024-01-02 15:05:00,2024-01-02 16:20:00,,,Corto,134.979090
64,0021370a71862fc079c2e362ac6038cc,XA-VAK,186,AO,BP,MX Amigos y Familia,Ciudad Principal,2024-01-26 06:15:00,2024-01-26 07:45:00,,,Corto,162.779407
71,0024377f7159c78ede2dada632e2ceef,XA-VBJ,220,AK,BM,Ciudad Fronteriza,Playa,2024-01-06 15:45:00,2024-01-06 17:40:00,,,Corto,212.024828
...,...,...,...,...,...,...,...,...,...,...,...,...,...
116671,ffddb38f623a5de06d039215d7ce1004,XA-VXC,240,BB,AW,Ciudad Principal,Ecoturismo,2024-01-28 22:15:00,2024-01-29 00:15:00,,,Corto,204.021340
116677,ffdfc9bfd911d3d5e60d8628d2e192a5,XA-VBB,240,BT,AW,Ciudad Principal,Playa,2024-01-06 18:45:00,2024-01-06 20:20:00,,,Corto,175.707605
116678,ffe09b4fe9baf663de984507d178a589,XA-VAP,186,AD,BM,Ciudad Fronteriza,MX Amigos y Familia,2024-01-03 06:55:00,2024-01-03 08:10:00,,,Corto,184.866410
116689,ffe8d3b347d9e43a613e5084de3d0d22,XA-VXJ,240,AO,BM,Ciudad Fronteriza,Ciudad Principal,2024-01-13 15:05:00,2024-01-13 16:10:00,,,Corto,195.282163


In [140]:
data_2024_pred.to_csv('data/data_2024_pred.csv', index=False)

In [143]:
data.dtypes

Flight_ID                   object
Aeronave                    object
Capacity                     int64
DepartureStation            object
ArrivalStation              object
Destination_Type            object
Origin_Type                 object
STD                 datetime64[ns]
STA                         object
Passengers                 float64
Bookings                   float64
tipo_vuelo                  object
dtype: object

In [153]:
plane_XAVBV

Unnamed: 0,Flight_ID,ProductType,ProductName,Quantity,TotalSales,Aeronave,Capacity,DepartureStation,ArrivalStation,Destination_Type,Origin_Type,STD,STA,Passengers,Bookings,tipo_vuelo,Perecedero
1913939,000163f0df9cbfc35c4c06645ec512f6,Perecederos,Panini Clasico,1,115.0,XA-VBV,220,AW,BF,MX Amigos y Familia,Ciudad Principal,2023-10-29 06:05:00,2023-10-29 07:30:00,173.0,138.0,Corto,Perecedero
1913935,000163f0df9cbfc35c4c06645ec512f6,Perecederos,Panini Integral,1,115.0,XA-VBV,220,AW,BF,MX Amigos y Familia,Ciudad Principal,2023-10-29 06:05:00,2023-10-29 07:30:00,173.0,138.0,Corto,Perecedero
1913936,000163f0df9cbfc35c4c06645ec512f6,Bebidas Calientes,Cafe Costa,1,45.0,XA-VBV,220,AW,BF,MX Amigos y Familia,Ciudad Principal,2023-10-29 06:05:00,2023-10-29 07:30:00,173.0,138.0,Corto,No Perecedero
1913937,000163f0df9cbfc35c4c06645ec512f6,Botanas,Sabritas Originales,1,49.0,XA-VBV,220,AW,BF,MX Amigos y Familia,Ciudad Principal,2023-10-29 06:05:00,2023-10-29 07:30:00,173.0,138.0,Corto,No Perecedero
1913940,000163f0df9cbfc35c4c06645ec512f6,Bebidas Calientes,Cafe 19 Capuchino,1,45.0,XA-VBV,220,AW,BF,MX Amigos y Familia,Ciudad Principal,2023-10-29 06:05:00,2023-10-29 07:30:00,173.0,138.0,Corto,No Perecedero
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168672,136e775453bcf1788213da1bb6323882,Sopas,Nissin Fuego,1,45.0,XA-VBV,220,BN,AW,Ciudad Principal,MX Amigos y Familia,2023-10-29 22:10:00,2023-10-29 23:50:00,183.0,121.0,Corto,No Perecedero
168673,136e775453bcf1788213da1bb6323882,Refrescos,Coca Cola Regular,3,144.0,XA-VBV,220,BN,AW,Ciudad Principal,MX Amigos y Familia,2023-10-29 22:10:00,2023-10-29 23:50:00,183.0,121.0,Corto,No Perecedero
168674,136e775453bcf1788213da1bb6323882,Botanas,Nishikawa Japones,3,135.0,XA-VBV,220,BN,AW,Ciudad Principal,MX Amigos y Familia,2023-10-29 22:10:00,2023-10-29 23:50:00,183.0,121.0,Corto,No Perecedero
168663,136e775453bcf1788213da1bb6323882,Refrescos,Sprite,2,96.0,XA-VBV,220,BN,AW,Ciudad Principal,MX Amigos y Familia,2023-10-29 22:10:00,2023-10-29 23:50:00,183.0,121.0,Corto,No Perecedero
