In [1]:
import glob
from catboost.utils import get_gpu_device_count
import pandas as pd
import numpy as np
import random
from sklearn.metrics import mean_absolute_error

In [113]:
from sklearn.model_selection import train_test_split

In [3]:
is_gpu_available = get_gpu_device_count()
device = 'GPU' if is_gpu_available else 'CPU'

device

'CPU'

In [5]:
np.random.seed(123)
random.seed(123)

In [7]:
import openpyxl

In [9]:
df_airports = pd.read_excel("airports.xlsx")

In [11]:
cities = df_airports['airport'].tolist()
cities = list(map(str.strip, cities))

In [13]:
n_samples = 300000

In [15]:
date_range = pd.date_range('2022-01-01', '2024-12-31', freq='D')

In [17]:
flights = {
    'flight_number': [f'FL{random.randint(100, 999)}' for _ in range(n_samples)],
    'departure_city': np.random.choice(cities, n_samples),
    'arrival_city': np.random.choice(cities, n_samples),
    'departure_time': pd.Timestamp(np.random.choice(date_range)).strftime("%Y-%m-%d"),
    'departure_hour': np.random.randint(0, 24, n_samples)
}
flights_df = pd.DataFrame(flights)

Unnamed: 0,flight_number,departure_city,arrival_city,departure_time,departure_hour
0,FL153,Томск (Богашево),Абакан,2022-11-12,22
1,FL374,Иркутск,Магадан,2022-11-12,2
2,FL189,Санкт-Петербург (Пулково),Якутск,2022-11-12,13
3,FL887,Остафьево,Красноярск (Емельяново),2022-11-12,13
4,FL517,Ханты-Мансийск,Нижнекамск (Бегишево),2022-11-12,16
...,...,...,...,...,...
299995,FL919,Элиста,Махачкала,2022-11-12,0
299996,FL832,Магадан,Тобольск (Ремезов),2022-11-12,0
299997,FL678,Ульяновск (Баратаевка),Астрахань (Нариманово),2022-11-12,6
299998,FL682,Махачкала,Екатеринбург (Кольцово),2022-11-12,16


In [21]:
flights_df['departure_time'] = pd.to_datetime(flights_df['departure_time'])

In [23]:
flights_df['is_weekend'] = flights_df['departure_time'].dt.weekday.isin([5, 6]).astype(int)  # 1, если сб/вс
flights_df['season'] = flights_df['departure_time'].dt.month.map(lambda m: 1 if m in [12, 1, 2] else
                                                                            2 if m in [3, 4, 5] else
                                                                            3 if m in [6, 7, 8] else 4)
flights_df['day_of_week'] = flights_df['departure_time'].dt.weekday  # 0 - Пн, 6 - Вс

In [81]:
flights_df

Unnamed: 0,flight_number,departure_city,arrival_city,departure_time,departure_hour,is_weekend,season,day_of_week
0,FL153,Томск (Богашево),Абакан,2022-11-12,22,1,4,5
1,FL374,Иркутск,Магадан,2022-11-12,2,1,4,5
2,FL189,Санкт-Петербург (Пулково),Якутск,2022-11-12,13,1,4,5
3,FL887,Остафьево,Красноярск (Емельяново),2022-11-12,13,1,4,5
4,FL517,Ханты-Мансийск,Нижнекамск (Бегишево),2022-11-12,16,1,4,5
...,...,...,...,...,...,...,...,...
299995,FL919,Элиста,Махачкала,2022-11-12,0,1,4,5
299996,FL832,Магадан,Тобольск (Ремезов),2022-11-12,0,1,4,5
299997,FL678,Ульяновск (Баратаевка),Астрахань (Нариманово),2022-11-12,6,1,4,5
299998,FL682,Махачкала,Екатеринбург (Кольцово),2022-11-12,16,1,4,5


In [55]:
#данные про бронированию
total_bookings = np.random.randint(100, 250, size=n_samples)

k = 0.15  
t_0 = 30  

days_before = np.array([365, 90, 30, 14, 7, 1, 0])

booking_data = []
for total in total_bookings:
    bookings = total / (1 + np.exp(k * (days_before - t_0)))
    bookings = np.round(bookings).astype(int)
    
    noise = np.random.randint(0, 6)
    bookings += noise 

    booking_data.append(bookings)

df_booking = pd.DataFrame(booking_data, columns=[f'booking_{d}_d_before' for d in days_before])

In [79]:
df_booking

Unnamed: 0,booking_365_d_before,booking_90_d_before,booking_30_d_before,booking_14_d_before,booking_7_d_before,booking_1_d_before,booking_0_d_before
0,1,1,77,141,149,152,152
1,0,0,70,128,136,138,138
2,4,4,110,198,209,213,214
3,3,3,91,164,174,177,177
4,5,5,67,120,126,128,129
...,...,...,...,...,...,...,...
299995,5,5,107,192,203,206,207
299996,1,1,101,184,195,198,199
299997,5,5,93,167,177,180,180
299998,1,1,88,161,170,173,173


In [83]:
df = pd.concat([flights_df, df_booking], axis=1)

In [71]:
#данные про билетам

flights_info = {
    'percentage_cheap_fares': np.random.uniform(0.1, 0.8, n_samples), # доля невозвратных билетов
    'fare_y_avg': np.random.randint(4000, 20000, n_samples), #среднесуточная стоимость билета эконом класса за 365 дней
    'fare_j_avg': np.random.randint(30000, 70000, n_samples), #среднесуточная стоимость билета бизнес класса за 365 дней
    
    'no_show_rate_y': np.random.uniform(0.02, 0.15, n_samples), # % неявки за все года (здесь и ниже - эконом)
    'no_show_rate_year_y': np.random.uniform(0.01, 0.14, n_samples), # за последний год
    'no_show_rate_season_y': np.random.uniform(0.03, 0.16, n_samples), # за соотв. сезон
    'no_show_rate_week_y': np.random.uniform(0.02, 0.14, n_samples), # за пред.неделю
    'cancel_rate_y': np.random.uniform(0.01, 0.03, n_samples), # % отмен
    'cancel_rate_year_y': np.random.uniform(0.01, 0.03, n_samples), # за последний год
    'cancel_rate_season_y': np.random.uniform(0.01, 0.03, n_samples),
    'cancel_rate_week_y': np.random.uniform(0.02, 0.04, n_samples),

    'no_show_rate_j': np.random.uniform(0.01, 0.1, n_samples), # % неявки за все года (здесь и ниже - бизнес)
    'no_show_rate_year_j': np.random.uniform(0.01, 0.9, n_samples), # за последний год
    'no_show_rate_season_j': np.random.uniform(0.02, 0.8, n_samples), # за соотв. сезон
    'no_show_rate_week_j': np.random.uniform(0.01, 0.8, n_samples), # за пред.неделю
    'cancel_rate_j': np.random.uniform(0.01, 0.02, n_samples), # % отмен
    'cancel_rate_year_j': np.random.uniform(0.01, 0.03, n_samples), # за последний год
    'cancel_rate_season_j': np.random.uniform(0.01, 0.03, n_samples),
    'cancel_rate_week_j': np.random.uniform(0.01, 0.02, n_samples),
    
    'overbooking_rate_avg': np.random.uniform(0.03, 0.12, n_samples), #средний % овербукинга за все года
    'overbooking_rate_last_year': np.random.uniform(0.03, 0.12, n_samples), # за последний год
}

df_flights_info = pd.DataFrame(flights_info)


In [87]:
df = pd.concat([df, df_flights_info], axis=1)

In [91]:
df = df[df['departure_city'] != df['arrival_city']]

In [93]:
df["target"] = df["overbooking_rate_avg"] * np.random.uniform(0.8, 1.2, len(df))

df.head()

Unnamed: 0,flight_number,departure_city,arrival_city,departure_time,departure_hour,is_weekend,season,day_of_week,booking_365_d_before,booking_90_d_before,...,no_show_rate_year_j,no_show_rate_season_j,no_show_rate_week_j,cancel_rate_j,cancel_rate_year_j,cancel_rate_season_j,cancel_rate_week_j,overbooking_rate_avg,overbooking_rate_last_year,target
0,FL153,Томск (Богашево),Абакан,2022-11-12,22,1,4,5,1,1,...,0.674394,0.476997,0.187654,0.015379,0.02543,0.012389,0.016517,0.090651,0.075712,0.09761
1,FL374,Иркутск,Магадан,2022-11-12,2,1,4,5,0,0,...,0.495875,0.796406,0.126625,0.012119,0.012158,0.027855,0.012245,0.075994,0.069963,0.077802
2,FL189,Санкт-Петербург (Пулково),Якутск,2022-11-12,13,1,4,5,4,4,...,0.760495,0.319231,0.509361,0.015153,0.017045,0.019518,0.010093,0.04545,0.074966,0.046933
3,FL887,Остафьево,Красноярск (Емельяново),2022-11-12,13,1,4,5,3,3,...,0.685695,0.290011,0.574078,0.018985,0.012502,0.016529,0.013847,0.040517,0.037269,0.03568
4,FL517,Ханты-Мансийск,Нижнекамск (Бегишево),2022-11-12,16,1,4,5,5,5,...,0.242368,0.069495,0.575721,0.017879,0.022106,0.013723,0.012808,0.107953,0.110926,0.125383


In [None]:
!pip install -U lightautoml
#!pip install --upgrade pip

In [135]:
import torch
from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML

from lightautoml.tasks import Task
import os
import time


In [105]:
N_THREADS = 4
N_FOLDS = 7
RANDOM_STATE = 123
TIMEOUT = 60 * 5 #should be increased in reality!
TARGET_NAME = 'target'

In [107]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
seed_everything(RANDOM_STATE)

In [109]:
torch.set_num_threads(N_THREADS)

In [115]:
tr_data, te_data = train_test_split(
    df,
    test_size=0.2, 
    random_state=RANDOM_STATE
)

print(f'Data splitted. Parts sizes: tr_data = {tr_data.shape}, te_data = {te_data.shape}')

tr_data.head()

Data splitted. Parts sizes: tr_data = (237157, 37), te_data = (59290, 37)


Unnamed: 0,flight_number,departure_city,arrival_city,departure_time,departure_hour,is_weekend,season,day_of_week,booking_365_d_before,booking_90_d_before,...,no_show_rate_year_j,no_show_rate_season_j,no_show_rate_week_j,cancel_rate_j,cancel_rate_year_j,cancel_rate_season_j,cancel_rate_week_j,overbooking_rate_avg,overbooking_rate_last_year,target
173689,FL501,Самара (Курумоч),Петрозаводск (Бесовец),2022-11-12,6,1,4,5,5,5,...,0.508466,0.353778,0.199197,0.010542,0.022062,0.02491,0.017925,0.099343,0.084563,0.116684
295554,FL426,Симферополь,Курган,2022-11-12,16,1,4,5,4,4,...,0.07909,0.038437,0.453731,0.012124,0.017443,0.018762,0.016967,0.03846,0.085713,0.040672
158074,FL382,Сыктывкар,Самара (Курумоч),2022-11-12,7,1,4,5,2,2,...,0.676708,0.654318,0.525856,0.015391,0.026522,0.016948,0.017826,0.071445,0.078354,0.0641
7293,FL622,Нижний Новгород (Стригино),Анадырь (Угольный),2022-11-12,17,1,4,5,3,3,...,0.13934,0.235491,0.105949,0.011155,0.016006,0.021225,0.017976,0.103985,0.083686,0.090278
230018,FL213,Анапа (Витязево),Москва (Шереметьево),2022-11-12,6,1,4,5,1,1,...,0.324539,0.618677,0.732255,0.010525,0.024324,0.025411,0.013208,0.077217,0.032084,0.088303


In [117]:
task = Task(
    'reg', 
    loss='mse',
    metric='mae'
)

In [123]:
roles = {
    'target': TARGET_NAME,
    'category': ['departure_city', 'arrival_city', 'is_weekend', 'season', 'day_of_week'],
    'date': ['departure_time'],
    'drop': ['flight_number	']
}

In [139]:
utilized_automl = TabularUtilizedAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params={'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
)

In [141]:
%%time 
oof_pred = utilized_automl.fit_predict(tr_data, roles=roles, verbose=3)

[21:31:35] Start automl [1mutilizator[0m with listed constraints:
[21:31:35] - time: 300.00 seconds
[21:31:35] - CPU: 4 cores
[21:31:35] - memory: 16 GB

[21:31:35] [1mIf one preset completes earlier, next preset configuration will be started[0m

[21:31:35] Start 0 automl preset configuration:
[21:31:35] [1mC:\Users\Legion\anaconda3\Lib\site-packages\lightautoml\automl\presets\tabular_configs\conf_0_sel_type_0.yml[0m, random state: {'reader_params': {'random_state': 42}, 'nn_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
[21:31:35] Found reader_params in kwargs, need to combine
[21:31:35] Merged variant for reader_params = {'n_jobs': 4, 'cv': 7, 'random_state': 42}
[21:31:35] Stdout logging level is INFO3.
[21:31:35] Task: reg

[21:31:35] Start automl preset with listed constraints:
[21:31:35] - time: 300.00 seconds
[21:31:35] - CPU: 4 cores
[21:31:35] - memory: 16 GB

[21:31:35] [1mTrain data shape: (237157, 37)[0m

[21:31:43] Feats was rej

  return function_base._ureduce(a, func=_nanmedian, keepdims=keepdims,


[21:31:49] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[21:31:49] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m =====
[21:31:50] Linear model: C = 1e-05 score = -0.007861908570942737
[21:31:50] Linear model: C = 5e-05 score = -0.007536912903224573
[21:31:50] Linear model: C = 0.0001 score = -0.0075290459086676765
[21:31:50] Linear model: C = 0.0005 score = -0.007514635241807351
[21:31:50] Linear model: C = 0.001 score = -0.0075063824014011595
[21:31:50] Linear model: C = 0.005 score = -0.007500615832184484
[21:31:50] Linear model: C = 0.01 score = -0.007500615832184484
[21:31:50] Linear model: C = 0.05 score = -0.0074969647451276905
[21:31:50] Linear model: C = 0.1 score = -0.0074969647451276905
[21:31:50] Linear model: C = 0.5 score = -0.0074969647451276905
[21:31:50] ===== Start working with [1mfold 1[0m for [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m =====
[21:31:51] Linear model: C = 1e-05 score = -0.007811804255352643
[21:31:51] Linear model: 

In [142]:
print(utilized_automl.create_model_str_desc())

Final prediction for new objects = 
	1.00000 * 1 averaged models with config = "C:\Users\Legion\anaconda3\Lib\site-packages\lightautoml\automl\presets\tabular_configs\conf_0_sel_type_0.yml" and different CV random_states. Their structures: 

	    Model #0.
		Final prediction for new objects (level 0) = 
			 0.18709 * (7 averaged models Lvl_0_Pipe_0_Mod_0_LinearL2) +
			 0.41237 * (7 averaged models Lvl_0_Pipe_1_Mod_0_LightGBM) +
			 0.29180 * (7 averaged models Lvl_0_Pipe_1_Mod_1_Tuned_LightGBM) +
			 0.05437 * (7 averaged models Lvl_0_Pipe_1_Mod_2_CatBoost) +
			 0.05437 * (7 averaged models Lvl_0_Pipe_1_Mod_3_Tuned_CatBoost) 





In [143]:
te_pred = utilized_automl.predict(te_data)
print(f'Prediction for te_data:\n{te_pred}\nShape = {te_pred.shape}')

Prediction for te_data:
array([[0.07460571],
       [0.06749522],
       [0.0423739 ],
       ...,
       [0.07041833],
       [0.10908514],
       [0.05473463]], dtype=float32)
Shape = (59290, 1)


In [144]:
print(f'OOF score: {mean_absolute_error(tr_data[TARGET_NAME].values, oof_pred.data[:, 0])}')
print(f'HOLDOUT score: {mean_absolute_error(te_data[TARGET_NAME].values, te_pred.data[:, 0])}')

OOF score: 0.007488128631771307
HOLDOUT score: 0.007478365193945785


In [147]:
#import joblib

#joblib.dump(utilized_automl, "automl_model_final.pkl")

['automl_model_final.pkl']

Для ввода тестовых данных менеджером:

In [167]:
column_list = [
    'booking_365_d_before', 'booking_90_d_before', 
    'booking_30_d_before', 'booking_14_d_before', 'booking_7_d_before', 'booking_1_d_before',
    'booking_0_d_before', 'percentage_cheap_fares', 'fare_y_avg', 'fare_j_avg', 'no_show_rate_y',
    'no_show_rate_year_y', 'no_show_rate_season_y', 'no_show_rate_week_y', 'cancel_rate_y', 
    'cancel_rate_year_y', 'cancel_rate_season_y', 'cancel_rate_week_y', 'no_show_rate_j', 
    'no_show_rate_year_j', 'no_show_rate_season_j', 'no_show_rate_week_j', 'cancel_rate_j', 
    'cancel_rate_year_j', 'cancel_rate_season_j', 'cancel_rate_week_j', 'overbooking_rate_avg',
    'overbooking_rate_last_year'
]

In [169]:
def create_test_df(df, departure_city, arrival_city, departure_time, departure_hhmm, column_list):
    pair_exists = ((df['departure_city'] == departure_city) & (df['arrival_city'] == arrival_city)).any()
    
    departure_hour = int(str(departure_hhmm).zfill(4)[:2])
    
    test_df = pd.DataFrame({
        'departure_city': [departure_city],
        'arrival_city': [arrival_city],
        'departure_time': [pd.to_datetime(departure_time)],
        'departure_hhmm': [departure_hour]
    })
    
    test_df['is_weekend'] = test_df['departure_time'].dt.weekday.isin([5, 6]).astype(int)
    test_df['season'] = test_df['departure_time'].dt.month.map(lambda m: 1 if m in [12, 1, 2] else
                                                                            2 if m in [3, 4, 5] else
                                                                            3 if m in [6, 7, 8] else 4)
    test_df['day_of_week'] = test_df['departure_time'].dt.weekday
    
    # источник средних значений
    if pair_exists:
        historical_data = df[(df['departure_city'] == departure_city) & (df['arrival_city'] == arrival_city)]
    else:
        historical_data = df[df['departure_city'] == departure_city]
    
    for col in column_list:
        if col in df.columns:
            test_df[col] = historical_data[col].mean() * 0.75
    
    return test_df

In [None]:
te_df = create_test_df(df, departure_city, arrival_city, departure_time, departure_hhmm, column_list)

In [None]:
pred = utilized_automl.predict(te_df).data.flatten()
print(pred)