In [47]:
import pickle
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, KBinsDiscretizer
from feature_engine.encoding import MeanEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

import mlflow
import optuna
from mlflow.models import infer_signature
from mlflow.tracking import MlflowClient
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from catboost import CatBoostClassifier

In [48]:
import warnings
warnings.filterwarnings("ignore")

In [51]:
df = pd.read_pickle("../data/cleaned_data.pkl")

In [52]:
df = df.drop(['Booking_ID', 'full_arrival_date'], axis=1)

In [53]:
X = df.drop(columns=['booking_status'])
y = df['booking_status'].replace({'Not_Canceled': 0, 'Canceled': 1})

In [54]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y)

In [55]:
X_train.shape, X_test.shape

((27120, 19), (9040, 19))

In [56]:
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
print('Числовые признаки:')
numerical_features

Числовые признаки:


['no_of_adults',
 'no_of_children',
 'no_of_weekend_nights',
 'no_of_week_nights',
 'lead_time',
 'no_of_previous_cancellations',
 'no_of_previous_bookings_not_canceled',
 'avg_price_per_room',
 'price_deviation']

In [57]:
categorical_features = df.select_dtypes(include=['category']).columns.tolist()
print('Категориальные признаки:')
categorical_features

Категориальные признаки:


['type_of_meal_plan',
 'required_car_parking_space',
 'room_type_reserved',
 'arrival_year',
 'arrival_month',
 'arrival_date',
 'market_segment_type',
 'repeated_guest',
 'no_of_special_requests',
 'is_weekend']

In [58]:
def calc_metrics(y_pred, y_proba, y_test):
    metrics = {
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred),
        "ROC AUC": roc_auc_score(y_test, y_proba),  
    }

    metrics = pd.DataFrame(metrics, index=["Score"]).T
    return metrics

### baseline

In [59]:
# Трансформация признаков
transformer = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', MeanEncoder(variables=categorical_features), categorical_features)  

    ]
)

# Определение модели
rf = RandomForestClassifier(random_state=42)

# Создание пайплайна
pipeline = Pipeline(steps=[
    ('transform', transformer),
    ('classification', rf) 
])
params = pipeline.get_params()

In [60]:
pipeline.fit(X_train, y_train)

In [61]:
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1] # вероятности для roc_auc

metrics = calc_metrics(y_pred, y_proba, y_test)
metrics

Unnamed: 0,Score
Precision,0.884998
Recall,0.80869
F1-Score,0.845125
ROC AUC,0.957597


Базовая модель дает неплохое качество

In [62]:
input_example = X_train.head(5)
signature = infer_signature(model_input=X_train.head(5))

In [63]:
mlflow.set_tracking_uri(uri="http://localhost:5000")

Логируем сигнатуру модель, пример входных данных, requirements.txt, значения метрик, параметры пайплайна обучения 

In [64]:
with mlflow.start_run(run_name='rf_model_baseline') as run:
    mlflow.sklearn.log_model(
        pipeline, 'rf_model_baseline', 
        signature=signature, 
        input_example=input_example,
        pip_requirements='../requirements.txt',
        registered_model_name='cancellation_prediction_models'
    )

    for metric, row in metrics.iterrows():
        mlflow.log_metric(metric, row['Score'])

    mlflow.log_params(params)

Successfully registered model 'cancellation_prediction_models'.
2024/12/08 12:42:51 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: cancellation_prediction_models, version 1
Created version '1' of model 'cancellation_prediction_models'.
Downloading artifacts: 100%|██████████| 7/7 [00:01<00:00,  5.59it/s] 
2024/12/08 12:42:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run rf_model_baseline at: http://localhost:5000/#/experiments/0/runs/8ebde46263b54923be1af15993a82730.
2024/12/08 12:42:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/0.


### feature engineering

In [65]:
# Тансформация признаков
poly_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler())
])

transformer = ColumnTransformer(
    transformers=[
        ('poly', poly_pipeline, ['lead_time', 'avg_price_per_room']),
        ('kbin', KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='uniform'), 
         ['lead_time', 'avg_price_per_room']),
        ('num', StandardScaler(), numerical_features),
        ('cat', MeanEncoder(variables=categorical_features), categorical_features)
    ]
)

# Создание пайплайна
pipeline = Pipeline(steps=[
    ('transform', transformer),
    ('classification', rf) 
])

params = pipeline.get_params()

Обучаем ColumnTransformer и сохраняем названий признаков в файл

In [66]:
X_train_fe_sklearn = transformer.fit_transform(X_train, y_train)

feature_columns = transformer.get_feature_names_out()
with open('../data/feature_columns.txt', 'w') as f:
    f.write("\n".join(feature_columns))

In [67]:
pipeline.fit(X_train, y_train)

In [68]:
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1] # вероятности для roc_auc

metrics = calc_metrics(y_pred, y_proba, y_test)
metrics

Unnamed: 0,Score
Precision,0.881057
Recall,0.808353
F1-Score,0.843141
ROC AUC,0.95475


Преобразования не увеличили качество

В логирование добавляем файл с названием признаков

In [69]:
with mlflow.start_run(run_name='rf_model_with_feature_engineering') as run:
    mlflow.sklearn.log_model(
        pipeline, 'rf_model_with_feature_engineering', 
        signature=signature, 
        input_example=input_example,
        pip_requirements='../requirements.txt'
    )

    for metric, row in metrics.iterrows():
        mlflow.log_metric(metric, row['Score'])

    mlflow.log_params(params)

    mlflow.log_artifact("../data/feature_columns.txt")

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00,  8.03it/s]   
2024/12/08 12:43:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run rf_model_with_feature_engineering at: http://localhost:5000/#/experiments/0/runs/9ecbf44089d5489f955b80d05fcd968f.
2024/12/08 12:43:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/0.


### feature selection

In [70]:
sfs = SFS(
    RandomForestClassifier(random_state=42),
    k_features=20,
    forward=True,
    floating=False,
    scoring='roc_auc',
    cv=2,
    n_jobs=-1
)

In [71]:
sfs.fit(X_train_fe_sklearn, y_train)

Создаем список с отобранными признаками

In [72]:
selected_features_idx = list(sfs.k_feature_idx_)
selected_features_names = [feature_columns[i] for i in selected_features_idx]
selected_features_names

['poly__lead_time avg_price_per_room',
 'poly__avg_price_per_room^2',
 'kbin__lead_time_1.0',
 'kbin__lead_time_2.0',
 'kbin__avg_price_per_room_0.0',
 'kbin__avg_price_per_room_1.0',
 'num__no_of_adults',
 'num__no_of_weekend_nights',
 'num__no_of_week_nights',
 'num__lead_time',
 'num__no_of_previous_cancellations',
 'cat__type_of_meal_plan',
 'cat__required_car_parking_space',
 'cat__room_type_reserved',
 'cat__arrival_year',
 'cat__arrival_month',
 'cat__arrival_date',
 'cat__market_segment_type',
 'cat__repeated_guest',
 'cat__no_of_special_requests']

Сохраняем названия и индексы отобранных признаков

In [73]:
with open("../data/selected_features_idx.txt", "w") as f:
    f.write("\n".join(map(str, selected_features_idx)))

with open("../data/selected_features_names.txt", "w") as f:
    f.write("\n".join(selected_features_names))

In [74]:
# Оставляем отобранные признаки
feature_selector = ColumnTransformer(
    transformers=[
        ('select_columns', 'passthrough', selected_features_idx)
    ], remainder='drop'
)

# Создание пайплайна
pipeline = Pipeline([
    ('transformer', transformer),
    ('feature_selection', feature_selector),
    ('model', rf)
])

params = pipeline.get_params()

In [75]:
pipeline.fit(X_train, y_train)

In [76]:
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1] # вероятности для roc_auc

metrics = calc_metrics(y_pred, y_proba, y_test)
metrics

Unnamed: 0,Score
Precision,0.882909
Recall,0.817784
F1-Score,0.849099
ROC AUC,0.958205


Качество выросло по сравнению с базовой моделью

В логирование добавляем файлы с названиями и индексами признаков

In [77]:
with mlflow.start_run(run_name='rf_model_with_feature_selection') as run:
    mlflow.sklearn.log_model(
        pipeline, 'rf_model_with_feature_selection', 
        signature=signature, 
        input_example=input_example,
        pip_requirements='../requirements.txt'
    )

    for metric, row in metrics.iterrows():
        mlflow.log_metric(metric, row['Score'])

    mlflow.log_params(params)

    mlflow.log_artifact("../data/selected_features_idx.txt")
    mlflow.log_artifact("../data/selected_features_names.txt")

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 12.92it/s]  
2024/12/08 12:49:48 INFO mlflow.tracking._tracking_service.client: 🏃 View run rf_model_with_feature_selection at: http://localhost:5000/#/experiments/0/runs/fd644eeb307d4bdcae3e4b900e57a6fb.
2024/12/08 12:49:48 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/0.


### hyperparameter tuning

In [90]:
def objective(trial):
    # Гиперпараметры
    n_estimators = trial.suggest_int('n_estimators', 10, 200, step=10)
    max_depth = trial.suggest_int('max_depth', 5, 50, step=5)
    max_features = trial.suggest_uniform('max_features', 0.1, 1.0)
    
    rf = RandomForestClassifier(n_estimators=n_estimators, 
                                   max_depth=max_depth,
                                   max_features=max_features,
                                   random_state=42)
    
    pipeline = Pipeline([
        ('transformer', transformer),
        ('feature_selection', feature_selector),
        ('model', rf)
    ])
  
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    score = f1_score(y_test, y_pred, average='macro')
    
    return score 

# Оптимизация гиперпараметров
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

[I 2024-12-08 12:54:46,826] A new study created in memory with name: no-name-f540bcd7-ac38-4549-a398-1416a939ad39


[I 2024-12-08 12:54:49,812] Trial 0 finished with value: 0.8119301012100103 and parameters: {'n_estimators': 70, 'max_depth': 5, 'max_features': 0.7436917389566954}. Best is trial 0 with value: 0.8119301012100103.
[I 2024-12-08 12:55:02,410] Trial 1 finished with value: 0.8902304095325524 and parameters: {'n_estimators': 200, 'max_depth': 35, 'max_features': 0.4281529628214026}. Best is trial 1 with value: 0.8902304095325524.
[I 2024-12-08 12:55:05,237] Trial 2 finished with value: 0.7215664710297542 and parameters: {'n_estimators': 170, 'max_depth': 5, 'max_features': 0.20513142621457464}. Best is trial 1 with value: 0.8902304095325524.
[I 2024-12-08 12:55:16,203] Trial 3 finished with value: 0.8853776414199434 and parameters: {'n_estimators': 100, 'max_depth': 40, 'max_features': 0.8852782875426464}. Best is trial 1 with value: 0.8902304095325524.
[I 2024-12-08 12:55:26,489] Trial 4 finished with value: 0.8913025806661392 and parameters: {'n_estimators': 190, 'max_depth': 30, 'max_fe

In [91]:
# Определение модели с подобранными гиперпараметрами
rf_optuna = RandomForestClassifier(
    n_estimators=study.best_params['n_estimators'],
    max_depth=study.best_params['max_depth'],
    max_features=study.best_params['max_features'],
    random_state=42
)

# Создание пайплайна
pipeline = Pipeline([
    ('transformer', transformer),
    ('feature_selection', feature_selector),
    ('model', rf_optuna)
])

params = pipeline.get_params()

In [92]:
pipeline.fit(X_train, y_train)

In [93]:
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1] # вероятности для roc_auc

metrics = calc_metrics(y_pred, y_proba, y_test)
metrics

Unnamed: 0,Score
Precision,0.881441
Recall,0.823846
F1-Score,0.851671
ROC AUC,0.958859


Пока лучшее качество, которое удалось получить

In [94]:
with mlflow.start_run(run_name='rf_model_optuna') as run:
    mlflow.sklearn.log_model(
        pipeline, 'rf_model_optuna', 
        signature=signature, 
        input_example=input_example,
        pip_requirements='../requirements.txt',
        registered_model_name='cancellation_prediction_models'
    )

    for metric, row in metrics.iterrows():
        mlflow.log_metric(metric, row['Score'])

    mlflow.log_params(params)

    mlflow.log_artifact("../data/selected_features_idx.txt")
    mlflow.log_artifact("../data/selected_features_names.txt")

Registered model 'cancellation_prediction_models' already exists. Creating a new version of this model...
2024/12/08 12:56:38 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: cancellation_prediction_models, version 2
Created version '2' of model 'cancellation_prediction_models'.
Downloading artifacts: 100%|██████████| 7/7 [00:01<00:00,  6.86it/s]   
2024/12/08 12:56:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run rf_model_optuna at: http://localhost:5000/#/experiments/0/runs/68ab8bfd8ff24621948ae7c4c61b2098.
2024/12/08 12:56:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/0.


### CatBoost

In [95]:
cb = CatBoostClassifier(iterations=100)
cb.fit(X_train_fe_sklearn, y_train)
params = cb.get_all_params()

Learning rate set to 0.348296
0:	learn: 0.5162026	total: 85.3ms	remaining: 8.44s
1:	learn: 0.4341284	total: 241ms	remaining: 11.8s
2:	learn: 0.4034249	total: 268ms	remaining: 8.68s
3:	learn: 0.3784111	total: 369ms	remaining: 8.86s
4:	learn: 0.3674338	total: 383ms	remaining: 7.28s
5:	learn: 0.3599636	total: 424ms	remaining: 6.64s
6:	learn: 0.3523871	total: 465ms	remaining: 6.18s
7:	learn: 0.3450473	total: 474ms	remaining: 5.45s
8:	learn: 0.3404626	total: 484ms	remaining: 4.9s
9:	learn: 0.3351794	total: 529ms	remaining: 4.76s
10:	learn: 0.3317940	total: 543ms	remaining: 4.39s
11:	learn: 0.3288649	total: 549ms	remaining: 4.02s
12:	learn: 0.3262122	total: 558ms	remaining: 3.73s
13:	learn: 0.3218167	total: 570ms	remaining: 3.5s
14:	learn: 0.3182958	total: 598ms	remaining: 3.39s
15:	learn: 0.3157116	total: 640ms	remaining: 3.36s
16:	learn: 0.3134567	total: 658ms	remaining: 3.21s
17:	learn: 0.3110855	total: 700ms	remaining: 3.19s
18:	learn: 0.3093553	total: 739ms	remaining: 3.15s
19:	learn: 0

In [96]:
y_pred = cb.predict(transformer.transform(X_test))
y_proba = cb.predict_proba(transformer.transform(X_test))[:, 1]

metrics = calc_metrics(y_pred, y_proba, y_test)
metrics

Unnamed: 0,Score
Precision,0.85406
Recall,0.78646
F1-Score,0.818867
ROC AUC,0.948361


Качество хуже

In [97]:
with mlflow.start_run(run_name='cb_model') as run:
    mlflow.sklearn.log_model(
        cb, 'cb_model', 
        signature=infer_signature(model_input=X_train_fe_sklearn[:5,:]), 
        input_example=X_train_fe_sklearn[:5,:],
        pip_requirements='../requirements.txt',
        registered_model_name='cancellation_prediction_models'
    )

    for metric, row in metrics.iterrows():
        mlflow.log_metric(metric, row['Score'])

    mlflow.log_params(params)

    mlflow.log_artifact("../data/selected_features_idx.txt")
    mlflow.log_artifact("../data/selected_features_names.txt")

Registered model 'cancellation_prediction_models' already exists. Creating a new version of this model...
2024/12/08 12:57:07 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: cancellation_prediction_models, version 3
Created version '3' of model 'cancellation_prediction_models'.
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1265.80it/s] 
2024/12/08 12:57:08 INFO mlflow.tracking._tracking_service.client: 🏃 View run cb_model at: http://localhost:5000/#/experiments/0/runs/80c1849b0a3b4dba8e8053d8d888d30b.
2024/12/08 12:57:08 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/0.


### Best model

In [98]:
# Создание пайплайна
pipeline = Pipeline([
    ('transformer', transformer),
    ('feature_selection', feature_selector),
    ('model', rf_optuna)
])

params = pipeline.get_params()

pipeline.fit(X, y)

Добавляем тег production

In [99]:
with mlflow.start_run(run_name='best_model') as run:
    mlflow.sklearn.log_model(
        pipeline, 'best_model', 
        signature=signature,
        input_example=input_example,
        pip_requirements='../requirements.txt',
        registered_model_name='cancellation_prediction_models'
    )

    mlflow.log_params(params)

    mlflow.log_artifact("../data/selected_features_idx.txt")
    mlflow.log_artifact("../data/selected_features_names.txt")

Registered model 'cancellation_prediction_models' already exists. Creating a new version of this model...
2024/12/08 12:57:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: cancellation_prediction_models, version 4
Created version '4' of model 'cancellation_prediction_models'.
Downloading artifacts: 100%|██████████| 7/7 [00:01<00:00,  5.48it/s]   
2024/12/08 12:57:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run best_model at: http://localhost:5000/#/experiments/0/runs/f1e402c95cd04b429cc8ec80e1311352.
2024/12/08 12:57:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/0.


Устанавливаем тэг

In [104]:
client = MlflowClient()

client.set_model_version_tag(
    name='cancellation_prediction_models', 
    version='4',
    key='production',
    value='true'
)