# In [12]:  (Jupyter notebook cell marker)
import logging
import os
import pickle
import sys
import optuna
import numpy as np
import pandas as pd
from enum import Enum
from catboost import CatBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder

# Run mode passed to CatBoostModel.preprocess: in TRAIN mode the target
# column is returned together with the features, in PREDICT mode only the
# features are returned.
Mode = Enum('Mode', {'TRAIN': 0, 'PREDICT': 1})

# Location of the pickled, fitted CatBoost model.
MODEL_PATH = './model/catboost_model_notebook.pkl'
# Location of the log file written by the root logger configured below.
LOG_PATH = './data/catboost_log_file.log'

# logging.basicConfig opens the log file immediately, so the directory must
# exist first; otherwise importing this module fails with FileNotFoundError.
os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)

logging.basicConfig(
    filename=LOG_PATH,
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s'
)

class CatBoostModel:
    """Train a CatBoost classifier with Optuna-tuned parameters and apply it.

    ``train`` fits the imputers, the one-hot encoder and the classifier and
    writes them to disk; ``predict`` loads them back and writes the predicted
    ``Transported`` values to ``./data/result.csv``.

    Attributes:
        study: The Optuna study created by ``train`` (``None`` before that).
        model: The fitted or loaded ``CatBoostClassifier``.
        x_train: Preprocessed training features used by ``objective``.
        y_train: Training target used by ``objective``.
        preprocessors: Dict holding the imputers and encoder fitted on the
            training data, or ``None`` when nothing has been fitted or loaded.
    """

    # Input columns, grouped by how they are imputed and encoded.
    CATEGORICAL_COLS = ['HomePlanet', 'Destination', 'CryoSleep', 'VIP']
    NUMERICAL_COLS = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    def __init__(self):
        self.study = None
        self.model = None
        self.x_train = None
        self.y_train = None
        self.preprocessors = None

    @staticmethod
    def _preprocessors_path():
        """Return the path of the pickled preprocessors, next to MODEL_PATH."""
        root, _ = os.path.splitext(MODEL_PATH)
        return f'{root}_preprocessors.pkl'

    @staticmethod
    def _to_bool(label):
        """Convert one predicted class label to a Python ``bool``.

        String labels are compared textually because ``bool('False')`` is
        ``True`` in Python. NOTE(review): CatBoost appears to return string
        labels for a boolean target in some versions - confirm against the
        installed version.
        """
        if isinstance(label, bytes):
            label = label.decode()
        if isinstance(label, str):
            return label.strip().lower() in ('true', '1')
        return bool(label)

    def load_data(self, dataset_path):
        """Load a dataset from a CSV file.

        Args:
            dataset_path: Path of the CSV file to read.

        Returns:
            The file contents as a ``pandas.DataFrame``.

        Raises:
            SystemExit: With exit code 1 when the file does not exist (the
                error is logged first).
        """
        try:
            data = pd.read_csv(dataset_path)
        except FileNotFoundError:
            logging.error('Dataset %s not found.', dataset_path)
            sys.exit(1)
        logging.info('Loaded %s', dataset_path)
        return data

    def preprocess(self, data, mode):
        """Impute missing values and one-hot encode the categorical columns.

        In ``Mode.TRAIN`` the imputers and the encoder are fitted on ``data``
        and kept in ``self.preprocessors``. In any other mode the previously
        fitted ones are reused, so the features have the same columns and fill
        values the model was trained with. The input frame is not modified.

        Args:
            data: DataFrame containing the categorical and numerical feature
                columns (and ``Transported`` in train mode).
            mode: ``Mode.TRAIN`` or ``Mode.PREDICT``.

        Returns:
            A ``(features, target)`` tuple; ``target`` is ``None`` unless
            ``mode`` is ``Mode.TRAIN``.
        """
        categorical_cols = self.CATEGORICAL_COLS
        numerical_cols = self.NUMERICAL_COLS

        is_train = mode == Mode.TRAIN
        fit = is_train or self.preprocessors is None
        if fit:
            if not is_train:
                # Legacy fallback: no fitted preprocessors are available
                # (e.g. a model saved before they were persisted).
                logging.warning(
                    'No fitted preprocessors available; fitting them on the prediction data.'
                )
            preprocessors = {
                'imputer_categorical': SimpleImputer(strategy="most_frequent"),
                'imputer_numerical': SimpleImputer(strategy="median"),
                'encoder': OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            }
        else:
            preprocessors = self.preprocessors

        imputer_categorical = preprocessors['imputer_categorical']
        imputer_numerical = preprocessors['imputer_numerical']
        encoder = preprocessors['encoder']

        if fit:
            cat_values = imputer_categorical.fit_transform(data[categorical_cols])
            num_values = imputer_numerical.fit_transform(data[numerical_cols])
        else:
            cat_values = imputer_categorical.transform(data[categorical_cols])
            num_values = imputer_numerical.transform(data[numerical_cols])

        # Build new frames on the caller's index instead of writing into
        # `data`, so the input is left untouched and concat stays row-aligned.
        cat_df = pd.DataFrame(cat_values, columns=categorical_cols, index=data.index)
        num_df = pd.DataFrame(num_values, columns=numerical_cols, index=data.index)

        if fit:
            encoded_cats = encoder.fit_transform(cat_df)
        else:
            encoded_cats = encoder.transform(cat_df)
        encoded_cats_df = pd.DataFrame(
            encoded_cats,
            columns=encoder.get_feature_names_out(categorical_cols),
            index=data.index,
        )

        data_final = pd.concat([encoded_cats_df, num_df], axis=1)

        if is_train:
            # Only preprocessors fitted on training data are kept for reuse.
            self.preprocessors = preprocessors
            y = data['Transported']
        else:
            y = None

        return data_final, y

    def objective(self, trial):
        """Optuna objective: mean 5-fold CV accuracy for sampled parameters.

        Args:
            trial: The Optuna trial used to sample hyperparameters.

        Returns:
            Mean cross-validated accuracy on ``self.x_train``/``self.y_train``.

        Raises:
            ValueError: If the training data has not been set yet.
        """
        if self.x_train is None or self.y_train is None:
            raise ValueError(
                'Training data is not set; assign x_train/y_train (e.g. via train()) first.'
            )

        param = {
            "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
            "depth": trial.suggest_int("depth", 1, 12),
            "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
            "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        }

        # These parameters are only sampled for the bootstrap type they belong to.
        if param["bootstrap_type"] == "Bayesian":
            param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
        elif param["bootstrap_type"] == "Bernoulli":
            param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

        model = CatBoostClassifier(**param, verbose=0)
        scores = cross_val_score(model, self.x_train, self.y_train, cv=5, scoring='accuracy')
        return np.mean(scores)

    def train(self, dataset_path):
        """Tune, fit and persist the model.

        Loads and preprocesses the dataset, runs 20 Optuna trials, fits a
        classifier with the best parameters and pickles it to ``MODEL_PATH``.
        The fitted preprocessors are pickled to a sibling file so ``predict``
        can reuse them.

        Args:
            dataset_path: Path of the training CSV file.
        """
        data = self.load_data(dataset_path)
        self.x_train, self.y_train = self.preprocess(data, Mode.TRAIN)
        logging.info('Data preprocessed')

        self.study = optuna.create_study(direction="maximize")
        self.study.optimize(self.objective, n_trials=20)
        best_params = self.study.best_params
        logging.info('Optuna work completed.')

        self.model = CatBoostClassifier(**best_params)
        self.model.fit(self.x_train, self.y_train)
        logging.info('Training complete.')

        os.makedirs(os.path.dirname(MODEL_PATH) or '.', exist_ok=True)
        with open(MODEL_PATH, 'wb') as f:
            pickle.dump(self.model, f)
        # Stored separately so MODEL_PATH still contains just the classifier.
        with open(self._preprocessors_path(), 'wb') as f:
            pickle.dump(self.preprocessors, f)
        logging.info('Model saved.')

    def predict(self, dataset_path):
        """Predict ``Transported`` for a dataset and save the result as CSV.

        Args:
            dataset_path: Path of the CSV file to predict on; it must contain
                a ``PassengerId`` column in addition to the feature columns.

        Returns:
            The predictions as a list of ``bool``.
        """
        data = self.load_data(dataset_path)

        # NOTE: pickle.load executes arbitrary code from the file; these files
        # must only ever be artifacts produced by train() on a trusted machine.
        with open(MODEL_PATH, 'rb') as f:
            self.model = pickle.load(f)

        # The model always comes from disk, so prefer the preprocessors that
        # were saved with it over whatever is held in memory.
        preprocessors_path = self._preprocessors_path()
        if os.path.exists(preprocessors_path):
            with open(preprocessors_path, 'rb') as f:
                self.preprocessors = pickle.load(f)

        features, _ = self.preprocess(data, Mode.PREDICT)

        raw_predictions = self.model.predict(features)
        predictions = [self._to_bool(label) for label in raw_predictions]

        result_data = pd.DataFrame({
            'PassengerId': data['PassengerId'],
            'Transported': predictions
        })

        result_path = './data/result.csv'
        os.makedirs(os.path.dirname(result_path), exist_ok=True)
        result_data.to_csv(result_path, index=False)
        logging.info('Predictions saved to %s.', result_path)
        return predictions

if __name__ == "__main__":
    # Standalone hyperparameter search: runs 10 Optuna trials and prints the
    # best one. The objective reads model.x_train / model.y_train, so the
    # training data has to be loaded and preprocessed before optimizing.
    # NOTE(review): the training CSV location is not defined anywhere in this
    # file; './data/train.csv' is an assumption - confirm or override it via
    # the TRAIN_DATASET_PATH environment variable.
    train_path = os.environ.get("TRAIN_DATASET_PATH", "./data/train.csv")

    model = CatBoostModel()
    train_data = model.load_data(train_path)
    model.x_train, model.y_train = model.preprocess(train_data, Mode.TRAIN)

    study = optuna.create_study(direction="maximize")
    study.optimize(model.objective, n_trials=10)

    print(f"Number of finished trials: {len(study.trials)}")

    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")


# Notebook cell output: NameError: name '__file__' is not defined