In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import LabelEncoder

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

import optuna
import os



In [2]:
PATH = os.path.dirname(os.getcwd())

In [6]:
train = pd.read_csv(os.path.join(PATH, r"data/train.csv"))
test = pd.read_csv(os.path.join(PATH, r"data/test.csv"))

In [14]:
class GeneralPipeline:
    def __init__(self, catboost_params = {}, lgbm_params = {}, xgb_params = {}):
        self.catboost = CatBoostClassifier(**catboost_params, silent=True)
        self.xgboost = XGBClassifier(**xgb_params)
        self.lgbm = LGBMClassifier(**lgbm_params)
        self.metamodel = CatBoostClassifier(silent = True)

    @staticmethod
    def process_set(self, dataset, is_train): # сделать обработку датасетов сразу вместе
        '''
        Do label encoding for particular category features
        '''
        df = dataset.copy()
        df.drop(["Дата бронирования", "Заезд", "Выезд"], axis = 1)
        
        columns_to_encode = df.select_dtypes("object").columns
        if is_train:
            self.encoders = [] # если в тренировочной выборке не встречаются некоторые категории
            for col in columns_to_encode:
                encoder = LabelEncoder()
                df[col] = encoder.fit_transform(df[col])
                self.encoders.append(encoder)
        
            return df
        else:
            for i, col in enumerate(columns_to_encode):
                encoder = self.encoders[i]
                df[col] = encoder.fit_transform(df[col])
        
            return df

    def fit(self, train):
        x_train, y_train = train.drop("target", axis = 1), train["target"]
        x_train = self.process_set(self, x_train, is_train = True)
        
        catboost_oof = cross_val_predict(self.catboost, x_train, y_train, method = "predict_proba")[:, 1]
        xgboost_oof = cross_val_predict(self.xgboost, x_train, y_train, method = "predict_proba")[:, 1]
        lgbm_oof = cross_val_predict(self.lgbm, x_train, y_train, method = "predict_proba")[:, 1]

        print("OOF Catboost ROC-AUC: {}".format(round(roc_auc_score(y_train, catboost_oof),4)))
        print("OOF XGBoost ROC-AUC: {}".format(round(roc_auc_score(y_train, xgboost_oof), 4)))
        print("OOF LGBM ROC-AUC: {}".format(round(roc_auc_score(y_train, lgbm_oof),4)))

        X_oof = np.concatenate([catboost_oof[:, None], xgboost_oof[:, None], lgbm_oof[:, None]], axis = 1)
        self.metamodel.fit(X_oof, y_train)

        self.catboost.fit(x_train, y_train)
        self.xgboost.fit(x_train, y_train)
        self.lgbm.fit(x_train, y_train)

        return self

    def predict(self, test):
        x_test = self.process_set(self, test, is_train = False)
        #print(x_test)
        predictions_catboost = self.catboost.predict_proba(x_test)[:, 1]
        predictions_xgboost = self.xgboost.predict_proba(x_test)[:,1]
        predictions_lgbm = self.lgbm.predict_proba(x_test)[:, 1]

        predictions_for_meta = np.concatenate([predictions_catboost[:, None], predictions_xgboost[:, None], 
                                               predictions_lgbm[:, None]], axis = 1)
        
        final_predictions = self.metamodel.predict(predictions_for_meta)
        return final_predictions

In [16]:
a = GeneralPipeline()
a.fit(train)
predictions = a.predict(test)

OOF Catboost ROC-AUC: 0.8411
OOF XGBoost ROC-AUC: 0.8239
OOF LGBM ROC-AUC: 0.8367
