# Спортивный анализ данных. Платформа Kaggle. Курсовой проект

## Загрузка данных и подключение библиотек

In [None]:
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

kaggle.json


In [None]:
# Download the data archives of the competition with the Kaggle CLI.
!kaggle competitions download -c geekbrains-competitive-data-analysis

Downloading train.csv.zip to /content
  0% 0.00/434k [00:00<?, ?B/s]
100% 434k/434k [00:00<00:00, 66.1MB/s]
Downloading client_profile.csv.zip to /content
 43% 5.00M/11.7M [00:00<00:00, 8.31MB/s]
100% 11.7M/11.7M [00:00<00:00, 18.4MB/s]
Downloading applications_history.csv.zip to /content
 68% 41.0M/59.9M [00:01<00:01, 11.5MB/s]
100% 59.9M/59.9M [00:02<00:00, 31.1MB/s]
Downloading test.csv.zip to /content
  0% 0.00/632k [00:00<?, ?B/s]
100% 632k/632k [00:00<00:00, 144MB/s]
Downloading payments.csv.zip to /content
 43% 9.00M/21.0M [00:00<00:01, 11.2MB/s]
100% 21.0M/21.0M [00:01<00:00, 21.6MB/s]
Downloading sample_submit.csv.zip to /content
  0% 0.00/586k [00:00<?, ?B/s]
100% 586k/586k [00:00<00:00, 185MB/s]
Downloading bki.csv.zip to /content
 20% 5.00M/25.2M [00:00<00:02, 8.80MB/s]
100% 25.2M/25.2M [00:00<00:00, 39.6MB/s]


In [None]:
!pip install catboost
!pip install eli5
!pip install shap
!unzip /content/applications_history.csv.zip
!unzip /content/bki.csv.zip
!unzip /content/client_profile.csv.zip
!unzip /content/payments.csv.zip
!unzip /content/sample_submit.csv.zipa
!unzip /content/test.csv.zip
!unzip /content/train.csv.zip

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/47/80/8e9c57ec32dfed6ba2922bc5c96462cbf8596ce1a6f5de532ad1e43e53fe/catboost-0.25.1-cp37-none-manylinux1_x86_64.whl (67.3MB)
[K     |████████████████████████████████| 67.3MB 61kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.25.1
Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/d1/54/04cab6e1c0ae535bec93f795d8403fdf6caf66fa5a6512263202dbb14ea6/eli5-0.11.0-py2.py3-none-any.whl (106kB)
[K     |████████████████████████████████| 112kB 13.4MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.11.0
Collecting shap
[?25l  Downloading https://files.pythonhosted.org/packages/b9/f4/c5b95cddae15be80f8e58b25edceca105aa83c0b8c86a1edad24a6af80d3/shap-0.39.0.tar.gz (356kB)
[K     |████████████████████████████████| 358kB 13.6MB/s 
Collecting slicer==0.0.7
  Downloading https://files.pythonhosted.org/packages/78/c2/b3f55dfdb8af9812fdb9baf70cacf3b

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn import metrics
from typing import List, Optional
from typing import List, Tuple
import matplotlib.pyplot as plt
import seaborn as sns
import eli5
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import time
import lightgbm as lgb
import shap
from sklearn.inspection import permutation_importance
from eli5.sklearn import PermutationImportance
from sklearn.model_selection import train_test_split, KFold



## Описание используемых функций

In [None]:
def plot_feature_importance(importance, names, model_type):
    """
    Draw a bar chart of feature importances, most important feature first.

    Parameters
    ----------
    importance: array-like
        Importance value of every feature.

    names: array-like
        Feature names, in the same order as `importance`.

    model_type: str
        Prefix placed in front of ' FEATURE IMPORTANCE' in the chart title.

    Returns
    -------
    list
        Feature names sorted by decreasing importance.

    """
    # Pair names with importances and rank them in descending order.
    ranking = pd.DataFrame(
        {
            'feature_names': np.array(names),
            'feature_importance': np.array(importance),
        }
    )
    ranking = ranking.sort_values(by=['feature_importance'], ascending=False)

    # One bar per feature on a fixed-size figure.
    plt.figure(figsize=(10, 8))
    sns.barplot(x=ranking['feature_importance'], y=ranking['feature_names'])

    # Title and axis captions.
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

    ordered_names = ranking['feature_names'].values
    return list(ordered_names)

In [None]:
def frequency_encoder(data: pd.DataFrame,
                      features: List[str],
                      delete_old = False,
                      ) -> pd.DataFrame:
    """
    Add a relative-frequency column for every feature in `features`.

    For each feature a new column `<feature>_freq` is appended; it holds the
    share of rows of `data` in which the row's value of that feature occurs.
    Missing values are not counted, so rows with a missing value get NaN.

    Parameters
    ----------
    data: pandas.core.frame.DataFrame
        Dataframe whose features are encoded. It is not modified in place.

    features: List[str]
        Names of the features to frequency-encode.

    delete_old: bool, optional, default = False
        Whether to drop the original feature after encoding it.

    Returns
    -------
    data: pandas.core.frame.DataFrame
        Dataframe with the added (and, optionally, removed) columns.
        The result of the merge carries a fresh 0..n-1 index.

    """
    for feature in features:
        # `rename_axis` + `reset_index(name=...)` pin both column names
        # explicitly; relying on the default names of
        # `value_counts().reset_index()` breaks across pandas versions
        # (they changed in pandas 2.0).
        stat = (
            data[feature]
            .value_counts(normalize=True)
            .rename_axis(feature)
            .reset_index(name=feature + '_freq')
        )
        data = pd.merge(data, stat, how='left', on=feature)
        if delete_old:
            data = data.drop(columns=[feature])
    return data

In [None]:
def target_encoder(data: pd.DataFrame,
                   features: List[str],
                   target: str,
                   delete_old = False,
                   ) -> pd.DataFrame:
    """
    Add a mean-target column for every feature in `features`.

    For each feature a new column is appended that holds, for every row, the
    mean of `target` over all rows of `data` sharing that row's feature value.

    Column naming: when a single feature is encoded the column is called
    `educ_mean_<target>` (the historical name, kept so existing callers keep
    working). When several features are encoded each one gets its own
    `<feature>_mean_<target>` column; a single shared name would collide on
    the second merge and produce `_x` / `_y` suffixed columns.

    Parameters
    ----------
    data: pandas.core.frame.DataFrame
        Dataframe whose features are encoded. It is not modified in place.

    features: List[str]
        Names of the features to target-encode.

    target: str
        Name of the target column in `data`.

    delete_old: bool, optional, default = False
        Whether to drop the original feature after encoding it.

    Returns
    -------
    data: pandas.core.frame.DataFrame
        Dataframe with the added (and, optionally, removed) columns.

    """
    several_features = len(features) > 1
    for feature in features:
        if several_features:
            encoded_name = f"{feature}_mean_{target}"
        else:
            encoded_name = 'educ_mean_' + target
        stat = data.groupby(feature)[target].mean().reset_index(name=encoded_name)
        data = pd.merge(data, stat, how='left', on=feature)
        if delete_old:
            data = data.drop(columns=[feature])
    return data

In [None]:
def create_bootstrap_samples(data: np.array, n_samples: int = 1000) -> np.array:
    """
    Create index sets for bootstrap samples.

    Parameters
    ----------
    data: np.array
        Original sample; only its length is used.

    n_samples: int, optional, default = 1000
        Number of bootstrap samples to create.

    Returns
    -------
    bootstrap_idx: np.array
        Integer matrix of shape (n_samples, len(data)); every row holds the
        positions (drawn with replacement) that form one bootstrap sample.

    """
    bootstrap_idx = np.random.randint(
        low=0, high=len(data), size=(n_samples, len(data))
    )
    return bootstrap_idx


def create_bootstrap_metrics(y_true: np.array,
                             y_pred: np.array,
                             metric: callable,
                             n_samlpes: int = 1000,
                             n_samples: Optional[int] = None) -> List[float]:
    """
    Compute a metric on bootstrap resamples of (y_true, y_pred).

    Parameters
    ----------
    y_true: np.array
        Target vector.

    y_pred: np.array
        Prediction vector, aligned with `y_true` by position.

    metric: callable
        Metric function taking two arguments: y_true, y_pred.

    n_samlpes: int, optional, default = 1000
        Number of bootstrap samples. The misspelled name is kept for
        backward compatibility with existing callers.

    n_samples: int, optional, default = None
        Correctly spelled alias of `n_samlpes`; takes precedence when given.

    Returns
    -------
    bootstrap_metrics: List[float]
        Metric value on every bootstrap sample.

    """
    # The requested count used to be ignored, so 1000 samples were always drawn.
    sample_count = n_samlpes if n_samples is None else n_samples

    # Work on plain arrays: bootstrap indices are positions, and indexing a
    # pandas Series with them would look rows up by label instead.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    bootstrap_idx = create_bootstrap_samples(y_true, sample_count)
    return [metric(y_true[idx], y_pred[idx]) for idx in bootstrap_idx]


def calculate_confidence_interval(scores: list, conf_interval: float = 0.95) -> Tuple[float]:
    """
    Build a percentile-based confidence interval.

    Parameters
    ----------
    scores: List[float / int]
        Estimates of the quantity of interest.

    conf_interval: float, optional, default = 0.95
        Confidence level of the interval.

    Returns
    -------
    conf_interval: Tuple[float]
        Lower and upper bound of the confidence interval.

    """
    # Probability mass left outside the interval on each side.
    tail = (1 - conf_interval) / 2

    lower_percent = tail * 100
    upper_percent = (conf_interval + tail) * 100

    bounds = (
        np.percentile(scores, lower_percent),
        np.percentile(scores, upper_percent),
    )
    return bounds

In [None]:
def get_input(data_path: str, base_path: str = "/content") -> pd.DataFrame:
    """
    Read a CSV file, lower-case its column names and print its shape.

    Parameters
    ----------
    data_path: str
        File name, relative to `base_path`.

    base_path: str, optional, default = "/content"
        Directory that contains the file. The default keeps the previously
        hard-coded location.

    Returns
    -------
    data: pandas.core.frame.DataFrame
        The loaded dataset with lower-cased column names.

    """
    data = pd.read_csv(f"{base_path}/{data_path}")
    data.columns = [col.lower() for col in data.columns]
    print(f"{data_path}: shape = {data.shape[0]} rows, {data.shape[1]} cols")
    return data

In [None]:
def catboost_cross_validation(params, X, y, cv, categorical = None):
    """
    Cross-validate a CatBoost classifier, validating on the hold-out fold.

    Parameters
    ----------
    params: dict
        Model hyperparameters, forwarded to `cb.CatBoostClassifier`.

    X: pandas.core.frame.DataFrame
        Feature matrix. NOTE: when `categorical` is given, those columns of
        `X` are converted to `str` in place (the caller's frame is modified).

    y: pandas.core.frame.Series
        Target vector, aligned with `X` by position.

    cv: KFold or StratifiedKFold generator.
        Splitter object defining the cross-validation strategy.

    categorical: list, optional, default = None
        Names of the categorical features. Not used by default.

    Returns
    -------
    estimators: list
        Fitted model of every fold.

    oof_preds: np.array
        Vector of out-of-fold predictions.

    num_trees: list
        Number of trees of every estimator.

    train_auc: float
        ROC AUC on the training part, averaged over the folds.

    """

    estimators, folds_scores, num_trees = [], [], []
    oof_preds = np.zeros(X.shape[0])
    train_auc = []

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")
    # Guarded: with the default `categorical=None` the conversion used to
    # fail on `X[None]`.
    if categorical:
        X[categorical] = X[categorical].astype(str)

    # `cv.split` yields positions, so rows are selected positionally
    # (`.iloc` / plain array) rather than by index label.
    y_values = np.asarray(y)

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y_values[train_idx], y_values[valid_idx]

        model = cb.CatBoostClassifier(**params)
        model.fit(
            x_train, y_train, cat_features=categorical,
            eval_set=[(x_valid, y_valid)]
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        train_preds = model.predict_proba(x_train)[:, 1]
        train_auc.append(roc_auc_score(y_train, train_preds))
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)
        num_trees.append(model.tree_count_)

    print(f"Score by each fold: {folds_scores}")
    print(f"Num trees by each model: {num_trees}")
    print("="*65)
    return estimators, oof_preds, num_trees, np.mean(np.array(train_auc))

In [None]:
def lightgbm_cv_fit(params, X, y, cv, categorical = None):
    """
    Cross-validate a LightGBM classifier.

    Parameters
    ----------
    params: dict
        Model hyperparameters, forwarded to `lgb.LGBMClassifier`.

    X: pandas.core.frame.DataFrame
        Feature matrix.

    y: pandas.core.frame.Series
        Target vector, aligned with `X` by position.

    cv: KFold or StratifiedKFold generator.
        Splitter object defining the cross-validation strategy.

    categorical: list, optional, default = None
        Names of the categorical features. When empty or None, "auto" is
        passed to LightGBM instead.

    Returns
    -------
    estimators: list
        Fitted model of every fold.

    oof_preds: np.array
        Vector of out-of-fold predictions.

    """
    if not categorical:
        categorical = "auto"

    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])
    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    # `cv.split` yields positions, so rows are selected positionally
    # (`.iloc` / plain array) rather than by index label.
    y_values = np.asarray(y)

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y_values[train_idx], y_values[valid_idx]

        model = lgb.LGBMClassifier(**params)
        model.fit(
            x_train, y_train,
            categorical_feature=categorical
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds

In [None]:
def xgboost_cv_fit(params, X, y, cv, categorical = None):
    """
    Cross-validate an XGBoost model.

    Parameters
    ----------
    params: dict
        Model hyperparameters, forwarded to `xgb.train`. The number of
        boosting rounds is read from `params['num_boost_round']`; when the
        key is absent, 10 rounds are used.

    X: pandas.core.frame.DataFrame
        Feature matrix. NOTE: categorical columns are label-encoded in place
        (the caller's frame is modified).

    y: pandas.core.frame.Series
        Target vector, aligned with `X` by position.

    cv: KFold or StratifiedKFold generator.
        Splitter object defining the cross-validation strategy.

    categorical: list, optional, default = None
        Names of the categorical features. Not used by default.

    Returns
    -------
    estimators: list
        Fitted booster of every fold.

    encoders: dict
        Fitted LabelEncoder of every categorical feature, keyed by name.

    oof_preds: np.array
        Vector of out-of-fold predictions.

    """
    estimators, encoders = [], {}
    oof_preds = np.zeros(X.shape[0])

    if categorical:
        for feature in categorical:
            encoder = LabelEncoder()
            # NOTE(review): `astype("str")` runs before `fillna`, so on pandas
            # versions that stringify missing values to "nan" the fill has
            # nothing left to replace. Left unchanged so the labels seen by
            # the encoders stay the same - confirm the intended order.
            X[feature] = encoder.fit_transform(X[feature].astype("str").fillna("NA"))
            encoders[feature] = encoder

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    # Avoid a KeyError when the caller did not put the round count into params.
    num_boost_round = params.get('num_boost_round', 10)

    # `cv.split` yields positions, so rows are selected positionally
    # (`.iloc` / plain array) rather than by index label.
    y_values = np.asarray(y)

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y_values[train_idx], y_values[valid_idx]
        dtrain = xgb.DMatrix(x_train, y_train)
        dvalid = xgb.DMatrix(x_valid, y_valid)

        model = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=num_boost_round,
        )
        oof_preds[valid_idx] = model.predict(dvalid)
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        estimators.append(model)

    return estimators, encoders, oof_preds

In [None]:
def catboost_cv_fit(params, X, y, cv, categorical = None):
    """
    Cross-validate a CatBoost classifier without an evaluation set.

    Parameters
    ----------
    params: dict
        Model hyperparameters, forwarded to `cb.CatBoostClassifier`.

    X: pandas.core.frame.DataFrame
        Feature matrix. NOTE: when `categorical` is given, those columns of
        `X` are converted to `str` in place (the caller's frame is modified).

    y: pandas.core.frame.Series
        Target vector, aligned with `X` by position.

    cv: KFold or StratifiedKFold generator.
        Splitter object defining the cross-validation strategy.

    categorical: list, optional, default = None
        Names of the categorical features. Not used by default.

    Returns
    -------
    estimators: list
        Fitted model of every fold.

    oof_preds: np.array
        Vector of out-of-fold predictions.

    num_trees: list
        Number of trees of every estimator.

    train_auc: float
        ROC AUC on the training part, averaged over the folds.

    valid_auc: float
        ROC AUC on the validation part, averaged over the folds.

    """

    estimators, folds_scores, num_trees = [], [], []
    oof_preds = np.zeros(X.shape[0])
    train_auc = []
    valid_auc = []

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")
    # Guarded: with the default `categorical=None` the conversion used to
    # fail on `X[None]`.
    if categorical:
        X[categorical] = X[categorical].astype(str)

    # `cv.split` yields positions, so rows are selected positionally
    # (`.iloc` / plain array) rather than by index label.
    y_values = np.asarray(y)

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y_values[train_idx], y_values[valid_idx]

        model = cb.CatBoostClassifier(**params)
        model.fit(x_train, y_train, cat_features=categorical)

        # The validation predictions serve both as out-of-fold predictions
        # and for the fold score, so they are computed only once.
        valid_preds = model.predict_proba(x_valid)[:, 1]
        oof_preds[valid_idx] = valid_preds
        train_preds = model.predict_proba(x_train)[:, 1]

        train_auc.append(roc_auc_score(y_train, train_preds))
        score = roc_auc_score(y_valid, valid_preds)
        valid_auc.append(score)
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)
        num_trees.append(model.tree_count_)

    print(f"Score by each fold: {folds_scores}")
    print(f"Num trees by each model: {num_trees}")
    print("="*65)
    return estimators, oof_preds, num_trees, np.mean(np.array(train_auc)), np.mean(np.array(valid_auc))

In [None]:
def create_client_profile_features(X: pd.DataFrame, copy:bool=True) -> pd.DataFrame:
    """
    Create features derived from the client profile.

    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Feature matrix with the raw client profile. The columns read are
        days_on_last_job, external_scoring_rating_1..3, age, childrens,
        amount_credit, amount_annuity, total_salary, region_population
        and own_car_age.

    copy: bool, optional, default = True
        Whether to work on a copy of `X` instead of modifying it in place.

    Returns
    -------
    X_transformed: pandas.core.frame.DataFrame
        Client profile extended with the derived features.

    """
    if copy:
        X = X.copy()

    scoring_columns = [
        "external_scoring_rating_1",
        "external_scoring_rating_2",
        "external_scoring_rating_3",
    ]

    # The value 365243 is treated as "missing" in days_on_last_job.
    X["days_on_last_job"] = X["days_on_last_job"].replace(365243, np.nan)

    X["external_scoring_prod"] = X["external_scoring_rating_1"]*X["external_scoring_rating_2"]* X["external_scoring_rating_3"]
    X["external_scoring_weight"] = X["external_scoring_rating_1"]*0.5+X["external_scoring_rating_2"]*0.5+X["external_scoring_rating_3"]*0.6

    # Named "ratio_*" for historical reasons; these are pairwise differences.
    X["ratio_scoring_1_to_scoring_2"] = X["external_scoring_rating_1"] - X["external_scoring_rating_2"]
    X["ratio_scoring_1_to_scoring_3"] = X["external_scoring_rating_1"] - X["external_scoring_rating_3"]
    X["ratio_scoring_2_to_scoring_3"] = X["external_scoring_rating_2"] - X["external_scoring_rating_3"]

    X["ratio_age_to_childrens"] = X["age"]/(X["childrens"]+1)

    # Row-wise statistics over the three external scores. `getattr` looks up
    # the same numpy functions that used to be resolved through `eval`.
    for function_name in ['min', 'max', 'mean', 'nanmedian', 'var']:
        feature_name = "external_scoring_rating_{}".format(function_name)
        X[feature_name] = getattr(np, function_name)(X[scoring_columns], axis=1)

    X["external_scoring_rating_diap"] = X["external_scoring_rating_max"] - X["external_scoring_rating_min"]

    # Ratios between the main financial indicators.
    X["ratio_credit_to_annuity"] = X['amount_credit']/X['amount_annuity']
    X["ratio_annuity_to_salary"] = X['amount_annuity']/X['total_salary']
    X["ratio_credit_to_salary"] = X['amount_credit']/X['total_salary']

    X["ratio_credit_to_annuity_age"] = X['amount_credit']/X['amount_annuity']/X['age']

    # Financial indicators relative to age and time-based features.
    X["ratio_annuity_to_age"] = X['amount_annuity']/X['age']
    X["ratio_credit_to_age"] = X['amount_credit']/X['age']
    X["ratio_salary_to_age"] = X['total_salary']/X['age']
    X["ratio_salary_to_experience"] = X['total_salary']/X['days_on_last_job']
    # Fixed copy-paste bug: credit / experience used to be written into
    # "ratio_salary_to_experience", overwriting the salary ratio above.
    X["ratio_credit_to_experience"] = X['amount_credit']/X['days_on_last_job']
    X["ratio_annuity_to_experience"] = X['amount_annuity']/X['days_on_last_job']

    # Relations between time-based features.
    X["ratio_age_to_experience"] = X['age'] / X["days_on_last_job"]
    # Named "ratio_*" for historical reasons; this one is a product.
    X["ratio_salary_to_region_population"] = X['total_salary'] * X["region_population"]
    X["ratio_car_to_experience"] = X['own_car_age'] / X["days_on_last_job"]
    X["ratio_car_to_age"] = X["own_car_age"] / X["age"]

    # Credit amounts multiplied by each external score.
    X["expected_total_loss_1"] = X["external_scoring_rating_1"] * X["amount_credit"]
    X["expected_total_loss_2"] = X["external_scoring_rating_2"] * X["amount_credit"]
    X["expected_total_loss_3"] = X["external_scoring_rating_3"] * X["amount_credit"]
    X["expected_monthly_loss_1"] = X["external_scoring_rating_1"] * X["amount_annuity"]
    X["expected_monthly_loss_2"] = X["external_scoring_rating_2"] * X["amount_annuity"]
    X["expected_monthly_loss_3"] = X["external_scoring_rating_3"] * X["amount_annuity"]

    return X

In [None]:
def greedy_lgbm_selector(data: pd.DataFrame,
                         target: str,
                         lgb_params: dict) -> dict:
    """
    Greedy backward feature elimination driven by LightGBM validation AUC.

    The data is split 70/30 into train and validation parts. A baseline model
    is trained on all features; then, in every round, each remaining feature
    is dropped in turn and the model is retrained. The feature whose removal
    gives the highest validation AUC above the current best is dropped for
    good. The search stops when no removal improves the score.

    Parameters
    ----------
    data: pandas.core.frame.DataFrame
        Dataset containing the features and the target column.

    target: str
        Name of the target column in `data`.

    lgb_params: dict
        LightGBM training parameters. The validation AUC is read from
        `best_score['valid_0']['auc']`, so AUC has to be among the metrics
        LightGBM evaluates with these parameters.

    Returns
    -------
    result: dict
        Best validation AUC of the baseline under the key 'initial', plus
        one 'without <feature>' entry for every dropped feature.

    """
    result = {}

    train, valid = train_test_split(
        data, train_size=0.7, shuffle=True, random_state=1,
    )

    y_train = train[target]
    x_train = train.drop(columns=target)

    y_valid = valid[target]
    x_valid = valid.drop(columns=target)

    # `np.object` was removed from NumPy; the builtin `object` selects the
    # same columns.
    categorical_feature_names = x_valid.select_dtypes(include=[object]).columns.to_list()
    for feature in categorical_feature_names:
        x_train[feature] = pd.Series(x_train[feature], dtype="category")
        x_valid[feature] = pd.Series(x_valid[feature], dtype="category")

    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_valid, y_valid)

    # Fixed: the function used to read the global `lgbm_params` instead of
    # its own `lgb_params` argument.
    model_lgbm = lgb.train(lgb_params,
                           lgb_train,
                           valid_sets=lgb_eval,
                           verbose_eval=False,
                           categorical_feature=categorical_feature_names,
                           num_boost_round=10000,
                           early_stopping_rounds=100)

    best_score = model_lgbm.best_score['valid_0']['auc']
    result['initial'] = best_score

    print("="*50)
    print(f"Initial dataset: best score {model_lgbm.best_score['valid_0']['auc']}, best iteration {model_lgbm.best_iteration}")
    print("="*50)

    # At most one feature is dropped per round, so the initial number of
    # features bounds the number of rounds.
    for _ in range(x_train.shape[1]):
        feature_to_delete = None
        for feature in x_train.columns.to_list():
            new_cat_features = [name for name in categorical_feature_names if name != feature]

            lgb_train = lgb.Dataset(x_train.drop(columns=feature), y_train)
            lgb_eval = lgb.Dataset(x_valid.drop(columns=feature), y_valid)

            model_lgbm = lgb.train(lgb_params,
                                   lgb_train,
                                   valid_sets=lgb_eval,
                                   verbose_eval=False,
                                   categorical_feature=new_cat_features,
                                   num_boost_round=10000,
                                   early_stopping_rounds=100)

            candidate_score = model_lgbm.best_score['valid_0']['auc']
            if candidate_score > best_score:
                best_score = candidate_score
                feature_to_delete = feature

            print(f"Dataset without {feature}: best score {candidate_score}, best iteration {model_lgbm.best_iteration}")

        if feature_to_delete is None:
            print("="*50)
            print("No features to delete to improve score")
            break

        result[f'without {feature_to_delete}'] = best_score
        x_train.drop(columns=feature_to_delete, inplace=True)
        x_valid.drop(columns=feature_to_delete, inplace=True)
        # Keep the categorical list in sync with the remaining columns.
        if feature_to_delete in categorical_feature_names:
            categorical_feature_names.remove(feature_to_delete)
        print("="*50)
        print(f"Dropped {feature_to_delete}, best score {best_score}")
        print("="*50)
    return result

## Обучение LGBM

In [None]:
# Columns excluded from training (together with `features_to_delete_greedy`).
# NOTE(review): judging by the name, this list presumably comes from a
# permutation-importance analysis - confirm the origin.
features_to_delete_permut = ['ratio_credit_to_age', 'ratio_annuity_to_age', 'external_scoring_rating_2',
'flag_phone', 'external_scoring_rating_diap','expected_monthly_loss_1','total_salary','ratio_scoring_2_to_scoring_3',
'ratio_annuity_to_salary','expected_total_loss_3','ratio_car_to_experience','application_number','amt_req_credit_bureau_week',
'amt_req_credit_bureau_mon','amt_req_credit_bureau_day','flag_email','childrens','family_size','ratio_scoring_1_to_scoring_3',
'amt_req_credit_bureau_hour','expected_total_loss_2','ratio_credit_to_salary','ratio_salary_to_age','external_scoring_rating_var']

In [None]:
# Further columns excluded from training.
# NOTE(review): judging by the name, this list presumably comes from a run of
# `greedy_lgbm_selector` - confirm the origin.
features_to_delete_greedy = ['age', 'amt_req_credit_bureau_year', 'expected_monthly_loss_2', 'expected_total_loss_1',
'external_scoring_rating_max', 'own_car_age', 'region_population']

In [None]:
# Load the raw tables.
data = get_input("train.csv")
test = get_input("test.csv")
client_profile = get_input("client_profile.csv")
application_history = get_input("applications_history.csv")

# Attach the engineered client-profile features to the training rows.
client_profile = create_client_profile_features(client_profile)
data_ext = pd.merge(data, client_profile, how='left', on='application_number')

# Drop the excluded features. A list comprehension keeps the original column
# order; the previous set arithmetic produced an arbitrary order that could
# differ between sessions.
features_to_delete = set(features_to_delete_permut)
excluded_features = features_to_delete | set(features_to_delete_greedy)
columns_to_learn = [col for col in data_ext.columns.to_list() if col not in excluded_features]

# Explicit copy so that the dtype conversion below writes to an independent
# frame and not to a slice of the merge result.
data_ext = data_ext[columns_to_learn].copy()

if 'application_number' in columns_to_learn:
    columns_to_learn.remove('application_number')

# `np.object` was removed from NumPy; the builtin `object` selects the same
# columns.
categorical_features = data_ext.select_dtypes(include=[object])
categorical_feature_names = categorical_features.columns.to_list()

for feature in categorical_feature_names:
    data_ext[feature] = pd.Series(data_ext[feature], dtype="category")

train_ext, valid_ext = train_test_split(
    data_ext, train_size=0.7, shuffle=True, random_state=1,
)

# Reset to a 0..n-1 index after the shuffle.
train_ext.reset_index(inplace=True, drop=True)
valid_ext.reset_index(inplace=True, drop=True)

y_train = train_ext['target']
x_train = train_ext.drop(columns=['target'])

y_valid = valid_ext['target']
x_valid = valid_ext.drop(columns=['target'])

dtrain = lgb.Dataset(
    data=x_train, label=y_train
)

dvalid = lgb.Dataset(
    data=x_valid, label=y_valid
)

# Fixed: the keys "boosting_type " and "num_boost_round " carried a trailing
# space and therefore did not name the intended parameters.
lgbm_params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "eval_metric": "auc",
    "learning_rate": 0.1,
    "num_boost_round": 1000,
    "reg_lambda": 100,
    "max_depth": 3,
    "n_jobs": -1,
    "seed": 27,
    'device_type': 'cpu'
}

# Stratified 7-fold CV with early stopping on AUC.
cv_result_lgb_all_not_dummies = lgb.cv(
    params=lgbm_params,
    train_set=dtrain,
    num_boost_round=1000,
    categorical_feature=categorical_feature_names,
    early_stopping_rounds=100,
    verbose_eval=10,
    stratified=True,
    seed=42,
    metrics="auc",
    shuffle=True,
    nfold=7
)

train.csv: shape = 110093 rows, 3 cols
test.csv: shape = 165141 rows, 2 cols
client_profile.csv: shape = 250000 rows, 24 cols
applications_history.csv: shape = 1670214 rows, 26 cols


All-NaN slice encountered
categorical_feature in Dataset is overridden.
New categorical_feature is ['education_level', 'family_status', 'gender', 'name_contract_type']


[10]	cv_agg's auc: 0.696905 + 0.00691063
[20]	cv_agg's auc: 0.703649 + 0.00666409
[30]	cv_agg's auc: 0.711158 + 0.00707587
[40]	cv_agg's auc: 0.71479 + 0.00732237
[50]	cv_agg's auc: 0.717566 + 0.00749504
[60]	cv_agg's auc: 0.719275 + 0.00760309
[70]	cv_agg's auc: 0.720783 + 0.0075849
[80]	cv_agg's auc: 0.721797 + 0.00730984
[90]	cv_agg's auc: 0.72269 + 0.0074417
[100]	cv_agg's auc: 0.723492 + 0.00770132
[110]	cv_agg's auc: 0.724295 + 0.00764219
[120]	cv_agg's auc: 0.724879 + 0.00747699
[130]	cv_agg's auc: 0.725582 + 0.00742351
[140]	cv_agg's auc: 0.726095 + 0.00753005
[150]	cv_agg's auc: 0.726276 + 0.00720334
[160]	cv_agg's auc: 0.727036 + 0.00698766
[170]	cv_agg's auc: 0.727247 + 0.00680673
[180]	cv_agg's auc: 0.72756 + 0.00672556
[190]	cv_agg's auc: 0.727561 + 0.00658946
[200]	cv_agg's auc: 0.728156 + 0.00659139
[210]	cv_agg's auc: 0.728322 + 0.00657838
[220]	cv_agg's auc: 0.728559 + 0.00657713
[230]	cv_agg's auc: 0.728642 + 0.00648552
[240]	cv_agg's auc: 0.72901 + 0.00659195
[250]	c

In [None]:
# 7-fold cross-validation of the LightGBM classifier on the training part.
cv = KFold(n_splits=7, random_state=435, shuffle=True)

# Fixed: the key "boosting_type " carried a trailing space and therefore did
# not name the intended parameter.
lgbm_params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    #"eval_metric": "auc",
    "learning_rate": 0.1,
    "num_boost_round": 270,
    "reg_lambda": 100,
    "max_depth": 3,
    "n_jobs": -1,
    "seed": 27,
    'device_type': 'cpu'
}

lgb_estimators, lgb_oof = lightgbm_cv_fit(
    lgbm_params, x_train, y_train, cv, categorical=categorical_feature_names
)

print(f"Out of fold ROC AUC { metrics.roc_auc_score(y_train, lgb_oof)}")

Sat Apr 24 05:48:03 2021, Cross-Validation, 77065 rows, 25 cols


Found `num_boost_round` in params. Will use it instead of argument
categorical_feature in Dataset is overridden.
New categorical_feature is ['education_level', 'family_status', 'gender', 'name_contract_type']


Fold 1, Valid score = 0.72902


Found `num_boost_round` in params. Will use it instead of argument
categorical_feature in Dataset is overridden.
New categorical_feature is ['education_level', 'family_status', 'gender', 'name_contract_type']


Fold 2, Valid score = 0.70809


Found `num_boost_round` in params. Will use it instead of argument
categorical_feature in Dataset is overridden.
New categorical_feature is ['education_level', 'family_status', 'gender', 'name_contract_type']


Fold 3, Valid score = 0.73072


Found `num_boost_round` in params. Will use it instead of argument
categorical_feature in Dataset is overridden.
New categorical_feature is ['education_level', 'family_status', 'gender', 'name_contract_type']


Fold 4, Valid score = 0.73035


Found `num_boost_round` in params. Will use it instead of argument
categorical_feature in Dataset is overridden.
New categorical_feature is ['education_level', 'family_status', 'gender', 'name_contract_type']


Fold 5, Valid score = 0.71788


Found `num_boost_round` in params. Will use it instead of argument
categorical_feature in Dataset is overridden.
New categorical_feature is ['education_level', 'family_status', 'gender', 'name_contract_type']


Fold 6, Valid score = 0.73257


Found `num_boost_round` in params. Will use it instead of argument
categorical_feature in Dataset is overridden.
New categorical_feature is ['education_level', 'family_status', 'gender', 'name_contract_type']


Fold 7, Valid score = 0.75041
Score by each fold: [0.72902, 0.70809, 0.73072, 0.73035, 0.71788, 0.73257, 0.75041]
Out of fold ROC AUC 0.7274452556929765


In [None]:
# Score the hold-out set with every fold model: one row of positive-class
# probabilities per estimator. The row count follows the number of fitted
# estimators instead of a hard-coded 7.
result_lgb = np.zeros((len(lgb_estimators), x_valid.shape[0]))
for i, estimator in enumerate(lgb_estimators):
    result_lgb[i] = estimator.predict_proba(x_valid)[:, 1]

# Combine the per-model predictions column-wise in four different ways.
result_lgb_mean = np.mean(result_lgb, axis=0)
result_lgb_max = np.max(result_lgb, axis=0)
result_lgb_min = np.min(result_lgb, axis=0)
# Geometric mean: product over models, then the n-th root.
result_lgb_gmean = np.prod(result_lgb, axis=0) ** (1 / result_lgb.shape[0])

print(f"result_lgb_mean: {metrics.roc_auc_score(y_valid, result_lgb_mean)}")
print(f"result_lgb_min: {metrics.roc_auc_score(y_valid, result_lgb_min)}")
print(f"result_lgb_max: {metrics.roc_auc_score(y_valid, result_lgb_max)}")
print(f"result_lgb_gmean: {metrics.roc_auc_score(y_valid, result_lgb_gmean)}")

result_lgb_mean: 0.7303782304464836
result_lgb_min: 0.7288965135608407
result_lgb_max: 0.7294332586273377
result_lgb_gmean: 0.7303275207151326


## Обучение catboost

In [None]:
# Cast the categorical columns to str before they are handed to CatBoost as
# cat_features.
data_ext[categorical_feature_names] = data_ext[categorical_feature_names].astype(str)

# 70/30 hold-out split with a fixed seed. The previous assignment of
# x_train / y_train from the full frame was dead code (overwritten below), so
# it is dropped.
train_ext, valid_ext = train_test_split(
    data_ext, train_size=0.7, shuffle=True, random_state=1,
)

# Reassign instead of mutating the split results in place.
train_ext = train_ext.reset_index(drop=True)
valid_ext = valid_ext.reset_index(drop=True)

y_train = train_ext['target']
x_train = train_ext.drop(columns=['target'])

y_valid = valid_ext['target']
x_valid = valid_ext.drop(columns=['target'])

# CatBoost settings used for the exploratory cross-validation run.
cb_params = {
    "n_estimators": 1000,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 10,
    "max_depth": 3,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 100,
    "thread_count": 10,
    "random_seed": 27,
    "learning_rate": 0.1,
    "one_hot_max_size": 5,
}

# Stratified 7-fold CV on the training part only.
# NOTE(review): both `n_estimators` (params) and `iterations` (argument) are
# given; the logged remaining-time estimate suggests the 10000 from the
# argument is the one in effect — confirm and drop the other.
cv_result = cb.cv(
    pool=cb.Pool(x_train, y_train, cat_features=categorical_feature_names),
    params=cb_params,
    plot=True,
    shuffle=True,
    stratified=True,
    seed=42,
    iterations=10000,
    early_stopping_rounds=100,
    fold_count=7,
    as_pandas=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.5817029	best: 0.5817029 (0)
10:	test: 0.6935256	best: 0.6935832 (9)	total: 5.47s	remaining: 1h 22m 44s
20:	test: 0.7009810	best: 0.7009810 (20)
30:	test: 0.7073673	best: 0.7073673 (30)
40:	test: 0.7110238	best: 0.7110238 (40)
50:	test: 0.7136415	best: 0.7136415 (50)
60:	test: 0.7149013	best: 0.7149013 (60)
70:	test: 0.7160810	best: 0.7160810 (70)
80:	test: 0.7173489	best: 0.7173489 (80)	total: 35.3s	remaining: 1h 11m 59s
90:	test: 0.7178881	best: 0.7178951 (89)
100:	test: 0.7190680	best: 0.7190680 (100)
110:	test: 0.7196277	best: 0.7196277 (110)
120:	test: 0.7201358	best: 0.7201358 (120)
130:	test: 0.7210059	best: 0.7210059 (130)
140:	test: 0.7212570	best: 0.7212570 (140)
150:	test: 0.7216957	best: 0.7216957 (150)
160:	test: 0.7223092	best: 0.7223092 (160)
170:	test: 0.7226693	best: 0.7226693 (170)
180:	test: 0.7230187	best: 0.7230671 (179)
190:	test: 0.7231864	best: 0.7233079 (188)
200:	test: 0.7234818	best: 0.7235027 (199)
210:	test: 0.7237398	best: 0.7237398 (210)
220:	te

In [None]:
# Author's note: 580 with the 12 features dropped via permutation importance
# and the original features (presumably an iteration count — TODO confirm).
cb_params = dict(
    n_estimators=417,  # 635 was tried earlier
    loss_function="Logloss",
    eval_metric="AUC",
    task_type="CPU",  # 'GPU' is the alternative that was tried
    max_bin=20,
    verbose=25,
    max_depth=3,
    l2_leaf_reg=100,
    thread_count=10,
    random_seed=27,
    learning_rate=0.1,
    one_hot_max_size=5,
)

# Fit one CatBoost model per fold of `cv`; the helper returns the estimators,
# out-of-fold predictions, tree counts and the train / valid AUC values that
# are printed below.
cb_estimators, cb_oof, cb_num_trees, train_auc, valid_auc = catboost_cv_fit(
    cb_params,
    x_train,
    y_train,
    cv,
    categorical=categorical_feature_names,
)

print(f"Mean train ROC AUC {train_auc}")
print(f"Mean valid ROC AUC {valid_auc}")
print(f"Out of fold ROC AUC { metrics.roc_auc_score(y_train, cb_oof)}")

Sat Apr 24 05:53:54 2021, Cross-Validation, 77065 rows, 25 cols
0:	total: 96.7ms	remaining: 40.2s
25:	total: 2.46s	remaining: 37s
50:	total: 4.65s	remaining: 33.4s
75:	total: 7.06s	remaining: 31.7s
100:	total: 9.4s	remaining: 29.4s
125:	total: 11.8s	remaining: 27.4s
150:	total: 14s	remaining: 24.7s
175:	total: 16.2s	remaining: 22.1s
200:	total: 18.5s	remaining: 19.9s
225:	total: 20.7s	remaining: 17.5s
250:	total: 22.9s	remaining: 15.2s
275:	total: 25.1s	remaining: 12.8s
300:	total: 27.3s	remaining: 10.5s
325:	total: 29.4s	remaining: 8.21s
350:	total: 31.6s	remaining: 5.95s
375:	total: 34s	remaining: 3.7s
400:	total: 36.2s	remaining: 1.44s
416:	total: 37.6s	remaining: 0us
Fold 1, Valid score = 0.72185
0:	total: 97.5ms	remaining: 40.6s
25:	total: 2.42s	remaining: 36.3s
50:	total: 4.7s	remaining: 33.7s
75:	total: 6.94s	remaining: 31.1s
100:	total: 9.13s	remaining: 28.6s
125:	total: 11.5s	remaining: 26.5s
150:	total: 13.9s	remaining: 24.5s
175:	total: 16.2s	remaining: 22.2s
200:	total: 18.

In [None]:
# Score the hold-out set with every CatBoost fold model.
# Predict on x_valid, not valid_ext: valid_ext still carries the 'target'
# column, while the fold models were fitted on x_train, which has it dropped.
result_catb = np.zeros((len(cb_estimators), x_valid.shape[0]))
for i, estimator in enumerate(cb_estimators):
    result_catb[i] = estimator.predict_proba(x_valid)[:, 1]

# Combine the per-model predictions column-wise in four different ways.
result_catb_mean = np.mean(result_catb, axis=0)
result_catb_max = np.max(result_catb, axis=0)
result_catb_min = np.min(result_catb, axis=0)
# Geometric mean: product over models, then the n-th root.
result_catb_gmean = np.prod(result_catb, axis=0) ** (1 / result_catb.shape[0])

print(f"result_catb_mean: {metrics.roc_auc_score(y_valid, result_catb_mean)}")
print(f"result_catb_min: {metrics.roc_auc_score(y_valid, result_catb_min)}")
print(f"result_catb_max: {metrics.roc_auc_score(y_valid, result_catb_max)}")
print(f"result_catb_gmean: {metrics.roc_auc_score(y_valid, result_catb_gmean)}")

CatBoostError: ignored

## Обучение xgboost

In [None]:
# One-hot encode the frame with pandas before building the XGBoost matrices.
data_ext_dummies = pd.get_dummies(data_ext)

# Same 70/30 hold-out split parameters as used for the other models.
train_ext, valid_ext = train_test_split(
    data_ext_dummies, train_size=0.7, shuffle=True, random_state=1,
)

# Reassign instead of mutating the split results in place.
train_ext = train_ext.reset_index(drop=True)
valid_ext = valid_ext.reset_index(drop=True)

y_train = train_ext['target']
x_train = train_ext.drop(columns=['target'])

y_valid = valid_ext['target']
x_valid = valid_ext.drop(columns=['target'])

dtrain = xgb.DMatrix(data=x_train, label=y_train)
dvalid = xgb.DMatrix(data=x_valid, label=y_valid)

# XGBoost settings for the exploratory cross-validation run.
# - `eta` is given as a number rather than the string "0.1".
# - The learner seed parameter is `seed`; the former "random_seed" key is not
#   an XGBoost parameter, so the model-side randomness (subsample /
#   colsample) was not actually seeded by it.
xgb_params = {
    "objective": "binary:logistic",
    "booster": "gbtree",
    "eval_metric": "auc",
    "eta": 0.1,
    "max_depth": 3,
    "gamma": 10,
    "subsample": 0.85,
    "colsample_bytree": 0.7,
    "colsample_bylevel": 0.632,
    "min_child_weight": 30,
    "alpha": 0,
    "lambda": 0,
    "nthread": 10,
    "seed": 42,
}

# Stratified 7-fold CV with early stopping; progress is logged every 10 rounds.
cv_result_xgb = xgb.cv(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=2000,
    early_stopping_rounds=100,
    verbose_eval=10,
    stratified=True,
    seed=42,
    metrics="auc",
    shuffle=True,
    nfold=7,
)

[0]	train-auc:0.647359+0.00420795	test-auc:0.641877+0.00759906
[10]	train-auc:0.702856+0.00190008	test-auc:0.697337+0.00736938
[20]	train-auc:0.706565+0.00126672	test-auc:0.700674+0.00772146
[30]	train-auc:0.711756+0.00102757	test-auc:0.704859+0.00730126
[40]	train-auc:0.719267+0.00113465	test-auc:0.710985+0.00733271
[50]	train-auc:0.725029+0.00135899	test-auc:0.715775+0.00723661
[60]	train-auc:0.728633+0.000927736	test-auc:0.718555+0.00752021
[70]	train-auc:0.731138+0.000835366	test-auc:0.72019+0.00752172
[80]	train-auc:0.732971+0.00112364	test-auc:0.721289+0.00739633
[90]	train-auc:0.734204+0.00115002	test-auc:0.721812+0.00744725
[100]	train-auc:0.735168+0.000981529	test-auc:0.722334+0.00770026
[110]	train-auc:0.736062+0.000825041	test-auc:0.72275+0.00778711
[120]	train-auc:0.736878+0.000767785	test-auc:0.722831+0.00761624
[130]	train-auc:0.737788+0.00109061	test-auc:0.723499+0.00736592
[140]	train-auc:0.738535+0.0011479	test-auc:0.723911+0.00716405
[150]	train-auc:0.739345+0.0011357

In [None]:
# Final XGBoost settings passed to the project helper `xgboost_cv_fit`.
xgb_params = {
    "objective": "binary:logistic",
    "booster": "gbtree",
    #"eval_metric": "auc",
    "eta": "0.1",
    "max_depth": 3,
    "gamma": 10,
    # NOTE(review): presumably consumed by xgboost_cv_fit (defined elsewhere)
    # as the number of boosting rounds — confirm it is not silently ignored.
    'num_boost_round': 630,
    "subsample": 0.85,
    "colsample_bytree": 0.7,
    "colsample_bylevel": 0.632,
    "min_child_weight": 30,
    "alpha": 0,
    "lambda": 0,
    "nthread": 10,
    # NOTE(review): XGBoost's documented seed parameter is `seed`; verify that
    # "random_seed" is honoured by xgboost_cv_fit.
    "random_seed": 42
}

# Fit one model per fold of `cv`; the helper returns the estimators, a second
# value bound to `encoders`, and the out-of-fold predictions scored below.
xgb_estimators, encoders, xgb_oof = xgboost_cv_fit(
    xgb_params, x_train, y_train, cv
)

print(f"Out of fold ROC AUC { metrics.roc_auc_score(y_train, xgb_oof)}")

Sat Apr 24 06:24:10 2021, Cross-Validation, 77065 rows, 40 cols
Fold 1, Valid score = 0.72578
Fold 2, Valid score = 0.7084
Fold 3, Valid score = 0.73627
Fold 4, Valid score = 0.7325
Fold 5, Valid score = 0.7193
Fold 6, Valid score = 0.73223
Fold 7, Valid score = 0.74924
Out of fold ROC AUC 0.7281289682937855


In [None]:
# Score the hold-out DMatrix with every XGBoost fold model: one row of
# predictions per estimator. The row count follows the number of fitted
# estimators instead of a hard-coded 7.
result_xgb = np.zeros((len(xgb_estimators), x_valid.shape[0]))
for i, estimator in enumerate(xgb_estimators):
    result_xgb[i] = estimator.predict(dvalid)

# Combine the per-model predictions column-wise in four different ways.
result_xgb_mean = np.mean(result_xgb, axis=0)
result_xgb_max = np.max(result_xgb, axis=0)
result_xgb_min = np.min(result_xgb, axis=0)
# Geometric mean: product over models, then the n-th root.
result_xgb_gmean = np.prod(result_xgb, axis=0) ** (1 / result_xgb.shape[0])

print(f"result_xgb_mean: {metrics.roc_auc_score(y_valid, result_xgb_mean)}")
print(f"result_xgb_min: {metrics.roc_auc_score(y_valid, result_xgb_min)}")
print(f"result_xgb_max: {metrics.roc_auc_score(y_valid, result_xgb_max)}")
print(f"result_xgb_gmean: {metrics.roc_auc_score(y_valid, result_xgb_gmean)}")

result_xgb_mean: 0.7293085026877376
result_xgb_min: 0.7279202170796033
result_xgb_max: 0.7285136050867593
result_xgb_gmean: 0.729314984703662


In [None]:
# Equal-weight arithmetic blend of the LightGBM and XGBoost mean predictions.
result_mean = 0.5 * (result_lgb_mean + result_xgb_mean)

In [None]:
# Hold-out ROC AUC of the arithmetic blend stored in result_mean.
metrics.roc_auc_score(y_valid, result_mean)

0.7301223889976421

In [None]:
# Geometric blend of the two per-library geometric means (square root of
# their element-wise product).
result_gmean = np.sqrt(result_lgb_gmean * result_xgb_gmean)

In [None]:
# Hold-out ROC AUC of the geometric blend stored in result_gmean.
metrics.roc_auc_score(y_valid, result_gmean)

0.7302432849973736

In [None]:
# Sanity check: (number of XGBoost fold models, number of hold-out rows).
result_xgb.shape

(7, 33028)

In [None]:
# Sanity check: must match result_xgb.shape before the two are stacked.
result_lgb.shape

(7, 33028)

In [None]:
# Stack the per-model prediction rows of both libraries into one matrix
# (LightGBM rows first, then XGBoost rows).
result_all = np.concatenate((result_lgb, result_xgb), axis=0)

In [None]:
# Sanity check: rows = LightGBM models + XGBoost models, columns = hold-out rows.
result_all.shape

(14, 33028)

In [None]:
# Blend every stacked model row (LightGBM + XGBoost) column-wise.
result_all_mean = np.mean(result_all, axis=0)
result_all_max = np.max(result_all, axis=0)
result_all_min = np.min(result_all, axis=0)
# Geometric mean over however many rows were stacked, instead of a
# hand-written 14-term product.
result_all_gmean = np.prod(result_all, axis=0) ** (1 / result_all.shape[0])

# Labels now name the combined blend; they previously read "result_xgb_*"
# (copy-paste from the XGBoost cell) although the values are result_all_*.
print(f"result_all_mean: {metrics.roc_auc_score(y_valid, result_all_mean)}")
print(f"result_all_min: {metrics.roc_auc_score(y_valid, result_all_min)}")
print(f"result_all_max: {metrics.roc_auc_score(y_valid, result_all_max)}")
print(f"result_all_gmean: {metrics.roc_auc_score(y_valid, result_all_gmean)}")

result_xgb_mean: 0.7301223889976421
result_xgb_min: 0.7285127269961731
result_xgb_max: 0.7284484007628851
result_xgb_gmean: 0.7302432849973736
