In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
import xgboost
import optuna
from optuna.samplers import TPESampler
import joblib
from joblib import dump, load
import sklearn.metrics
from sklearn.metrics import accuracy_score
import gc
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans

In [None]:
train_flag = True # Set to True to train the models; set to False to load previously trained models instead

# 1. Загрузка датасета

In [None]:
# Location of the semicolon-separated training table (machine-specific absolute path).
train_csv_path = r"D:\for ML\HW-hack\train_hw.csv"
data = pd.read_csv(train_csv_path, sep=';')

# 2. Обработка данных

### 2.1 Выбор признаков

In [None]:
# Four feature sets are built from the raw table:
#   org - a fixed subset of log curves plus well id and depth
#   all - every column of the raw table (same object as `data`, not a copy)
#   log - everything except depth, coordinates, GROUP, FORMATION and MUDWEIGHT
#   slf - a small hand-picked subset that keeps WELL and FORMATION
label_cols = ['FORCE_2020_LITHOFACIES_LITHOLOGY', 'FORCE_2020_LITHOFACIES_CONFIDENCE']
org_feature_cols = ['WELL', 'DEPTH_MD', 'CALI', 'RSHA', 'RMED', 'RDEP', 'RHOB', 'GR', 'NPHI', 'PEF', 'DTC', 'SP', 'BS']
slf_feature_cols = ['WELL', 'FORMATION', 'DEPTH_MD', 'DRHO', 'DTC', 'SP', 'GR', 'RDEP']
log_excluded_cols = ['DEPTH_MD', 'X_LOC', 'Y_LOC', 'Z_LOC', 'GROUP', 'FORMATION', 'MUDWEIGHT']

data_org = data[org_feature_cols + label_cols]
data_all = data
data_log = data.drop(columns=log_excluded_cols)
data_slf = data[slf_feature_cols + label_cols]

### 2.2 Восстановление (импутация) значений

#### 2.2.1 Разбиение на группы по скважинам

In [None]:
def _split_by_well(frame):
    """Split `frame` into one independent DataFrame per WELL value.

    Groups come back in the order `groupby("WELL")` yields them. Each group is
    copied explicitly because the per-well frames are modified in place by later
    cells; without the copy pandas treats them as slices of `frame` and emits
    SettingWithCopy warnings on assignment.
    """
    return [well_frame.copy() for _, well_frame in frame.groupby("WELL")]


well_org_fill_md = _split_by_well(data_org)
well_all_fill_md = _split_by_well(data_all)
well_log_fill_md = _split_by_well(data_log)
well_slf_fill_md = _split_by_well(data_slf)

#### 2.2.2 Импутация значений

In [None]:
LITHOLOGY_COL = 'FORCE_2020_LITHOFACIES_LITHOLOGY'
# Columns that are never imputed as numeric logs: the two label columns, the well id
# and the categorical GROUP / FORMATION columns.
NOT_NUMERIC_LOGS = {'FORCE_2020_LITHOFACIES_CONFIDENCE', LITHOLOGY_COL, 'WELL', 'GROUP', 'FORMATION'}


def _lithology_means(frame):
    """Return the per-lithology mean of every numeric column of `frame`.

    `numeric_only=True` keeps string columns (WELL, GROUP, FORMATION) out of the
    aggregation; recent pandas raises on them instead of dropping them silently.
    A lithology with no observations for a column gets that column's mean over
    the other lithologies.
    """
    means = frame.groupby(LITHOLOGY_COL).mean(numeric_only=True)
    return means.fillna(means.mean())


def _impute_wells(wells, lithology_means, categorical_cols=()):
    """Return imputed copies of the per-well frames in `wells`.

    For every well:
      * each column in `categorical_cols` has its gaps filled with the well's most
        frequent value (raises IndexError if the column is entirely missing);
      * each numeric log with some values has its gaps filled with the well's most
        frequent value of that log;
      * each numeric log that is entirely missing is filled with the mean of that
        log for the row's lithology, taken from `lithology_means`.

    NOTE(review): the last rule looks up values by the lithology label, which is
    also the prediction target, so it cannot be reproduced on unlabeled data.
    """
    imputed = []
    for well in wells:
        well = well.copy()
        for col in categorical_cols:
            well[col] = well[col].fillna(well[col].mode().iloc[0])
        for col in well.columns:
            if col in NOT_NUMERIC_LOGS:
                continue
            missing = well[col].isna()
            if missing.all():
                # Whole log absent in this well: fall back to per-lithology means.
                well[col] = well[LITHOLOGY_COL].map(lithology_means[col])
            elif missing.any():
                well.loc[missing, col] = well[col].mode().iloc[0]
        imputed.append(well)
    return imputed


# Org
well_org_fill_md = _impute_wells(well_org_fill_md, _lithology_means(data_org))

# All
well_all_fill_md = _impute_wells(well_all_fill_md, _lithology_means(data_all),
                                 categorical_cols=('GROUP', 'FORMATION'))

# Log
well_log_fill_md = _impute_wells(well_log_fill_md, _lithology_means(data_log))

# Slf
well_slf_fill_md = _impute_wells(well_slf_fill_md, _lithology_means(data_slf),
                                 categorical_cols=('FORMATION',))

### 2.3 Подготовка данных к обучению

#### 2.3.1 Скалирование данных

In [None]:
# Depth (DEPTH_MD) is not scaled.
# Every well is scaled on its own: the shared RobustScaler is re-fitted on each
# well's rows, so after this cell it holds the fit of the last "slf" well.
scaler = RobustScaler()

org_scaled_cols = ['CALI', 'RSHA', 'RMED', 'RDEP', 'RHOB', 'GR', 'NPHI', 'PEF', 'DTC', 'SP', 'BS']
all_scaled_cols = ['X_LOC', 'Y_LOC', 'Z_LOC', 'CALI', 'RSHA', 'RMED', 'RDEP', 'RHOB', 'GR', 'SGR', 'NPHI', 'PEF',
                   'DTC', 'SP', 'BS', 'ROP', 'DTS', 'DCAL', 'DRHO', 'MUDWEIGHT', 'RMIC',
                   'ROPA', 'RXO']
log_scaled_cols = ['CALI', 'RSHA', 'RMED', 'RDEP', 'RHOB', 'GR', 'SGR', 'NPHI',
                   'PEF', 'DTC', 'SP', 'BS', 'ROP', 'DTS', 'DCAL', 'DRHO', 'RMIC', 'ROPA',
                   'RXO']
slf_scaled_cols = ['DRHO', 'DTC', 'SP', 'GR', 'RDEP']

scaling_plan = (
    (well_org_fill_md, org_scaled_cols),
    (well_all_fill_md, all_scaled_cols),
    (well_log_fill_md, log_scaled_cols),
    (well_slf_fill_md, slf_scaled_cols),
)
for well_frames, scaled_cols in scaling_plan:
    for well_frame in well_frames:
        well_frame[scaled_cols] = scaler.fit_transform(well_frame[scaled_cols])

#### 2.3.2 Объединение индивидуальных скважинных данных

In [None]:
# Stack the per-well frames of each feature set back into one table.
data_org_fill_md, data_all_fill_md, data_log_fill_md, data_slf_fill_md = (
    pd.concat(well_frames)
    for well_frames in (well_org_fill_md, well_all_fill_md, well_log_fill_md, well_slf_fill_md)
)

#### 2.3.3 Разбиение на фичи и таргет + работа с таргетом

In [None]:
# The lithology column is the target; the confidence column is dropped from the features.
# WELL is removed from the "org" and "log" feature sets and kept in "all" and "slf".
target_col = 'FORCE_2020_LITHOFACIES_LITHOLOGY'
non_feature_cols = [target_col, 'FORCE_2020_LITHOFACIES_CONFIDENCE']

X_org_md = data_org_fill_md.drop(columns=['WELL'] + non_feature_cols)
X_all_md = data_all_fill_md.drop(columns=non_feature_cols)
X_log_md = data_log_fill_md.drop(columns=['WELL'] + non_feature_cols)
X_slf_md = data_slf_fill_md.drop(columns=non_feature_cols)

y_org_md = data_org_fill_md[target_col]
y_all_md = data_all_fill_md[target_col]
y_log_md = data_log_fill_md[target_col]
y_slf_md = data_slf_fill_md[target_col]

In [None]:
# Lithology codes listed in class-label order: the position of a code in this
# tuple is the integer label (0-11) it is mapped to.
_lithology_codes = (30000, 65030, 65000, 80000, 74000, 70000,
                    70032, 88000, 86000, 99000, 90000, 93000)
lithology_numbers = {code: label for label, code in enumerate(_lithology_codes)}

In [None]:
# Replace the raw lithology codes by their integer class labels.
# Not idempotent: re-running this cell maps the already-mapped labels again.
y_org_md, y_all_md, y_log_md, y_slf_md = (
    codes.map(lithology_numbers)
    for codes in (y_org_md, y_all_md, y_log_md, y_slf_md)
)

#### 2.3.4 Разбиение на тренировочный и тестовый сеты

In [None]:
# 70/30 split, stratified on the class label, same seed for every feature set.
split_options = dict(train_size=0.70, random_state=42)

X_train_org_md, X_test_org_md, y_train_org_md, y_test_org_md = train_test_split(
    X_org_md, y_org_md, stratify=y_org_md, **split_options)
X_train_all_md, X_test_all_md, y_train_all_md, y_test_all_md = train_test_split(
    X_all_md, y_all_md, stratify=y_all_md, **split_options)
X_train_log_md, X_test_log_md, y_train_log_md, y_test_log_md = train_test_split(
    X_log_md, y_log_md, stratify=y_log_md, **split_options)
X_train_slf_md, X_test_slf_md, y_train_slf_md, y_test_slf_md = train_test_split(
    X_slf_md, y_slf_md, stratify=y_slf_md, **split_options)

# 3. Обучение

### 3.1 Подбор гиперпараметров

In [None]:
if train_flag:
    def objective_xgb(trial):
        """Optuna objective: validation accuracy of an XGBoost model on the "org" features.

        The data is split 70/30 (stratified, fixed seed) on every call, a booster is
        trained on the GPU with the hyperparameters sampled from `trial`, and the
        accuracy on the held-out part is returned (to be maximised).
        """
        X_train, X_test, y_train, y_test = train_test_split(X_org_md, y_org_md, train_size=0.70, random_state=42, stratify=y_org_md)
        dtrain = xgboost.DMatrix(X_train, label=y_train)
        dvalid = xgboost.DMatrix(X_test, label=y_test)

        # The native xgboost.train() API takes the number of trees as its
        # num_boost_round argument; an "n_estimators" entry in the param dict is
        # not used by it. `step` must be passed by keyword: it is keyword-only in
        # current Optuna releases.
        num_round = trial.suggest_int("n_estimators", 50, 500, step=50)

        param = {
            "num_class": 12,
            "verbosity": 0,
            "objective": "multi:softmax",
            "tree_method": "gpu_hist",
            "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 1.0, log=False),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 1.0, log=False),
            "subsample": trial.suggest_float("subsample", 0.2, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        }

        # Tree-specific parameters are only sampled for tree boosters.
        if param["booster"] in ["gbtree", "dart"]:
            param["max_depth"] = trial.suggest_int("max_depth", 6, 16, step=1)
            param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 8, step=1)
            param["learning_rate"] = trial.suggest_float("learning_rate", 1e-3, 1.0, log=False)
            param["gamma"] = trial.suggest_float("gamma", 1e-8, 1e-5, log=False)
            param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

        # Dropout-specific parameters are only sampled for the DART booster.
        if param["booster"] == "dart":
            param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
            param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
            param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1e-5, log=False)
            param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1e-5, log=False)

        bst = xgboost.train(param, dtrain, num_round)
        preds = bst.predict(dvalid)
        return sklearn.metrics.accuracy_score(y_test, preds)

In [None]:
if train_flag:
    def objective_cb(trial):
        """Optuna objective: validation accuracy of a 30-iteration CatBoost model on the "org" features.

        Learning rate, depth, bootstrap type and grow policy are sampled from
        `trial`; the model is fitted on a stratified 70 % split and scored on the
        remaining 30 %.
        """
        X_fit, X_val, y_fit, y_val = train_test_split(
            X_org_md, y_org_md, train_size=0.70, random_state=42, stratify=y_org_md)

        sampled = dict(
            iterations=30,
            loss_function="MultiClass",
            task_type="GPU",
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.8),
            depth=trial.suggest_int("depth", 10, 16),
            bootstrap_type=trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli"]),
            grow_policy=trial.suggest_categorical("grow_policy", ["SymmetricTree", "Depthwise"]),
        )

        candidate = CatBoostClassifier(**sampled)
        candidate.fit(X_fit, y_fit, verbose=False, plot=False)
        return sklearn.metrics.accuracy_score(y_val, candidate.predict(X_val))

In [None]:
if train_flag:
    # Seed the sampler so the sequence of suggested hyperparameters is repeatable.
    # The search stops after 1000 trials or 1600 seconds, whichever comes first,
    # so the number of finished trials can still differ between runs.
    study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
    study.optimize(objective_xgb, n_trials=1000, timeout=1600)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

In [None]:
if train_flag:
    # Seed the sampler so the sequence of suggested hyperparameters is repeatable.
    # The search stops after 1000 trials or 1600 seconds, whichever comes first,
    # so the number of finished trials can still differ between runs.
    study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
    study.optimize(objective_cb, n_trials=1000, timeout=1600)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

### 3.2 Настройка моделей

In [None]:
# 12 x 12 penalty matrix with a zero diagonal. score() subtracts A[true, predicted]
# for every sample, and the row means are used as class weights for CatBoost.
# Rows/columns are indexed by the integer class labels (0-11).
# NOTE(review): presumably the rows follow the same lithology order as
# `lithology_numbers` - confirm against the source of the matrix.
A=np.array([[0.    , 2.    , 3.5   , 3.    , 3.75  , 3.5   , 3.5   , 4.    , 4.    , 2.5   , 3.875 , 3.25  ],
            [2.    , 0.    , 2.375 , 2.75  , 4.    , 3.75  , 3.75  , 3.875 , 4.    , 3.    , 3.75  , 3.    ],
            [3.5   , 2.375 , 0.    , 2.    , 3.5   , 3.5   , 3.75  , 4.    , 4.    , 2.75  , 3.25  , 3.    ],
            [3.    , 2.75  , 2.    , 0.    , 2.5   , 2.    , 2.25  , 4.    , 4.    , 3.375 , 3.75  , 3.25  ],
            [3.75  , 4.    , 3.5   , 2.5   , 0.    , 2.625 , 2.875 , 3.75  , 3.25  , 3.    , 4.    , 3.625 ],
            [3.5   , 3.75  , 3.5   , 2.    , 2.625 , 0.    , 1.375 , 4.    , 3.75  , 3.5   , 4.    , 3.625 ],
            [3.5   , 3.75  , 3.75  , 2.25  , 2.875 , 1.375 , 0.    , 4.    , 3.75  , 3.125 , 4.    , 3.75  ],
            [4.    , 3.875 , 4.    , 4.    , 3.75  , 4.    , 4.    , 0.    , 2.75  , 3.75  , 3.75  , 4.    ],
            [4.    , 4.    , 4.    , 4.    , 3.25  , 3.75  , 3.75  , 2.75  , 0.    , 4.    , 4.    , 3.875 ],
            [2.5   , 3.    , 2.75  , 3.375 , 3.    , 3.5   , 3.125 , 3.75  , 4.    , 0.    , 2.5   , 3.25  ],
            [3.875 , 3.75  , 3.25  , 3.75  , 4.    , 4.    , 4.    , 3.75  , 4.    , 2.5   , 0.    , 4.    ],
            [3.25  , 3.    , 3.    , 3.25  , 3.625 , 3.625 , 3.75  , 4.    , 3.875 , 3.25  , 4.    , 0.    ]])

In [None]:
# Check of a hypothesis about the origin of the penalty matrix: compare it with the
# pairwise Euclidean distances between the per-lithology mean feature vectors.
if train_flag:
    lithology_order = [30000, 65030, 65000, 80000, 74000, 70000, 70032, 88000, 86000, 99000, 90000, 93000]

    lm = data_log.drop(['FORCE_2020_LITHOFACIES_CONFIDENCE', 'WELL'], axis=1).groupby('FORCE_2020_LITHOFACIES_LITHOLOGY').mean()
    # Alternative feature sets for the same check:
    #lm = data_org.drop(['WELL', 'FORCE_2020_LITHOFACIES_CONFIDENCE'], axis=1).groupby('FORCE_2020_LITHOFACIES_LITHOLOGY').mean()
    #lm = data_all.drop(['WELL','FORMATION', 'GROUP', 'FORCE_2020_LITHOFACIES_CONFIDENCE'], axis=1).groupby('FORCE_2020_LITHOFACIES_LITHOLOGY').mean()

    # One row of mean feature values per lithology, in class-label order.
    centroids = lm.loc[lithology_order].to_numpy(dtype="float64")
    # fd[i, j] = Euclidean distance between the mean vectors of classes i and j.
    deltas = centroids[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    fd = np.sqrt((deltas ** 2).sum(axis=-1))

    np.set_printoptions(linewidth=120)
    # The divisors are normalisation constants chosen by the author
    # (presumably to bring the distances onto the scale of A - TODO confirm).
    # np.round replaces np.round_, which newer NumPy releases no longer provide.
    print(np.round(fd/16.94412535, decimals=3)) # distances from the well-log data
    #print(np.round(fd/111.98208369, decimals=3)) # distances from the organisers' feature set
    #print(np.round(fd/31168.29385083, decimals=3)) # distances from all data

In [None]:
# Class weights derived from the penalty matrix used by the accuracy metric.
if train_flag:
    classes = np.unique(y_train_org_md)
    # Weight of a class = mean penalty in its row of A. The array is indexed by
    # class label, so it must cover all rows of A even if some label is absent
    # from the training labels.
    A_weights = A.mean(axis=1)
    class_weights = dict(zip(classes, A_weights[classes]))

In [None]:
if train_flag:
    # Per-sample weights: element k is the class weight of the k-th row of the
    # corresponding y_train series, so every array is aligned row-by-row with its
    # training set and has the same length.

    # Org
    y_train_org_md_weights = A_weights[y_train_org_md.to_numpy(dtype=int)]

    # All
    y_train_all_md_weights = A_weights[y_train_all_md.to_numpy(dtype=int)]

    # Log
    y_train_log_md_weights = A_weights[y_train_log_md.to_numpy(dtype=int)]

    # Slf
    y_train_slf_md_weights = A_weights[y_train_slf_md.to_numpy(dtype=int)]

In [None]:
if train_flag:
    # Random forest settings: 50 trees, fixed seed, balanced class weights, 8 workers.
    rf_settings = {
        'n_estimators': 50,
        'random_state': 42,
        'class_weight': 'balanced',
        'n_jobs': 8,
    }
    model_rf = RandomForestClassifier(**rf_settings)

In [None]:
if train_flag:
    # CatBoost configuration used for the final models.
    param = dict(
        loss_function='MultiClass',
        eval_metric='MultiClass',
        early_stopping_rounds=20,
        task_type='GPU',
        nan_mode="Forbidden",
        random_seed=42,
        class_weights=class_weights,
        iterations=2500,
        learning_rate=0.23714547752509413,
        depth=15,  # the original cell carried the note "11" here (presumably an alternative depth)
        bootstrap_type='Bernoulli',
        grow_policy='Depthwise',
    )

    model_cb = CatBoostClassifier(**param)

### 3.3 Обучение моделей

#### 3.3.1 CatBoost

In [None]:
if train_flag:
    # One entry per feature set:
    # (model file, X_train, y_train, X_test, categorical feature names or None).
    # The same `model_cb` object is re-fitted for every entry, so each model is
    # saved and used for prediction before the next fit.
    catboost_runs = [
        ('cst_model_org_md.dat', X_train_org_md, y_train_org_md, X_test_org_md, None),
        ('cst_model_all_md.dat', X_train_all_md, y_train_all_md, X_test_all_md, ['WELL', 'GROUP', 'FORMATION']),
        ('cst_model_log_md.dat', X_train_log_md, y_train_log_md, X_test_log_md, None),
        ('cst_model_slf_md.dat', X_train_slf_md, y_train_slf_md, X_test_slf_md, ['WELL', 'FORMATION']),
    ]

    catboost_predictions = {}
    for model_file, X_fit, y_fit, X_eval, cat_features in catboost_runs:
        cst = model_cb.fit(X_fit, y_fit, verbose=False, plot=True, cat_features=cat_features)
        cst.save_model(model_file)
        catboost_predictions[model_file] = cst.predict(X_eval)
        if model_file == 'cst_model_all_md.dat':
            # Result is not stored or displayed; kept from the original cell.
            cst.get_feature_importance()

    y_pred_model_cb_org_md = catboost_predictions['cst_model_org_md.dat']
    y_pred_model_cb_all_md = catboost_predictions['cst_model_all_md.dat']
    y_pred_model_cb_log_md = catboost_predictions['cst_model_log_md.dat']
    y_pred_model_cb_slf_md = catboost_predictions['cst_model_slf_md.dat']

#### 3.3.2 XGBoost

In [None]:
if train_flag:
    # The problem with releasing GPU memory has already been solved
    # (see the explicit __del__ calls in _train_xgb).
    param = {
        'tree_method': 'gpu_hist',
        'num_class': 12,
        'verbosity': 0,
        'objective': 'multi:softmax',
        'eval_metric': 'mlogloss',
        # NOTE(review): xgboost.train() takes the number of rounds from `num_round`
        # below; this entry is not used by the native API.
        'n_estimators': 300,
        'booster': 'gbtree',
        'reg_lambda': 0.7782570197655274,
        'reg_alpha': 0.6708911140908719,
        'subsample': 0.4074545609407748,
        'colsample_bytree': 0.9676511295252206,
        'max_depth': 10,
        'min_child_weight': 4,
        'learning_rate': 0.2781337326653397,
        'gamma': 4.6464337473120594e-06,
        'grow_policy': 'lossguide'}
    num_round = 3000

    def _share_train_categories(X_train, X_test, columns):
        """Return copies of both frames with `columns` cast to one categorical dtype.

        The categories are learned from X_train and applied to X_test, so the same
        string always gets the same category code in both frames. Casting each
        frame separately would derive the codes from the values present in that
        frame only. Test values unseen in training become missing values.
        """
        shared = {col: X_train[col].astype('category').dtype for col in columns}
        return X_train.astype(shared), X_test.astype(shared)

    def _train_xgb(X_train, y_train, X_test, y_test, model_path, enable_categorical=False):
        """Train a booster with early stopping, save it to `model_path` and return test predictions.

        The booster is dumped, released, reloaded from disk and released again
        after predicting; this sequence is how the original cell frees GPU memory.
        """
        dtrain = xgboost.DMatrix(X_train, label=y_train, enable_categorical=enable_categorical)
        dtest = xgboost.DMatrix(X_test, label=y_test, enable_categorical=enable_categorical)
        bst = xgboost.train(param, dtrain, num_round, evals=[(dtrain, 'train'), (dtest, 'test')], early_stopping_rounds=10)
        joblib.dump(bst, model_path)
        bst.__del__()  # drop the model and its links to clear GPU memory
        bst = joblib.load(model_path)
        predictions = bst.predict(dtest)
        bst.__del__()  # drop the model and its links to clear GPU memory
        gc.collect()
        return predictions

    # Org
    y_pred_model_xgb_org_md = _train_xgb(X_train_org_md, y_train_org_md, X_test_org_md, y_test_org_md,
                                         'xgb_model_org_md.dat')
    print(y_pred_model_xgb_org_md)

    # All
    X_train_all_xgb, X_test_all_xgb = _share_train_categories(
        X_train_all_md, X_test_all_md, ['WELL', 'GROUP', 'FORMATION'])
    y_pred_model_xgb_all_md = _train_xgb(X_train_all_xgb, y_train_all_md, X_test_all_xgb, y_test_all_md,
                                         'xgb_model_all_md.dat', enable_categorical=True)
    del X_train_all_xgb, X_test_all_xgb
    print(y_pred_model_xgb_all_md)

    # Log
    y_pred_model_xgb_log_md = _train_xgb(X_train_log_md, y_train_log_md, X_test_log_md, y_test_log_md,
                                         'xgb_model_log_md.dat')
    print(y_pred_model_xgb_log_md)

    # Slf
    X_train_slf_xgb, X_test_slf_xgb = _share_train_categories(
        X_train_slf_md, X_test_slf_md, ['WELL', 'FORMATION'])
    y_pred_model_xgb_slf_md = _train_xgb(X_train_slf_xgb, y_train_slf_md, X_test_slf_xgb, y_test_slf_md,
                                         'xgb_model_slf_md.dat', enable_categorical=True)
    del X_train_slf_xgb, X_test_slf_xgb
    print(y_pred_model_xgb_slf_md)

#### 3.3.3 Random Forest

##### 3.3.3.1 OneHotEncoder

In [None]:
# One-hot encode the categorical columns for the random forest.
# The encoder is fitted on the training rows only and then applied to both splits,
# so train and test always get the same encoded columns in the same order;
# categories unseen in training are encoded as all-zero rows (handle_unknown='ignore').
one_hot_plan = [
    ('slf', X_train_slf_md, X_test_slf_md, ['WELL', 'FORMATION']),
    ('all', X_train_all_md, X_test_all_md, ['WELL', 'FORMATION', 'GROUP']),
]

one_hot_frames = {}
for feature_set, X_fit, X_eval, categorical_cols in one_hot_plan:
    enc = OneHotEncoder(handle_unknown='ignore').fit(X_fit[categorical_cols])
    for split_name, X_part in (('train', X_fit), ('test', X_eval)):
        # The prefix gives the encoded columns string names, so the combined frame
        # does not mix integer and string column labels.
        encoded = pd.DataFrame(enc.transform(X_part[categorical_cols]).toarray()).add_prefix('ohe_')
        remaining = X_part.drop(categorical_cols, axis=1).reset_index(drop=True)
        one_hot_frames[feature_set, split_name] = pd.concat([encoded, remaining], axis=1)

X_train_slf_md_enc = one_hot_frames['slf', 'train']
X_test_slf_md_enc = one_hot_frames['slf', 'test']
X_train_all_md_enc = one_hot_frames['all', 'train']
X_test_all_md_enc = one_hot_frames['all', 'test']

##### 3.3.3.2 Training

In [None]:
# `model_rf` only exists when training is enabled, so this cell is guarded by the
# same flag as the cell that creates it.
if train_flag:
    # (X_train, y_train, X_test, model file) for every feature set. The same
    # estimator object is re-fitted each time, so it is used and saved before
    # the next fit.
    rf_runs = [
        (X_train_org_md, y_train_org_md, X_test_org_md, 'rf_model_org_md.dat'),
        (X_train_all_md_enc, y_train_all_md, X_test_all_md_enc, 'rf_model_all_md.dat'),
        (X_train_log_md, y_train_log_md, X_test_log_md, 'rf_model_log_md.dat'),
        (X_train_slf_md_enc, y_train_slf_md, X_test_slf_md_enc, 'rf_model_slf_md.dat'),
    ]

    rf_predictions = {}
    for X_fit, y_fit, X_eval, model_file in rf_runs:
        rf = model_rf.fit(X_fit, y_fit)
        rf_predictions[model_file] = rf.predict(X_eval)
        joblib.dump(rf, model_file)

    y_pred_model_rf_org_md = rf_predictions['rf_model_org_md.dat']
    y_pred_model_rf_all_md = rf_predictions['rf_model_all_md.dat']
    y_pred_model_rf_log_md = rf_predictions['rf_model_log_md.dat']
    y_pred_model_rf_slf_md = rf_predictions['rf_model_slf_md.dat']

### 3.4 Использование обученных моделей

#### 3.4.1 CatBoost

In [None]:
# Reload every saved CatBoost model and predict on the matching test set.
catboost_eval_sets = (
    ('cst_model_org_md.dat', X_test_org_md),
    ('cst_model_all_md.dat', X_test_all_md),
    ('cst_model_log_md.dat', X_test_log_md),
    ('cst_model_slf_md.dat', X_test_slf_md),
)

catboost_loaded_predictions = {}
for model_file, X_eval in catboost_eval_sets:
    cst = CatBoostClassifier()
    cst.load_model(model_file)
    catboost_loaded_predictions[model_file] = cst.predict(X_eval)

y_pred_model_cb_org_md = catboost_loaded_predictions['cst_model_org_md.dat']
y_pred_model_cb_all_md = catboost_loaded_predictions['cst_model_all_md.dat']
y_pred_model_cb_log_md = catboost_loaded_predictions['cst_model_log_md.dat']
y_pred_model_cb_slf_md = catboost_loaded_predictions['cst_model_slf_md.dat']

#### 3.4.2 XGBoost

In [None]:
# Reload every saved XGBoost model and predict on the matching test set.
# For feature sets with categorical columns the test frame is cast to the
# categorical dtype derived from the training frame, so each string gets the
# category code it had during training. Only the test DMatrix is built; the
# training matrix is not needed for prediction.
xgb_eval_sets = [
    ('xgb_model_org_md.dat', X_train_org_md, X_test_org_md, y_test_org_md, []),
    ('xgb_model_all_md.dat', X_train_all_md, X_test_all_md, y_test_all_md, ['WELL', 'GROUP', 'FORMATION']),
    ('xgb_model_log_md.dat', X_train_log_md, X_test_log_md, y_test_log_md, []),
    ('xgb_model_slf_md.dat', X_train_slf_md, X_test_slf_md, y_test_slf_md, ['WELL', 'FORMATION']),
]

xgb_loaded_predictions = {}
for model_file, X_fit, X_eval, y_eval, categorical_cols in xgb_eval_sets:
    if categorical_cols:
        train_dtypes = {col: X_fit[col].astype('category').dtype for col in categorical_cols}
        X_eval = X_eval.astype(train_dtypes)
    dtest = xgboost.DMatrix(X_eval, label=y_eval, enable_categorical=bool(categorical_cols))
    # joblib.load unpickles the file: only load model files produced by this notebook.
    bst = joblib.load(model_file)
    xgb_loaded_predictions[model_file] = bst.predict(dtest)

y_pred_model_xgb_org_md = xgb_loaded_predictions['xgb_model_org_md.dat']
y_pred_model_xgb_all_md = xgb_loaded_predictions['xgb_model_all_md.dat']
y_pred_model_xgb_log_md = xgb_loaded_predictions['xgb_model_log_md.dat']
y_pred_model_xgb_slf_md = xgb_loaded_predictions['xgb_model_slf_md.dat']

#### 3.4.3 Random Forest

##### 3.4.3.1 OneHotEncoder

In [None]:
# One-hot encode the categorical columns for the random forest.
# The encoder is fitted on the training rows only and then applied to both splits,
# so train and test always get the same encoded columns in the same order;
# categories unseen in training are encoded as all-zero rows (handle_unknown='ignore').
one_hot_plan = [
    ('slf', X_train_slf_md, X_test_slf_md, ['WELL', 'FORMATION']),
    ('all', X_train_all_md, X_test_all_md, ['WELL', 'FORMATION', 'GROUP']),
]

one_hot_frames = {}
for feature_set, X_fit, X_eval, categorical_cols in one_hot_plan:
    enc = OneHotEncoder(handle_unknown='ignore').fit(X_fit[categorical_cols])
    for split_name, X_part in (('train', X_fit), ('test', X_eval)):
        # The prefix gives the encoded columns string names, so the combined frame
        # does not mix integer and string column labels.
        encoded = pd.DataFrame(enc.transform(X_part[categorical_cols]).toarray()).add_prefix('ohe_')
        remaining = X_part.drop(categorical_cols, axis=1).reset_index(drop=True)
        one_hot_frames[feature_set, split_name] = pd.concat([encoded, remaining], axis=1)

X_train_slf_md_enc = one_hot_frames['slf', 'train']
X_test_slf_md_enc = one_hot_frames['slf', 'test']
X_train_all_md_enc = one_hot_frames['all', 'train']
X_test_all_md_enc = one_hot_frames['all', 'test']

##### 3.4.3.2 Use trained model

In [None]:
# Reload every saved random forest and predict on the matching test set.
# joblib.load unpickles the file: only load model files produced by this notebook.
rf_eval_sets = (
    ('rf_model_org_md.dat', X_test_org_md),
    ('rf_model_all_md.dat', X_test_all_md_enc),
    ('rf_model_log_md.dat', X_test_log_md),
    ('rf_model_slf_md.dat', X_test_slf_md_enc),
)

rf_loaded_predictions = {}
for model_file, X_eval in rf_eval_sets:
    rf = joblib.load(model_file)
    rf_loaded_predictions[model_file] = rf.predict(X_eval)

y_pred_model_rf_org_md = rf_loaded_predictions['rf_model_org_md.dat']
y_pred_model_rf_all_md = rf_loaded_predictions['rf_model_all_md.dat']
y_pred_model_rf_log_md = rf_loaded_predictions['rf_model_log_md.dat']
y_pred_model_rf_slf_md = rf_loaded_predictions['rf_model_slf_md.dat']

# 4. Результаты

### 4.1 Определение кастомной метрики

In [None]:
def score(y_true, y_pred, penalty_matrix=None):
    """Return the mean negative penalty of a set of predictions.

    Every sample contributes ``-penalty_matrix[true_label, predicted_label]``;
    the result is the average over all samples, so 0.0 is a perfect score and
    more negative values are worse.

    Parameters
    ----------
    y_true : array-like of class labels.
    y_pred : array-like of predicted class labels. Column vectors of shape
        (n, 1) are flattened, so the result is always a scalar.
    penalty_matrix : optional square array indexed as [true, predicted].
        Defaults to the notebook-level matrix ``A``.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    if penalty_matrix is None:
        penalty_matrix = A
    penalty_matrix = np.asarray(penalty_matrix)

    true_labels = np.asarray(y_true).astype(int).ravel()
    predicted_labels = np.asarray(y_pred).astype(int).ravel()
    if true_labels.shape[0] == 0:
        raise ValueError("score() needs at least one sample")
    if true_labels.shape[0] != predicted_labels.shape[0]:
        raise ValueError("y_true and y_pred must contain the same number of samples")

    # Fancy indexing looks up the penalty of every (true, predicted) pair at once.
    return -float(penalty_matrix[true_labels, predicted_labels].mean())

### 4.2 Подсчёт точности

In [None]:
# Custom-metric score of every model on every feature set.
# The variables carry the `_md` suffix used everywhere else in this notebook;
# no `_zr` variables are defined here.
print('org')
print(f'Result (Random Forest): {score(y_test_org_md.values, y_pred_model_rf_org_md)}')
print(f'Result (CatBoost) (logloss): {score(y_test_org_md.values, y_pred_model_cb_org_md)}')
print(f'Result (XGBoost) (logloss): {score(y_test_org_md.values, y_pred_model_xgb_org_md)}')

print('all')
print(f'Result (Random Forest): {score(y_test_all_md.values, y_pred_model_rf_all_md)}')
print(f'Result (CatBoost) (logloss): {score(y_test_all_md.values, y_pred_model_cb_all_md)}')
print(f'Result (XGBoost)(logloss): {score(y_test_all_md.values, y_pred_model_xgb_all_md)}')

print('log')
print(f'Result (Random Forest): {score(y_test_log_md.values, y_pred_model_rf_log_md)}')
print(f'Result (CatBoost) (logloss): {score(y_test_log_md.values, y_pred_model_cb_log_md)}')
print(f'Result (XGBoost) (logloss): {score(y_test_log_md.values, y_pred_model_xgb_log_md)}')

print('slf')
print(f'Result (Random Forest): {score(y_test_slf_md.values, y_pred_model_rf_slf_md)}')
print(f'Result (CatBoost) (logloss): {score(y_test_slf_md.values, y_pred_model_cb_slf_md)}')
print(f'Result (XGBoost) (logloss): {score(y_test_slf_md.values, y_pred_model_xgb_slf_md)}')

In [None]:
# Plain accuracy of the boosted models on every feature set.
# The variables carry the `_md` suffix used everywhere else in this notebook;
# no `_zr` variables are defined here.
print('CatBoost')
print(f'Accuracy org: {sklearn.metrics.accuracy_score(y_test_org_md.values, y_pred_model_cb_org_md)}')
print(f'Accuracy all: {sklearn.metrics.accuracy_score(y_test_all_md.values, y_pred_model_cb_all_md)}')
print(f'Accuracy log: {sklearn.metrics.accuracy_score(y_test_log_md.values, y_pred_model_cb_log_md)}')
print(f'Accuracy slf: {sklearn.metrics.accuracy_score(y_test_slf_md.values, y_pred_model_cb_slf_md)}')
print('XGBooost')
print(f'Accuracy org: {sklearn.metrics.accuracy_score(y_test_org_md.values, y_pred_model_xgb_org_md)}')
print(f'Accuracy all: {sklearn.metrics.accuracy_score(y_test_all_md.values, y_pred_model_xgb_all_md)}')
print(f'Accuracy log: {sklearn.metrics.accuracy_score(y_test_log_md.values, y_pred_model_xgb_log_md)}')
print(f'Accuracy slf: {sklearn.metrics.accuracy_score(y_test_slf_md.values, y_pred_model_xgb_slf_md)}')