In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from catboost import CatBoostClassifier
import xgboost
import joblib
from joblib import dump, load
import gc
from sklearn.preprocessing import OneHotEncoder
import sklearn.metrics
from sklearn.metrics import accuracy_score

In [None]:
train_flag = True # Set to True to train the models in this run; set to False to load previously trained models from disk

# 1. Смотрим на данные

In [None]:
# Location of the semicolon-separated training table.
# NOTE(review): this is an absolute, machine-specific path; adjust it to your environment.
TRAIN_CSV_PATH = r"D:\for ML\HW-hack\train_hw.csv"
data = pd.read_csv(TRAIN_CSV_PATH, sep=';')

In [None]:
# Column dtypes and non-null counts of the raw table.
data.info()

Есть численные и категориальные признаки

In [None]:
# Number of missing values in each column.
data.isna().sum()

Много пропущенных значений для некоторых признаков

In [None]:
# Draw a five-panel log plot (one panel per curve) for the first six wells.
# The curves are kept in a list, not a set: a set has no defined order, so the
# panel order would change from one kernel run to the next.
log_curves = ['CALI', 'RDEP', 'GR', 'PEF', 'SP']
for w in data['WELL'].unique()[:6]:
    well_data = data[data['WELL'] == w]
    fig, axs = plt.subplots(1, len(log_curves), figsize=(8, 10), sharey=True)
    fig.suptitle(str(w))
    for panel, col in zip(axs, log_curves):
        panel.plot(well_data[col], well_data['DEPTH_MD'], color="#000000")
        panel.set_xlabel(col)
    # The y axis is shared, so setting the limits on the first panel applies to all of them.
    # Passing (last, first) reverses the axis relative to the row order
    # (presumably rows are sorted by increasing depth - confirm).
    axs[0].set_ylim(well_data['DEPTH_MD'].iloc[-1], well_data['DEPTH_MD'].iloc[0])

Вид каротажных кривых для нескольких скважин:
1. Могут быть пропущены значения как части кривой, так и её целиком
2. Имеются сильные выбросы (это необязательно вредные аномалии)

In [None]:
# Number of distinct wells (a missing well name, if any, is counted as one value).
data['WELL'].nunique(dropna=False)

In [None]:
# For every feature, count the wells in which it is missing entirely
# (the per-well mean is NaN only when the well has no value at all).
# numeric_only=True is required: the frame also holds string columns, and recent
# pandas versions raise instead of silently dropping them.
data.groupby('WELL').mean(numeric_only=True).isna().sum()

Для многих скважин полностью отсутствуют некоторые признаки

In [None]:
# Share of every lithology class in the data set, in percent (two decimals).
# value_counts() replaces the original nested Python loop over rows x classes;
# reindexing by unique() keeps the classes in first-appearance order.
lithology_codes = data['FORCE_2020_LITHOFACIES_LITHOLOGY'].unique()
class_counts = data['FORCE_2020_LITHOFACIES_LITHOLOGY'].value_counts().reindex(lithology_codes)
lithology_percent = np.round((class_counts.to_numpy() / len(data)) * 100, 2)
# code -> percent
lithology_percent = dict(zip(lithology_codes, lithology_percent))
# code -> human-readable lithology name
lithology = dict(zip([30000, 65030, 65000, 80000, 74000, 70000, 70032, 88000, 86000, 99000, 90000, 93000],
           ['Sandstone', 'Sandstone/Shale', 'Shale', 'Marl','Dolomite', 'Limestone', 'Chalk', 'Halite', 'Anhydrite',
            'Tuff', 'Coal', 'Basement']))
# name -> percent, same order as lithology_percent
lithology_name_percent = {lithology[code]: percent for code, percent in lithology_percent.items()}

In [None]:
# Bar chart of the class shares computed above.
# plt.subplots() returns (figure, axes); unpack it so `ax` really is the Axes.
fig, ax = plt.subplots(figsize=(20, 10))
# x / y must be passed by keyword: recent seaborn releases reject positional data arguments.
sns.barplot(x=list(lithology_name_percent.keys()), y=list(lithology_name_percent.values()), color='blue', ax=ax)
ax.set_ylabel("Процент литологического класса, %");

In [None]:
# Lithology codes replaced by their names.
data_f_l = data['FORCE_2020_LITHOFACIES_LITHOLOGY'].map(lithology)
# Two-column frame (lithology name, label confidence), aligned on the row index.
data_copy = pd.concat([data_f_l, data['FORCE_2020_LITHOFACIES_CONFIDENCE']], axis=1)
# Class histogram, coloured by label confidence.
sns.displot(
    data=data_copy,
    x='FORCE_2020_LITHOFACIES_LITHOLOGY',
    hue='FORCE_2020_LITHOFACIES_CONFIDENCE',
    height=9,
    aspect=2,
)

Присутствует сильный дисбаланс классов

In [None]:
# Pairwise correlation of all columns except the three string-valued ones.
# plt.subplots() returns (figure, axes); unpack it and draw on that Axes explicitly.
fig, ax = plt.subplots(figsize=(20, 15))
feature_corr = data.drop(columns=['WELL', 'GROUP', 'FORMATION']).corr()
sns.heatmap(feature_corr, annot=True, vmin=-1, vmax=1, center=0, cmap='coolwarm', fmt='.1g', ax=ax);

1. Нет сильной корреляции признаков с таргетом (нет обязательных к присутствию признаков)
2. Многие признаки коррелируют между собой

# 2. Обработка данных

### 2.1 Выбор признаков

In [None]:
# Four candidate feature sets; each keeps the target and its confidence column.
org_columns = ['WELL', 'DEPTH_MD', 'CALI', 'RSHA', 'RMED', 'RDEP', 'RHOB', 'GR', 'NPHI', 'PEF', 'DTC', 'SP', 'BS',
               'FORCE_2020_LITHOFACIES_LITHOLOGY', 'FORCE_2020_LITHOFACIES_CONFIDENCE']
slf_columns = ['WELL', 'FORMATION', 'DEPTH_MD', 'DRHO', 'DTC', 'SP', 'GR', 'RDEP',
               'FORCE_2020_LITHOFACIES_LITHOLOGY', 'FORCE_2020_LITHOFACIES_CONFIDENCE']

# "org": hand-picked subset of columns.
data_org = data[org_columns]
# "all": every column (this is the same object as `data`, not a copy).
data_all = data
# "log": everything except depth, coordinates, GROUP/FORMATION and mud weight.
data_log = data.drop(columns=['DEPTH_MD', 'X_LOC', 'Y_LOC', 'Z_LOC', 'GROUP', 'FORMATION', 'MUDWEIGHT'])
# "slf": a second hand-picked subset that also keeps FORMATION.
data_slf = data[slf_columns]

### 2.2 Восстановление (импутация) значений

In [None]:
# GROUP and FORMATION gaps are filled with the most frequent value of the column;
# every other gap is filled with zero.
values = {name: data[name].mode().iloc[0] for name in ('GROUP', 'FORMATION')}

data_org_fill_zr = data_org.fillna(0)
data_all_fill_zr = data_all.fillna(value=values).fillna(0)
data_log_fill_zr = data_log.fillna(0)
data_slf_fill_zr = data_slf.fillna(value=values).fillna(0)

### 2.3 Подготовка данных к обучению

In [None]:
# Standardise the numeric columns of each feature set in place.
# The single scaler is re-fitted for every frame, so afterwards it holds the "slf" fit.
# NOTE(review): the scaler is fitted before the train/test split, so test rows
# contribute to the statistics - confirm this is acceptable.
scaler = StandardScaler()

org_scaled_cols = ['CALI', 'RSHA', 'RMED', 'RDEP', 'RHOB', 'GR', 'NPHI', 'PEF', 'DTC', 'SP', 'BS']
all_scaled_cols = ['X_LOC', 'Y_LOC', 'Z_LOC', 'CALI', 'RSHA', 'RMED', 'RDEP', 'RHOB', 'GR', 'SGR', 'NPHI', 'PEF',
                   'DTC', 'SP', 'BS', 'ROP', 'DTS', 'DCAL', 'DRHO', 'MUDWEIGHT', 'RMIC',
                   'ROPA', 'RXO']
log_scaled_cols = ['CALI', 'RSHA', 'RMED', 'RDEP', 'RHOB', 'GR', 'SGR', 'NPHI',
                   'PEF', 'DTC', 'SP', 'BS', 'ROP', 'DTS', 'DCAL', 'DRHO', 'RMIC', 'ROPA',
                   'RXO']
slf_scaled_cols = ['DRHO', 'DTC', 'SP', 'GR', 'RDEP']

for frame, scaled_cols in (
        (data_org_fill_zr, org_scaled_cols),
        (data_all_fill_zr, all_scaled_cols),
        (data_log_fill_zr, log_scaled_cols),
        (data_slf_fill_zr, slf_scaled_cols)):
    frame[scaled_cols] = scaler.fit_transform(frame[scaled_cols])

#### 2.3.3 Разбиение на фичи и таргет + работа с таргетом

In [None]:
# Separate the feature matrices from the target for every feature set.
target_col = 'FORCE_2020_LITHOFACIES_LITHOLOGY'
confidence_col = 'FORCE_2020_LITHOFACIES_CONFIDENCE'

# "org" and "log" drop WELL; "all" and "slf" keep it as a feature.
X_org_zr = data_org_fill_zr.drop(columns=['WELL', target_col, confidence_col])
X_all_zr = data_all_fill_zr.drop(columns=[target_col, confidence_col])
X_log_zr = data_log_fill_zr.drop(columns=['WELL', target_col, confidence_col])
X_slf_zr = data_slf_fill_zr.drop(columns=[target_col, confidence_col])

y_org_zr = data_org_fill_zr[target_col]
y_all_zr = data_all_fill_zr[target_col]
y_log_zr = data_log_fill_zr[target_col]
y_slf_zr = data_slf_fill_zr[target_col]

In [None]:
# Lithology codes in the order that defines their class index (0..11).
_lithology_code_order = (30000, 65030, 65000, 80000, 74000, 70000,
                         70032, 88000, 86000, 99000, 90000, 93000)
# code -> consecutive class index
lithology_numbers = {code: index for index, code in enumerate(_lithology_code_order)}

In [None]:
# Encode the lithology codes as consecutive class indices.
# The mapping reads from the source frames rather than from y_* itself, so the cell
# can be re-run safely (mapping already-encoded labels again would turn them into NaN).
y_org_zr = data_org_fill_zr['FORCE_2020_LITHOFACIES_LITHOLOGY'].map(lithology_numbers)
y_all_zr = data_all_fill_zr['FORCE_2020_LITHOFACIES_LITHOLOGY'].map(lithology_numbers)
y_log_zr = data_log_fill_zr['FORCE_2020_LITHOFACIES_LITHOLOGY'].map(lithology_numbers)
y_slf_zr = data_slf_fill_zr['FORCE_2020_LITHOFACIES_LITHOLOGY'].map(lithology_numbers)

#### 2.3.4 Разбиение на тренировочный и тестовый сеты

In [None]:
# 70/30 stratified split with a fixed seed, applied identically to every feature set.
split_options = dict(train_size=0.70, random_state=42)

X_train_org_zr, X_test_org_zr, y_train_org_zr, y_test_org_zr = train_test_split(
    X_org_zr, y_org_zr, stratify=y_org_zr, **split_options)
X_train_all_zr, X_test_all_zr, y_train_all_zr, y_test_all_zr = train_test_split(
    X_all_zr, y_all_zr, stratify=y_all_zr, **split_options)
X_train_log_zr, X_test_log_zr, y_train_log_zr, y_test_log_zr = train_test_split(
    X_log_zr, y_log_zr, stratify=y_log_zr, **split_options)
X_train_slf_zr, X_test_slf_zr, y_train_slf_zr, y_test_slf_zr = train_test_split(
    X_slf_zr, y_slf_zr, stratify=y_slf_zr, **split_options)

# 3. Обучение

### 3.1 Настройка моделей

In [None]:
if train_flag:
    # Random forest: 50 trees, balanced class weights, fixed seed, 8 worker jobs.
    rf_settings = dict(n_estimators=50, random_state=42, class_weight='balanced', n_jobs=8)
    model_rf = RandomForestClassifier(**rf_settings)

In [None]:
if train_flag:
    # CatBoost multi-class classifier with a fixed seed; task_type='GPU' is requested.
    cb_settings = dict(loss_function='MultiClass', task_type='GPU', random_seed=42)
    model_cb = CatBoostClassifier(**cb_settings)

### 3.2 Обучение моделей

#### 3.2.1 CatBoost

In [None]:
if train_flag:
    def _fit_save_predict_cb(X_fit, y_fit, X_eval, model_path, **fit_kwargs):
        """Fit `model_cb`, save it to `model_path`, and return (model, test predictions)."""
        fitted = model_cb.fit(X_fit, y_fit, verbose=False, plot=True, **fit_kwargs)
        fitted.save_model(model_path)
        return fitted, fitted.predict(X_eval)

    # The same model_cb object is re-fitted for every feature set; each fit is saved
    # to its own file before the next one starts.
    cst, y_pred_model_cb_org_zr = _fit_save_predict_cb(
        X_train_org_zr, y_train_org_zr, X_test_org_zr, 'cst_model_org_zr.dat')

    cat_features = ['WELL', 'GROUP', 'FORMATION']
    cst, y_pred_model_cb_all_zr = _fit_save_predict_cb(
        X_train_all_zr, y_train_all_zr, X_test_all_zr, 'cst_model_all_zr.dat',
        cat_features=cat_features)
    # NOTE(review): the returned importances are neither stored nor displayed here.
    cst.get_feature_importance()

    cst, y_pred_model_cb_log_zr = _fit_save_predict_cb(
        X_train_log_zr, y_train_log_zr, X_test_log_zr, 'cst_model_log_zr.dat')

    cat_features = ['WELL', 'FORMATION']
    cst, y_pred_model_cb_slf_zr = _fit_save_predict_cb(
        X_train_slf_zr, y_train_slf_zr, X_test_slf_zr, 'cst_model_slf_zr.dat',
        cat_features=cat_features)

#### 3.2.2 XGBoost

In [None]:
if train_flag:
    # Booster parameters for the native xgboost.train() API.
    param = {
        'tree_method': 'gpu_hist',
        'num_class': 12,
        'verbosity': 0,
        'objective': 'multi:softmax',
        'eval_metric': 'mlogloss'}
    # 'n_estimators' is a scikit-learn-wrapper name that xgboost.train() does not use:
    # the number of trees is set by num_boost_round (default 10). Passing it inside
    # `param` therefore trained only 10 rounds instead of the intended 300.
    num_boost_round = 300

    # Convert the categorical columns with ONE category set shared by train and test
    # (taken from the full, unsplit frame). Converting each split separately can give the
    # same value different integer codes whenever a value is absent from one split.
    # The frames are modified in place, as in the original cell.
    for train_part, test_part, full_frame, cat_cols in (
            (X_train_all_zr, X_test_all_zr, X_all_zr, ['WELL', 'GROUP', 'FORMATION']),
            (X_train_slf_zr, X_test_slf_zr, X_slf_zr, ['WELL', 'FORMATION'])):
        for cat_col in cat_cols:
            shared_dtype = full_frame[cat_col].astype('category').dtype
            train_part[cat_col] = train_part[cat_col].astype(shared_dtype)
            test_part[cat_col] = test_part[cat_col].astype(shared_dtype)

    # (tag, train X, train y, test X, test y, model file, frame has categorical columns)
    xgb_runs = (
        ('org', X_train_org_zr, y_train_org_zr, X_test_org_zr, y_test_org_zr, 'xgb_model_org_zr.dat', False),
        ('all', X_train_all_zr, y_train_all_zr, X_test_all_zr, y_test_all_zr, 'xgb_model_all_zr.dat', True),
        ('log', X_train_log_zr, y_train_log_zr, X_test_log_zr, y_test_log_zr, 'xgb_model_log_zr.dat', False),
        ('slf', X_train_slf_zr, y_train_slf_zr, X_test_slf_zr, y_test_slf_zr, 'xgb_model_slf_zr.dat', True),
    )
    xgb_predictions = {}
    for tag, X_fit, y_fit, X_eval, y_eval, model_path, use_categorical in xgb_runs:
        dtrain = xgboost.DMatrix(X_fit, label=y_fit, enable_categorical=use_categorical)
        dtest = xgboost.DMatrix(X_eval, label=y_eval, enable_categorical=use_categorical)
        bst = xgboost.train(param, dtrain, num_boost_round=num_boost_round)
        joblib.dump(bst, model_path)
        # Drop the trained booster and reload it from disk to release GPU memory.
        # `del` + gc.collect() replaces the manual bst.__del__() calls.
        del bst
        gc.collect()
        bst = joblib.load(model_path)
        xgb_predictions[tag] = bst.predict(dtest)
        del bst
        gc.collect()
        print(xgb_predictions[tag])

    y_pred_model_xgb_org_zr = xgb_predictions['org']
    y_pred_model_xgb_all_zr = xgb_predictions['all']
    y_pred_model_xgb_log_zr = xgb_predictions['log']
    y_pred_model_xgb_slf_zr = xgb_predictions['slf']

#### 3.2.3 Random Forest

##### 3.2.3.1 OneHotEncoder

In [None]:
if train_flag:
    # One-hot encode the categorical columns of the "slf" and "all" feature sets.
    # The encoder is fitted on the TRAIN split only and then applied to both splits, so
    # train and test always get the same columns in the same order (values that occur
    # only in test become all-zero rows via handle_unknown='ignore'). Fitting a second
    # encoder on the test split can produce a different number or order of columns.
    ohe_frames = {}
    for tag, X_fit, X_eval, cat_cols in (
            ('slf', X_train_slf_zr, X_test_slf_zr, ['WELL', 'FORMATION']),
            ('all', X_train_all_zr, X_test_all_zr, ['WELL', 'FORMATION', 'GROUP'])):
        enc = OneHotEncoder(handle_unknown='ignore')
        enc.fit(X_fit[cat_cols])
        for split, frame in (('train', X_fit), ('test', X_eval)):
            encoded = pd.DataFrame(enc.transform(frame[cat_cols]).toarray())
            # String column names: newer scikit-learn versions reject frames whose
            # column names mix integers and strings.
            encoded.columns = [f'ohe_{position}' for position in range(encoded.shape[1])]
            remaining = frame.drop(columns=cat_cols).reset_index(drop=True)
            # Encoded columns first, then the remaining features, aligned by row position.
            ohe_frames[tag, split] = pd.concat([encoded, remaining], axis=1)

    X_train_slf_zr_enc = ohe_frames['slf', 'train']
    X_test_slf_zr_enc = ohe_frames['slf', 'test']
    X_train_all_zr_enc = ohe_frames['all', 'train']
    X_test_all_zr_enc = ohe_frames['all', 'test']

#####  3.2.3.2 Training

In [None]:
if train_flag:
    def _fit_predict_dump_rf(X_fit, y_fit, X_eval, model_path):
        """Fit `model_rf`, predict on `X_eval`, dump the model; return (model, predictions)."""
        forest = model_rf.fit(X_fit, y_fit)
        predictions = forest.predict(X_eval)
        joblib.dump(forest, model_path)
        return forest, predictions

    # The same model_rf object is re-fitted for every feature set; "all" and "slf"
    # use the one-hot encoded frames.
    rf, y_pred_model_rf_org_zr = _fit_predict_dump_rf(
        X_train_org_zr, y_train_org_zr, X_test_org_zr, 'rf_model_org_zr.dat')

    rf, y_pred_model_rf_all_zr = _fit_predict_dump_rf(
        X_train_all_zr_enc, y_train_all_zr, X_test_all_zr_enc, 'rf_model_all_zr.dat')

    rf, y_pred_model_rf_log_zr = _fit_predict_dump_rf(
        X_train_log_zr, y_train_log_zr, X_test_log_zr, 'rf_model_log_zr.dat')

    rf, y_pred_model_rf_slf_zr = _fit_predict_dump_rf(
        X_train_slf_zr_enc, y_train_slf_zr, X_test_slf_zr_enc, 'rf_model_slf_zr.dat')

### 3.3 Использование обученных моделей

#### 3.3.1 CatBoost

In [None]:
if not train_flag:
    def _load_cb_and_predict(model_path, X_eval):
        """Load a saved CatBoost model and return (model, predictions on `X_eval`)."""
        loaded = CatBoostClassifier()
        loaded.load_model(model_path)
        return loaded, loaded.predict(X_eval)

    # One saved model per feature set; `cst` ends up holding the last one loaded.
    cst, y_pred_model_cb_org_zr = _load_cb_and_predict('cst_model_org_zr.dat', X_test_org_zr)

    cst, y_pred_model_cb_all_zr = _load_cb_and_predict('cst_model_all_zr.dat', X_test_all_zr)

    cst, y_pred_model_cb_log_zr = _load_cb_and_predict('cst_model_log_zr.dat', X_test_log_zr)

    cst, y_pred_model_cb_slf_zr = _load_cb_and_predict('cst_model_slf_zr.dat', X_test_slf_zr)

#### 3.3.2 XGBoost

In [None]:
if not train_flag:
    # Convert the categorical columns with ONE category set shared by train and test
    # (taken from the full, unsplit frame), so a value always maps to the same integer
    # code. When every value occurs in both splits this yields the same codes as
    # converting each split separately. The frames are modified in place, as before.
    for train_part, test_part, full_frame, cat_cols in (
            (X_train_all_zr, X_test_all_zr, X_all_zr, ['WELL', 'GROUP', 'FORMATION']),
            (X_train_slf_zr, X_test_slf_zr, X_slf_zr, ['WELL', 'FORMATION'])):
        for cat_col in cat_cols:
            shared_dtype = full_frame[cat_col].astype('category').dtype
            train_part[cat_col] = train_part[cat_col].astype(shared_dtype)
            test_part[cat_col] = test_part[cat_col].astype(shared_dtype)

    # Only the test matrices are built: prediction does not need the training DMatrix,
    # which the original cell constructed four times without using it.
    # (tag, test X, test y, model file, frame has categorical columns)
    xgb_predictions = {}
    for tag, X_eval, y_eval, model_path, use_categorical in (
            ('org', X_test_org_zr, y_test_org_zr, 'xgb_model_org_zr.dat', False),
            ('all', X_test_all_zr, y_test_all_zr, 'xgb_model_all_zr.dat', True),
            ('log', X_test_log_zr, y_test_log_zr, 'xgb_model_log_zr.dat', False),
            ('slf', X_test_slf_zr, y_test_slf_zr, 'xgb_model_slf_zr.dat', True)):
        dtest = xgboost.DMatrix(X_eval, label=y_eval, enable_categorical=use_categorical)
        # NOTE(review): joblib.load unpickles the file - only load model files you trust.
        bst = joblib.load(model_path)
        xgb_predictions[tag] = bst.predict(dtest)

    y_pred_model_xgb_org_zr = xgb_predictions['org']
    y_pred_model_xgb_all_zr = xgb_predictions['all']
    y_pred_model_xgb_log_zr = xgb_predictions['log']
    y_pred_model_xgb_slf_zr = xgb_predictions['slf']

#### 3.3.3 Random Forest

##### 3.3.3.1 OneHotEncoder

In [None]:
if not train_flag:
    # One-hot encode the categorical columns of the "slf" and "all" feature sets.
    # The "all" frames are built here as well: the random-forest cell below predicts on
    # X_test_all_zr_enc, which was previously never created on this code path.
    # The encoder is fitted on the TRAIN split only and applied to both splits, so the
    # test columns line up with the train columns (values seen only in test become
    # all-zero rows via handle_unknown='ignore').
    ohe_frames = {}
    for tag, X_fit, X_eval, cat_cols in (
            ('slf', X_train_slf_zr, X_test_slf_zr, ['WELL', 'FORMATION']),
            ('all', X_train_all_zr, X_test_all_zr, ['WELL', 'FORMATION', 'GROUP'])):
        enc = OneHotEncoder(handle_unknown='ignore')
        enc.fit(X_fit[cat_cols])
        for split, frame in (('train', X_fit), ('test', X_eval)):
            encoded = pd.DataFrame(enc.transform(frame[cat_cols]).toarray())
            # String column names: newer scikit-learn versions reject frames whose
            # column names mix integers and strings.
            encoded.columns = [f'ohe_{position}' for position in range(encoded.shape[1])]
            remaining = frame.drop(columns=cat_cols).reset_index(drop=True)
            # Encoded columns first, then the remaining features, aligned by row position.
            ohe_frames[tag, split] = pd.concat([encoded, remaining], axis=1)

    X_train_slf_zr_enc = ohe_frames['slf', 'train']
    X_test_slf_zr_enc = ohe_frames['slf', 'test']
    X_train_all_zr_enc = ohe_frames['all', 'train']
    X_test_all_zr_enc = ohe_frames['all', 'test']

##### 3.3.3.2 Use trained model

In [None]:
if not train_flag:
    def _load_rf_and_predict(model_path, X_eval):
        """Load a saved random forest and return (model, predictions on `X_eval`)."""
        # NOTE(review): joblib.load unpickles the file - only load model files you trust.
        forest = joblib.load(model_path)
        return forest, forest.predict(X_eval)

    # "all" and "slf" are evaluated on the one-hot encoded test frames.
    rf, y_pred_model_rf_org_zr = _load_rf_and_predict('rf_model_org_zr.dat', X_test_org_zr)

    rf, y_pred_model_rf_all_zr = _load_rf_and_predict('rf_model_all_zr.dat', X_test_all_zr_enc)

    rf, y_pred_model_rf_log_zr = _load_rf_and_predict('rf_model_log_zr.dat', X_test_log_zr)

    rf, y_pred_model_rf_slf_zr = _load_rf_and_predict('rf_model_slf_zr.dat', X_test_slf_zr_enc)

# 4. Результаты

### 4.1 Определение кастомной метрики

In [None]:
# 12 x 12 penalty matrix used by score(): the entry A[true_class, predicted_class] is the
# penalty for one sample. The diagonal is zero (a correct prediction costs nothing) and
# the matrix is symmetric.
# NOTE(review): rows/columns presumably follow the class indices of `lithology_numbers`
# (0 = code 30000, ..., 11 = code 93000) - confirm against the source of this matrix.
A=np.array([[0.    , 2.    , 3.5   , 3.    , 3.75  , 3.5   , 3.5   , 4.    , 4.    , 2.5   , 3.875 , 3.25  ],
            [2.    , 0.    , 2.375 , 2.75  , 4.    , 3.75  , 3.75  , 3.875 , 4.    , 3.    , 3.75  , 3.    ],
            [3.5   , 2.375 , 0.    , 2.    , 3.5   , 3.5   , 3.75  , 4.    , 4.    , 2.75  , 3.25  , 3.    ],
            [3.    , 2.75  , 2.    , 0.    , 2.5   , 2.    , 2.25  , 4.    , 4.    , 3.375 , 3.75  , 3.25  ],
            [3.75  , 4.    , 3.5   , 2.5   , 0.    , 2.625 , 2.875 , 3.75  , 3.25  , 3.    , 4.    , 3.625 ],
            [3.5   , 3.75  , 3.5   , 2.    , 2.625 , 0.    , 1.375 , 4.    , 3.75  , 3.5   , 4.    , 3.625 ],
            [3.5   , 3.75  , 3.75  , 2.25  , 2.875 , 1.375 , 0.    , 4.    , 3.75  , 3.125 , 4.    , 3.75  ],
            [4.    , 3.875 , 4.    , 4.    , 3.75  , 4.    , 4.    , 0.    , 2.75  , 3.75  , 3.75  , 4.    ],
            [4.    , 4.    , 4.    , 4.    , 3.25  , 3.75  , 3.75  , 2.75  , 0.    , 4.    , 4.    , 3.875 ],
            [2.5   , 3.    , 2.75  , 3.375 , 3.    , 3.5   , 3.125 , 3.75  , 4.    , 0.    , 2.5   , 3.25  ],
            [3.875 , 3.75  , 3.25  , 3.75  , 4.    , 4.    , 4.    , 3.75  , 4.    , 2.5   , 0.    , 4.    ],
            [3.25  , 3.    , 3.    , 3.25  , 3.625 , 3.625 , 3.75  , 4.    , 3.875 , 3.25  , 4.    , 0.    ]])

In [None]:
def score(y_true, y_pred, penalty_matrix=None):
    """Return the mean negative penalty of the predictions (0 is best, lower is worse).

    Each sample contributes ``-penalty_matrix[true_class, predicted_class]``; the
    contributions are averaged over all samples.

    Parameters
    ----------
    y_true : array-like
        True class indices. Values are cast to int and flattened.
    y_pred : array-like
        Predicted class indices, same number of samples as ``y_true``. Values are cast
        to int and flattened, so a column vector of shape (n, 1) is accepted and still
        yields a scalar result.
    penalty_matrix : array-like, optional
        Square penalty matrix indexed as ``[true, predicted]``. Defaults to the
        notebook-level matrix ``A``.

    Returns
    -------
    float
        The averaged negative penalty.

    Raises
    ------
    ValueError
        If the inputs are empty or contain a different number of samples.
    """
    if penalty_matrix is None:
        penalty_matrix = A
    penalties = np.asarray(penalty_matrix)
    true_idx = np.asarray(y_true).astype(int).ravel()
    pred_idx = np.asarray(y_pred).astype(int).ravel()
    n_samples = true_idx.shape[0]
    if n_samples != pred_idx.shape[0]:
        raise ValueError(
            f'y_true and y_pred must have the same number of samples, '
            f'got {n_samples} and {pred_idx.shape[0]}')
    if n_samples == 0:
        raise ValueError('score() needs at least one sample')
    # One vectorised lookup instead of a Python loop over the samples.
    total_penalty = float(penalties[true_idx, pred_idx].sum())
    return (0.0 - total_penalty) / n_samples

### 4.2 Подсчёт точности

In [None]:
# Custom score of every model on every feature set.
# tag -> (test target, RF predictions, CatBoost predictions, XGBoost predictions)
score_report = {
    'org': (y_test_org_zr, y_pred_model_rf_org_zr, y_pred_model_cb_org_zr, y_pred_model_xgb_org_zr),
    'all': (y_test_all_zr, y_pred_model_rf_all_zr, y_pred_model_cb_all_zr, y_pred_model_xgb_all_zr),
    'log': (y_test_log_zr, y_pred_model_rf_log_zr, y_pred_model_cb_log_zr, y_pred_model_xgb_log_zr),
    'slf': (y_test_slf_zr, y_pred_model_rf_slf_zr, y_pred_model_cb_slf_zr, y_pred_model_xgb_slf_zr),
}
for tag, (y_test, rf_pred, cb_pred, xgb_pred) in score_report.items():
    print(tag)
    print(f'Result (Random Forest): {score(y_test.values, rf_pred)}')
    print(f'Result (CatBoost): {score(y_test.values, cb_pred)}')
    print(f'Result (XGBoost): {score(y_test.values, xgb_pred)}')

In [None]:
print('CatBoost')
print(f'Accuracy org: {sklearn.metrics.accuracy_score(y_test_org_zr.values, y_pred_model_cb_org_zr)}')
print(f'Accuracy all: {sklearn.metrics.accuracy_score(y_test_all_zr.values, y_pred_model_cb_all_zr)}')
print(f'Accuracy log: {sklearn.metrics.accuracy_score(y_test_log_zr.values, y_pred_model_cb_log_zr)}')
print(f'Accuracy slf: {sklearn.metrics.accuracy_score(y_test_slf_zr.values, y_pred_model_cb_slf_zr)}')
print('XGBooost')
print(f'Accuracy org: {sklearn.metrics.accuracy_score(y_test_org_zr.values, y_pred_model_xgb_org_zr)}')
print(f'Accuracy all: {sklearn.metrics.accuracy_score(y_test_all_zr.values, y_pred_model_xgb_all_zr)}')
print(f'Accuracy log: {sklearn.metrics.accuracy_score(y_test_log_zr.values, y_pred_model_xgb_log_zr)}')
print(f'Accuracy slf: {sklearn.metrics.accuracy_score(y_test_slf_zr.values, y_pred_model_xgb_slf_zr)}')