In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score, r2_score

from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [8]:
from catboost import CatBoostRegressor, Pool, metrics, cv

In [5]:
# Seed shared by the train/test split and the CatBoost model so runs are repeatable.
RANDOM_SEED = 42
# Experiment/version counter for this notebook.
VERSION = 16

In [6]:
def mape(y_true, y_pred):
    """Mean absolute percentage error, returned as a fraction (0.1 == 10%).

    Parameters
    ----------
    y_true : array-like
        Ground-truth values. Entries equal to zero produce ``inf``/``nan``
        in the result because each error is divided by the true value.
    y_pred : array-like
        Predicted values, compared with ``y_true`` element by element.

    Returns
    -------
    float
        ``mean(|y_pred - y_true| / |y_true|)``.
    """
    # Coerce to plain arrays: this accepts Python lists/tuples and makes the
    # comparison positional, so two pandas Series with different indexes are
    # not silently re-aligned by label (which would inject NaNs).
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((predicted - actual) / actual))

In [2]:
# Load the prepared dataset; later cells expect `sample` and `price` columns in it.
data = pd.read_csv("data_X2.csv")

In [16]:
# Rows flagged sample == 1 are used for fitting; rows with sample == 0 are the
# ones we have to predict for the submission.
X = data.loc[data['sample'] == 1].drop(columns='sample')
X_sub = data.loc[data['sample'] == 0].drop(columns=['sample', 'price'])

# Separate the target from the feature matrix.
y = X['price']
X = X.drop(columns='price')

# Hold out 20% of the labelled rows for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_SEED,
)

In [9]:
def catboost_predict(X_train, X_test, X_sub, y_train, y_valid=None):
    """Fit a CatBoost regressor on log-targets and predict the submission set.

    The model is trained on ``np.log(y_train)`` and its predictions are mapped
    back with ``np.exp``. The MAPE on the validation set is printed.

    Parameters
    ----------
    X_train : feature matrix used for fitting.
    X_test : feature matrix used as CatBoost's ``eval_set`` and for the
        printed MAPE.
    X_sub : feature matrix to produce the returned predictions for.
    y_train : target values for ``X_train`` (must be positive, since the
        logarithm is taken).
    y_valid : target values for ``X_test``. Optional for backward
        compatibility: when omitted, the notebook-level ``y_test`` is used,
        which is what this function previously read implicitly.

    Returns
    -------
    numpy.ndarray
        Predicted targets for ``X_sub`` on the original (non-log) scale.

    Raises
    ------
    ValueError
        If the validation targets and ``X_test`` have different lengths.
    """
    if y_valid is None:
        # Legacy behaviour: fall back to the global created by the split cell.
        y_valid = y_test
    if len(y_valid) != len(X_test):
        raise ValueError(
            f"validation targets have {len(y_valid)} rows but X_test has {len(X_test)}"
        )

    model = CatBoostRegressor(
        iterations=400,
        random_seed=RANDOM_SEED,
        eval_metric='MAPE',
        custom_metric=['R2', 'MAE'],
        silent=True,
    )
    # NOTE: use_best_model picks the iteration that scores best on eval_set,
    # and the MAPE printed below is measured on that same set, so the printed
    # figure is an optimistic estimate.
    model.fit(
        X_train,
        np.log(y_train),
        eval_set=(X_test, np.log(y_valid)),
        verbose_eval=0,
        use_best_model=True,
    )

    # Undo the log transform applied to the targets.
    predict_test = np.exp(model.predict(X_test))
    predict_submission = np.exp(model.predict(X_sub))

    print(f"Точность модели по метрике MAPE: {(mape(y_valid, predict_test))*100:0.2f}%")
    return predict_submission

In [None]:
%%time
# exhaustive search

# To shorten the search time for this demonstration,
# the algorithm is asked to try every combination of
# 1 to 5 features (see min_features / max_features below).

# if you have access to a multicore or distributed computer
# system you can try more greedy searches

# A deliberately tiny forest (5 trees, depth 2) keeps each candidate subset
# cheap to evaluate; subsets are scored with 2-fold CV on negative MAPE.
efs = EFS(
    RandomForestRegressor(
        n_estimators=5,
        max_depth=2,
        random_state=0,
        n_jobs=-1,
    ),
    min_features=1,
    max_features=5,
    scoring='neg_mean_absolute_percentage_error',
    cv=2,
    print_progress=True,
)

# The selector is fed a bare array, so the chosen features come back as
# positional indices (efs.best_idx_) rather than column names.
efs = efs.fit(np.array(X_train), y_train)



In [12]:
# Translate the selector's positional indices back into column labels.
# Must run before X_train is narrowed to the selected columns, otherwise the
# positions no longer line up with the columns the selector saw.
sel_cols = X_train.columns[[col_pos for col_pos in efs.best_idx_]]

In [13]:
sel_cols

Index(['Полная масса, кг', 'mean_age'], dtype='object')

In [14]:
# Keep only the selected features in all three feature matrices.
# This overwrites the full-width frames; re-run the split cell to restore them.
X_train, X_test, X_sub = (
    X_train[sel_cols],
    X_test[sel_cols],
    X_sub[sel_cols],
)

In [15]:
%%time
# Train on the reduced feature set and predict prices for the submission rows.
predict_submission = catboost_predict(
    X_train=X_train,
    X_test=X_test,
    X_sub=X_sub,
    y_train=y_train,
)

Точность модели по метрике MAPE: 22.95%
Wall time: 3.75 s
