In [1]:
import pandas as pd
import numpy as np
from usfull_tools import load_DS
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
from sklearn.model_selection import train_test_split
pd.options.display.max_columns = None
%matplotlib inline

from set_vars import KAGGLE_PREFIX, debug_mode, KAGGLE_DIR, target_column, target_type, loss_function, custom_metric

# load_DS returns a (train, test) pair; only the train frame is used in this
# notebook, so the test frame is dropped right away.
train, test = load_DS(debug_mode, KAGGLE_DIR, KAGGLE_PREFIX, '_prepare.csv')
del test

# Separate the predictors from the target column.
feature_columns = train.columns.drop(target_column)
features = train[feature_columns]
target = train[target_column]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=42)
X_train.shape

(1022, 1853)

## Column prefixes:
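-  _nan__median, _nan__min, _nan__max, _nan__0 - NaN replaced by the median, minimum, maximum, or 0, respectively
-  _idxmax - NaN replaced by the most frequent value
-  _isnull - flag indicating that the original value was null
-  _dummie - dummy (indicator) variable for a categorical column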

In [None]:
# Number of boosting rounds: 10 plus one round per 20 feature columns.
# np.round returns a float, while `iterations` is an integer parameter,
# so the value is cast explicitly.
iterations = int(np.round(10 + len(X_train.columns) / 20))
print('iterations :', iterations)

# NOTE: loss_function / custom_metric imported from set_vars are overridden
# below according to target_type.
# NOTE(review): od_type='Iter' presumably only takes effect when an eval_set
# is passed to fit() - confirm against the CatBoost documentation.
if target_type == 'binary':
    loss_function = 'CrossEntropy'
    custom_metric = 'Accuracy'
    # Pass the chosen custom_metric (previously loss_function was passed twice,
    # so the 'Accuracy' metric selected above was never used by the model).
    model = CatBoostClassifier(random_seed=42, iterations=iterations, depth=2, learning_rate=0.1,
                               loss_function=loss_function, custom_metric=custom_metric, od_type='Iter')
elif target_type == 'interval':
    loss_function = 'RMSE'
    custom_metric = 'RMSE'
    model = CatBoostRegressor(random_seed=42, iterations=iterations, depth=12, learning_rate=0.1,
                              loss_function=loss_function, custom_metric=custom_metric, od_type='Iter')
else:
    # Fail here with a clear message instead of a NameError on `model` later.
    raise ValueError("Unsupported target_type %r (expected 'binary' or 'interval')" % (target_type,))

#https://tech.yandex.com/catboost/doc/dg/concepts/python-reference_parameters-list-docpage/#python-reference_parameters-list

# CatBoost requires the categorical variables to be specified explicitly:
# collect the positional indices of all object-dtype columns.
cat_features = [
    position
    for position, column in enumerate(X_train.columns)
    if X_train[column].dtype == 'object'
]

model.fit(X_train, y_train, cat_features=cat_features)

from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score

# Predict once on the hold-out split and reuse the result for whichever
# metric is reported below.
test_predictions = model.predict(X_test)

if custom_metric == 'Accuracy':
    # sklearn metrics take (y_true, y_pred) in that order.
    print("Accuracy: %.3f"
          % accuracy_score(y_test, test_predictions))

if custom_metric == 'RMSE':
    # RMSE is the square root of the mean squared error. The previous code
    # printed the mean absolute error under the "RMSE" label.
    print("RMSE: %.3f"
          % np.sqrt(mean_squared_error(y_test, test_predictions)))


# Compute one importance score per feature, using the hold-out split as the
# evaluation pool.
holdout_pool = Pool(X_test, label=y_test, cat_features=cat_features)
importance_scores = model.get_feature_importance(holdout_pool)

# Pair each column name with its score and order the table by descending score.
feature_importance = pd.DataFrame(
    list(zip(X_test.columns, importance_scores)),
    columns=['Feature', 'Score']
).sort_values(by='Score', ascending=False)

# TODO: pick the best one out of each group of correlated features

iterations : 103.0




0:	learn: 180439.5976178	total: 1m 42s	remaining: 2h 54m 51s
1:	learn: 165536.1864331	total: 3m 25s	remaining: 2h 52m 32s
2:	learn: 152182.0767863	total: 5m 4s	remaining: 2h 49m 17s
3:	learn: 139274.7457142	total: 5m 17s	remaining: 2h 10m 48s
4:	learn: 127661.1173537	total: 5m 29s	remaining: 1h 47m 32s
5:	learn: 117464.7138453	total: 7m 15s	remaining: 1h 57m 17s
6:	learn: 108236.9276810	total: 9m 14s	remaining: 2h 6m 47s
7:	learn: 100288.4996062	total: 11m 12s	remaining: 2h 13m
8:	learn: 92426.0671813	total: 11m 15s	remaining: 1h 57m 33s
9:	learn: 85589.4288257	total: 11m 28s	remaining: 1h 46m 46s
10:	learn: 79987.3159165	total: 13m 36s	remaining: 1h 53m 48s


In [None]:
# Keep only the features whose importance score is above 1
# (presumably scores are normalised to percentages - confirm).
is_important = feature_importance['Score'] > 1
fi = feature_importance.loc[is_important]

# Report how many columns survive the filter and their combined score.
print(len(X_train.columns), '->', len(fi.index), 'non zero important features:', np.round(fi['Score'].sum(), 1), '%')

# Display the retained features, highest score first.
fi.sort_values(by='Score', ascending=False)

In [None]:
# Save the retained features and their scores to a CSV file, without the row index.
important_columns_path = KAGGLE_DIR + KAGGLE_PREFIX + '_important_columns.csv'
fi.to_csv(important_columns_path, index=False)