In [1]:
import pandas as pd
import numpy as np
from usfull_tools import load_DS
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
from sklearn.model_selection import train_test_split
pd.options.display.max_columns = None
%matplotlib inline

from set_vars import KAGGLE_PREFIX, debug_mode, KAGGLE_DIR, target_column, target_type, loss_function, custom_metric

# Load both parts via load_DS (files carry the '_prepare.csv' suffix); only the
# training part is needed in this notebook, so the test part is dropped right away.
train, test = load_DS(debug_mode, KAGGLE_DIR, KAGGLE_PREFIX, '_prepare.csv')
del test

# Hold out 30% of the rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(target_column, axis=1),
    train[target_column],
    test_size=0.3,
    random_state=42,
)
X_train.shape

(1022, 11702)

## Column prefixes:
-  `_nan__median`, `_nan__min`, `_nan__max`, `_nan__0` - NaN replaced by the median, min, max, or 0, respectively
-  `_idxmax` - NaN replaced by the most frequent value
-  `_isnull` - is-null flag
-  `_dummie` - dummy variable for a categorical column

In [None]:
# Scale the number of boosting rounds with the feature count.
# np.round returns a float, while CatBoost documents `iterations` as an int,
# so the value is converted explicitly.
iterations = int(np.round(10 + len(X_train.columns) / 20))

# NOTE: loss_function and custom_metric imported from set_vars are overridden
# below according to target_type; later code branches on custom_metric.
if target_type == 'binary':
    loss_function = 'CrossEntropy'
    custom_metric = 'Accuracy'
    model = CatBoostClassifier(random_seed=42, iterations=iterations, depth=2, learning_rate=0.1,
                               loss_function=loss_function, custom_metric=custom_metric)
elif target_type == 'interval':
    loss_function = 'RMSE'
    custom_metric = 'RMSE'
    model = CatBoostRegressor(random_seed=42, iterations=iterations, depth=12, learning_rate=0.1,
                              loss_function=loss_function, custom_metric=custom_metric)
else:
    # Fail early with a clear message instead of a NameError on `model` later.
    raise ValueError("Unsupported target_type: %r (expected 'binary' or 'interval')" % (target_type,))
    
#https://tech.yandex.com/catboost/doc/dg/concepts/python-reference_parameters-list-docpage/#python-reference_parameters-list

# CatBoost requires categorical variables to be specified explicitly:
# collect the positional indices of all object-dtype columns.
cat_features = [
    position
    for position, column_dtype in enumerate(X_train.dtypes)
    if column_dtype == 'object'
]

model.fit(X_train, y_train, cat_features=cat_features)

from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score

# Predict on the hold-out split once and reuse the result for whichever metric applies.
# sklearn metrics take their arguments as (y_true, y_pred).
y_pred = model.predict(X_test)

if custom_metric == 'Accuracy':
    print("Accuracy: %.3f" % accuracy_score(y_test, y_pred))

if custom_metric == 'RMSE':
    # Report a real RMSE (square root of the mean squared error) so the value
    # matches the printed label.
    print("RMSE: %.3f" % np.sqrt(mean_squared_error(y_test, y_pred)))


# Feature importances computed against the hold-out pool, paired with column names.
holdout_pool = Pool(X_test, label=y_test, cat_features=cat_features)
importance_scores = model.get_feature_importance(holdout_pool)
feature_importance = pd.DataFrame(list(zip(X_test.columns, importance_scores)),
                                  columns=['Feature', 'Score'])

# Order the table so the highest-scoring features come first.
feature_importance = feature_importance.sort_values(by='Score', ascending=False)



# TODO: pick the best one out of several correlated parameters (features)



0:	learn: 181172.9793865	total: 5m 2s	remaining: 2d 1h 56m 19s
1:	learn: 166627.7677691	total: 15m 52s	remaining: 3d 6h 25m 10s
2:	learn: 152357.4687023	total: 15m 52s	remaining: 2d 4h 14m 10s
3:	learn: 139588.1993795	total: 18m 22s	remaining: 1d 21h 15m 23s
4:	learn: 128334.2020281	total: 23m 6s	remaining: 1d 21h 25m 58s
5:	learn: 117697.7258518	total: 23m 7s	remaining: 1d 13h 50m 49s
6:	learn: 108422.2014224	total: 25m 29s	remaining: 1d 11h 40m 45s
7:	learn: 99990.5290126	total: 25m 36s	remaining: 1d 7h 18m 47s
8:	learn: 92420.5131367	total: 27m 50s	remaining: 1d 6h 12m 43s
9:	learn: 85448.6429708	total: 36m 42s	remaining: 1d 11h 47m 15s
10:	learn: 79147.1908591	total: 36m 49s	remaining: 1d 8h 34m 40s
11:	learn: 73657.0558834	total: 45m 59s	remaining: 1d 13h 14m 4s
12:	learn: 68741.5559212	total: 56m 34s	remaining: 1d 18h 13m 1s
13:	learn: 64293.6518635	total: 1h 6m 5s	remaining: 1d 21h 43m 6s


In [None]:
# Keep only the features whose importance score is above 1
# (i.e. more than 1% when the scores are normalized to percentages).
is_important = feature_importance['Score'] > 1
fi = feature_importance.loc[is_important]
total_score = np.round(fi['Score'].sum(), 1)
print(len(X_train.columns), '->', len(fi.index), 'non zero important features:', total_score, '%')
fi.sort_values(by='Score', ascending=False)

In [7]:
# Save the selected features (name and score) to CSV, without the index column.
important_columns_path = KAGGLE_DIR + KAGGLE_PREFIX + '_important_columns.csv'
fi.to_csv(important_columns_path, index=False)