In [2]:
import pandas as pd
import numpy as np
from usfull_tools import load_DS
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
from sklearn.model_selection import train_test_split
pd.options.display.max_columns = None
%matplotlib inline

from set_vars import KAGGLE_PREFIX, debug_mode, KAGGLE_DIR, target_column, target_type

# Load the prepared dataset; only the labelled training part is needed here,
# so the test part is dropped right away.
train, test = load_DS(debug_mode, KAGGLE_DIR, KAGGLE_PREFIX, '_prepare.csv')
del test

# Hold out 30% of the rows for evaluation (fixed seed for reproducibility).
feature_columns = train.columns.drop(target_column)
X_train, X_test, y_train, y_test = train_test_split(
    train[feature_columns],
    train[target_column],
    test_size=0.3,
    random_state=42,
)
X_train.shape

(623, 449)

## Column name suffixes:
-  _nan - original columns with NaN
-  _nan_median, _nan_min, _nan_max, _nan0 - NaN replaced by the median, min, max or 0, respectively
-  _idxmax - NaN replaced by the most frequent value
-  _isnull - is-null flag
-  _dummie - dummy variable for a categorical column

In [3]:
# Number of boosting rounds grows with the feature count. np.round returns a
# float, while CatBoost documents `iterations` as an integer, so cast explicitly.
iterations = int(np.round(10 + len(X_train.columns) / 20))

if target_type == 'binary':
    model = CatBoostClassifier(random_seed=42, iterations=iterations, depth=2, learning_rate=0.1,
                               loss_function='CrossEntropy', eval_metric='Accuracy')
    params = ''
elif target_type == 'interval':
    model = CatBoostRegressor()
    params = ''
else:
    # Fail here with a clear message instead of a NameError on `model` below.
    raise ValueError("Unsupported target_type: %r (expected 'binary' or 'interval')" % (target_type,))

#https://tech.yandex.com/catboost/doc/dg/concepts/python-reference_parameters-list-docpage/#python-reference_parameters-list

# CatBoost requires categorical features to be specified explicitly:
# collect the positional indices of all object-typed columns.
cat_features = [position for position, dtype in enumerate(X_train.dtypes) if dtype == 'object']

model.fit(X_train, y_train, cat_features=cat_features)

from sklearn.metrics import mean_absolute_error, accuracy_score
predictions = model.predict(X_test)
if target_type == 'binary':
    # sklearn metrics take (y_true, y_pred) in that order.
    print("accuracy: %.3f"
          % accuracy_score(y_test, predictions))
else:
    # Accuracy is undefined for a continuous target; report MAE for the regressor.
    print("MAE: %.3f"
          % mean_absolute_error(y_test, predictions))


# Feature importance evaluated on the hold-out part, one score per column.
eval_pool = Pool(X_test, label=y_test, cat_features=cat_features)
feature_importance = pd.DataFrame({
    'Feature': X_test.columns,
    'Score': model.get_feature_importance(eval_pool),
})

# Most important features first.
feature_importance = feature_importance.sort_values(by='Score', ascending=False)



#TODO: pick the best feature out of each group of correlated features

0:	learn: 0.7784912	total: 192ms	remaining: 5.95s
1:	learn: 0.7656501	total: 218ms	remaining: 3.27s
2:	learn: 0.7656501	total: 245ms	remaining: 2.37s
3:	learn: 0.7977528	total: 271ms	remaining: 1.9s
4:	learn: 0.7865169	total: 298ms	remaining: 1.61s
5:	learn: 0.7849117	total: 325ms	remaining: 1.41s
6:	learn: 0.7977528	total: 367ms	remaining: 1.31s
7:	learn: 0.7977528	total: 421ms	remaining: 1.26s
8:	learn: 0.8057785	total: 456ms	remaining: 1.16s
9:	learn: 0.8073836	total: 483ms	remaining: 1.06s
10:	learn: 0.8138042	total: 507ms	remaining: 969ms
11:	learn: 0.8089888	total: 533ms	remaining: 888ms
12:	learn: 0.8154093	total: 559ms	remaining: 817ms
13:	learn: 0.8154093	total: 586ms	remaining: 753ms
14:	learn: 0.8138042	total: 633ms	remaining: 718ms
15:	learn: 0.8186196	total: 676ms	remaining: 676ms
16:	learn: 0.8218299	total: 715ms	remaining: 631ms
17:	learn: 0.8298555	total: 748ms	remaining: 582ms
18:	learn: 0.8282504	total: 781ms	remaining: 535ms
19:	learn: 0.8314607	total: 809ms	remainin

In [6]:
# Retain only the features whose importance score is above 1.
# NOTE: the printed label says 'non zero' although the cutoff used here is 1.
fi = feature_importance.loc[feature_importance['Score'] > 1]
kept_score_share = np.round(fi['Score'].sum(), 1)
print(len(X_train.columns), '->', len(fi.index), 'non zero important features:', kept_score_share, '%')
fi.sort_values(by='Score', ascending=False)

449 -> 24 non zero important features: 87.7 %


Unnamed: 0,Feature,Score
162,mult_Age_nan_median_with_Sex_dummie_male,15.789627
20,Sex_dummie_female,11.19499
59,sum_Pclass_with_Embarked_nan_dummie_Q,9.430886
302,mult_Fare_nan_min_with_Sex_dummie_male,8.330851
403,sum_Sex_dummie_male_with_Embarked_idxmax_dummie_Q,5.215847
53,sum_Pclass_with_Sex_dummie_male,5.158318
361,sum_Cabin_isnull_with_Sex_dummie_male,3.679292
387,sum_Sex_dummie_female_with_Embarked_idxmax_dum...,3.58962
61,sum_Pclass_with_Embarked_nan_dummie_S,3.255525
362,mult_Cabin_isnull_with_Sex_dummie_male,3.020617


In [7]:
# Write the filtered importance table to
# <KAGGLE_DIR><KAGGLE_PREFIX>_important_columns.csv, without the index column.
important_columns_file = KAGGLE_PREFIX + '_important_columns.csv'
fi.to_csv(KAGGLE_DIR + important_columns_file, index=False)