In [3]:
import pandas as pd
import numpy as np
from usfull_tools import load_DS
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
from sklearn.model_selection import train_test_split
pd.options.display.max_columns = None
%matplotlib inline

from set_vars import KAGGLE_PREFIX, debug_mode, KAGGLE_DIR, target_column, target_type

# Load the prepared dataset; the second returned frame is not needed here
# and is dropped right away.
train, test = load_DS(debug_mode, KAGGLE_DIR, KAGGLE_PREFIX, '_prepare.csv')
del test

# Every column except the target is a predictor; hold out 30% of the rows
# with a fixed seed so the split is reproducible.
feature_columns = train.columns.drop(target_column)
X_train, X_test, y_train, y_test = train_test_split(
    train[feature_columns],
    train[target_column],
    test_size=0.3,
    random_state=42,
)
X_train.shape

(623, 449)

## Column suffixes:
-  `_nan` - original columns that contain NaN
-  `_nan_median`, `_nan_min`, `_nan_max`, `_nan_0` - NaN replaced by the median, min, max, or 0, respectively
-  `_idxmax` - NaN replaced by the most frequent value
-  _isnull - is null flag
-  `_dummie` - dummy (one-hot) variable for a categorical column

In [5]:
from sklearn.metrics import mean_absolute_error, accuracy_score

# The number of boosting rounds grows with the number of features.
# np.round returns a float, while CatBoost expects an integer count,
# so cast explicitly.
iterations = int(np.round(10 + len(X_train.columns) / 20))

if target_type == 'binary':
    model = CatBoostClassifier(random_seed=42, iterations=iterations, depth=2, learning_rate=0.1,
                               loss_function='CrossEntropy', eval_metric='Accuracy')
    params = ''
elif target_type == 'interval':
    model = CatBoostRegressor()
    params = ''
else:
    # Fail here with a clear message instead of a NameError on `model` below.
    raise ValueError("Unsupported target_type: %r (expected 'binary' or 'interval')" % (target_type,))

#https://tech.yandex.com/catboost/doc/dg/concepts/python-reference_parameters-list-docpage/#python-reference_parameters-list

# CatBoost requires categorical features to be specified explicitly:
# collect the positional indices of all object-dtype columns.
cat_features = [position for position, dtype in enumerate(X_train.dtypes)
                if dtype == 'object']

model.fit(X_train, y_train, cat_features=cat_features)

# Score on the hold-out split with a metric that matches the task:
# accuracy_score only accepts class labels, so the regressor is scored with MAE.
y_pred = model.predict(X_test)
if target_type == 'binary':
    print("accuracy: %.3f"
          % accuracy_score(y_test, y_pred))
else:
    print("MAE: %.3f"
          % mean_absolute_error(y_test, y_pred))

# One row per input column with the importance reported by the fitted model.
feature_importance = pd.DataFrame({'columns': X_test.columns,
                                   'importance': model.get_feature_importance()})


# submit['winPlacePercPredict'] = clf.predict(submit[used_cols])
# submit['matchRank'] = submit.groupby('matchId')['winPlacePercPredict'].rank()
# match_players = submit.groupby('matchId')['Id'].count()
# match_step = 1/(match_players+1)

# match_stat = pd.DataFrame({'matchId': match_players.index,
#                            'players': match_players.values,
#                            'matchStep': match_step.values})

# sub = pd.merge(submit, match_stat, on='matchId',how='inner')
# sub.sort_values(['matchId', 'winPlacePercPredict'], inplace=True)
# sub['winPlacePerc'] = (sub['matchRank']-1)*sub['matchStep']
# sub.loc[sub.isnull().any(axis=1), ['winPlacePerc']]=1


# sub.to_csv('submission.csv', columns=['Id', 'winPlacePerc'], index=False, sep=',', decimal='.')

0:	learn: 0.7961477	total: 25.3ms	remaining: 786ms
1:	learn: 0.8105939	total: 46.3ms	remaining: 695ms
2:	learn: 0.8041734	total: 69.5ms	remaining: 672ms
3:	learn: 0.7993579	total: 91.2ms	remaining: 638ms
4:	learn: 0.7977528	total: 112ms	remaining: 607ms
5:	learn: 0.7977528	total: 134ms	remaining: 581ms
6:	learn: 0.7961477	total: 155ms	remaining: 553ms
7:	learn: 0.8009631	total: 175ms	remaining: 526ms
8:	learn: 0.8041734	total: 196ms	remaining: 502ms
9:	learn: 0.8057785	total: 218ms	remaining: 479ms
10:	learn: 0.8170144	total: 247ms	remaining: 472ms
11:	learn: 0.8298555	total: 274ms	remaining: 456ms
12:	learn: 0.8266453	total: 296ms	remaining: 432ms
13:	learn: 0.8234350	total: 317ms	remaining: 407ms
14:	learn: 0.8250401	total: 337ms	remaining: 382ms
15:	learn: 0.8282504	total: 358ms	remaining: 358ms
16:	learn: 0.8298555	total: 397ms	remaining: 350ms
17:	learn: 0.8298555	total: 418ms	remaining: 325ms
18:	learn: 0.8314607	total: 438ms	remaining: 300ms
19:	learn: 0.8314607	total: 461ms	rem

In [7]:
# Keep only the features whose normalized importance is above 1%.
# NOTE: the printed label says "non zero", but the filter applied is importance > 1.
fi = feature_importance.loc[feature_importance['importance'] > 1]

total_columns = len(X_train.columns)
kept_columns = len(fi.index)
kept_importance = np.round(fi['importance'].sum(), 1)
print(total_columns, '->', kept_columns, 'non zero important features:', kept_importance, '%')

# Show the retained features, most important first.
fi.sort_values(by='importance', ascending=False)

449 -> 17 non zero important features: 83.7 %


Unnamed: 0,columns,importance
162,mult_Age_nan_median_with_Sex_dummie_male,17.259506
52,mult_Pclass_with_Sex_dummie_female,13.098585
224,mult_Age_nan_max_with_Sex_dummie_male,11.634053
361,sum_Cabin_isnull_with_Sex_dummie_male,11.184983
53,sum_Pclass_with_Sex_dummie_male,6.26061
342,mult_Fare_nan_0_with_Sex_dummie_female,4.527975
68,mult_Pclass_with_Embarked_idxmax_dummie_S,4.081789
379,sum_Sex_dummie_female_with_Embarked_nan_dummie_C,2.362827
344,mult_Fare_nan_0_with_Sex_dummie_male,2.112718
67,sum_Pclass_with_Embarked_idxmax_dummie_S,1.808518


In [8]:
# Write the selected feature table (column name + importance) to CSV,
# omitting the DataFrame index.
important_columns_path = KAGGLE_DIR + KAGGLE_PREFIX + '_important_columns.csv'
fi.to_csv(important_columns_path, index=False)