In [36]:
import pandas as pd
import numpy as np
from usfull_tools import load_DS
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
from sklearn.model_selection import train_test_split
pd.options.display.max_columns = None
%matplotlib inline

from set_vars import KAGGLE_PREFIX, debug_mode, KAGGLE_DIR, target_column, target_type, loss_function, custom_metric

# Load both frames through load_DS; only `train` is used in this notebook,
# so `test` is released immediately.
train, test = load_DS(debug_mode, KAGGLE_DIR, KAGGLE_PREFIX, '_prepare.csv')
del test

# Every column except the target is a feature. 30% of the rows are held out
# with a fixed seed so the split is reproducible.
feature_columns = train.columns.drop(target_column)
X_train, X_test, y_train, y_test = train_test_split(
    train[feature_columns],
    train[target_column],
    test_size=0.3,
    random_state=42,
)
X_train.shape

(1022, 247)

In [2]:
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error

# Number of boosting iterations requested from CatBoost.
iterations = 1000

print('iterations :', iterations)

# Pick the CatBoost estimator that matches the kind of target.
# Parameter reference:
# https://tech.yandex.com/catboost/doc/dg/concepts/python-reference_parameters-list-docpage/#python-reference_parameters-list
# To train on GPU, add: task_type='GPU', devices='0'
# NOTE(review): od_pval configures the overfitting detector, which presumably
# only takes effect when an eval_set is passed to fit(); none is passed
# below -- confirm whether early stopping was intended.
if target_type == 'binary':
    model = CatBoostClassifier(random_seed=42, iterations=iterations, depth=2, learning_rate=0.1,
                               loss_function=loss_function, custom_metric=custom_metric, od_pval=1e-3)
elif target_type == 'interval':
    model = CatBoostRegressor(random_seed=42, iterations=iterations, depth=6, learning_rate=0.1,
                              loss_function=loss_function, custom_metric=custom_metric, od_pval=1e-3)
else:
    # Fail loudly instead of continuing with an undefined (or stale) `model`.
    raise ValueError("Unsupported target_type: %r (expected 'binary' or 'interval')" % (target_type,))

# CatBoost requires categorical features to be declared explicitly;
# collect the positional indices of all object-dtype columns.
cat_features = [position for position, dtype in enumerate(X_train.dtypes) if dtype == 'object']

model.fit(X_train, y_train, cat_features=cat_features)

# Hold-out metrics. sklearn metrics take (y_true, y_pred) in that order.
test_predictions = model.predict(X_test)

if custom_metric == 'Accuracy':
    print("Accuracy: %.3f"
          % accuracy_score(y_test, test_predictions))

if custom_metric == 'RMSE':
    # Root of the mean squared error, so the printed value matches its label
    # (previously a mean absolute error was printed under the "RMSE" label).
    print("RMSE: %.3f"
          % np.sqrt(mean_squared_error(y_test, test_predictions)))

# Per-feature importance evaluated on the hold-out pool, highest score first.
holdout_pool = Pool(X_test, label=y_test, cat_features=cat_features)
feature_importance = pd.DataFrame({
    'Feature': X_test.columns,
    'Score': model.get_feature_importance(holdout_pool),
})
feature_importance = feature_importance.sort_values(by='Score', ascending=False)

# TODO: pick the best feature out of each group of correlated features

iterations : 1000




0:	learn: 179806.1373854	total: 434ms	remaining: 7m 13s
1:	learn: 164097.1914376	total: 791ms	remaining: 6m 34s
2:	learn: 150082.1318647	total: 1.19s	remaining: 6m 34s
3:	learn: 137266.9074586	total: 1.52s	remaining: 6m 17s
4:	learn: 125617.9008538	total: 1.89s	remaining: 6m 16s
5:	learn: 114969.9123178	total: 2.25s	remaining: 6m 13s
6:	learn: 105646.3296720	total: 2.56s	remaining: 6m 3s
7:	learn: 97147.8517365	total: 2.88s	remaining: 5m 57s
8:	learn: 89703.1702257	total: 3.19s	remaining: 5m 50s
9:	learn: 82775.3445254	total: 3.5s	remaining: 5m 46s
10:	learn: 76592.1578867	total: 3.89s	remaining: 5m 49s
11:	learn: 71106.8074192	total: 4.28s	remaining: 5m 52s
12:	learn: 66184.6406316	total: 4.61s	remaining: 5m 50s
13:	learn: 61845.3989619	total: 4.94s	remaining: 5m 47s
14:	learn: 57786.0513013	total: 5.16s	remaining: 5m 38s
15:	learn: 54403.6004857	total: 5.45s	remaining: 5m 34s
16:	learn: 51305.5252559	total: 5.78s	remaining: 5m 34s
17:	learn: 48770.5029276	total: 6.21s	remaining: 5m 3

147:	learn: 21416.6488984	total: 47s	remaining: 4m 30s
148:	learn: 21406.8741153	total: 47.4s	remaining: 4m 30s
149:	learn: 21405.6273642	total: 47.7s	remaining: 4m 30s
150:	learn: 21400.8608972	total: 48s	remaining: 4m 30s
151:	learn: 21382.5016706	total: 48.4s	remaining: 4m 29s
152:	learn: 21361.0790479	total: 48.7s	remaining: 4m 29s
153:	learn: 21349.6463744	total: 49s	remaining: 4m 29s
154:	learn: 21340.1369481	total: 49.4s	remaining: 4m 29s
155:	learn: 21334.2813152	total: 49.7s	remaining: 4m 28s
156:	learn: 21240.1201207	total: 50s	remaining: 4m 28s
157:	learn: 21224.0405036	total: 50.4s	remaining: 4m 28s
158:	learn: 21156.1557660	total: 50.7s	remaining: 4m 28s
159:	learn: 21120.6071879	total: 51s	remaining: 4m 27s
160:	learn: 21117.5996483	total: 51.3s	remaining: 4m 27s
161:	learn: 21112.8794186	total: 51.7s	remaining: 4m 27s
162:	learn: 21086.9976730	total: 52s	remaining: 4m 27s
163:	learn: 21084.6560950	total: 52.4s	remaining: 4m 27s
164:	learn: 21076.4965581	total: 52.7s	rema

291:	learn: 18840.8280666	total: 1m 35s	remaining: 3m 51s
292:	learn: 18837.0356287	total: 1m 35s	remaining: 3m 50s
293:	learn: 18835.5224103	total: 1m 36s	remaining: 3m 50s
294:	learn: 18825.1848658	total: 1m 36s	remaining: 3m 50s
295:	learn: 18816.9433073	total: 1m 36s	remaining: 3m 50s
296:	learn: 18808.8232617	total: 1m 37s	remaining: 3m 49s
297:	learn: 18808.3832460	total: 1m 37s	remaining: 3m 49s
298:	learn: 18792.3075128	total: 1m 37s	remaining: 3m 49s
299:	learn: 18791.9087221	total: 1m 38s	remaining: 3m 48s
300:	learn: 18788.0124201	total: 1m 38s	remaining: 3m 48s
301:	learn: 18773.4785436	total: 1m 38s	remaining: 3m 48s
302:	learn: 18766.4450542	total: 1m 39s	remaining: 3m 48s
303:	learn: 18765.5892781	total: 1m 39s	remaining: 3m 47s
304:	learn: 18763.9666138	total: 1m 39s	remaining: 3m 47s
305:	learn: 18760.8606153	total: 1m 40s	remaining: 3m 47s
306:	learn: 18756.8223558	total: 1m 40s	remaining: 3m 47s
307:	learn: 18705.1522662	total: 1m 40s	remaining: 3m 46s
308:	learn: 18

433:	learn: 17612.4248760	total: 2m 26s	remaining: 3m 10s
434:	learn: 17607.7527010	total: 2m 26s	remaining: 3m 10s
435:	learn: 17597.5169201	total: 2m 27s	remaining: 3m 10s
436:	learn: 17593.8847684	total: 2m 27s	remaining: 3m 9s
437:	learn: 17593.4778153	total: 2m 27s	remaining: 3m 9s
438:	learn: 17593.0446343	total: 2m 28s	remaining: 3m 9s
439:	learn: 17592.4177587	total: 2m 28s	remaining: 3m 8s
440:	learn: 17592.1184003	total: 2m 28s	remaining: 3m 8s
441:	learn: 17588.3688410	total: 2m 29s	remaining: 3m 8s
442:	learn: 17586.7374105	total: 2m 29s	remaining: 3m 7s
443:	learn: 17582.6456323	total: 2m 29s	remaining: 3m 7s
444:	learn: 17581.6739310	total: 2m 29s	remaining: 3m 7s
445:	learn: 17581.4614553	total: 2m 30s	remaining: 3m 6s
446:	learn: 17570.5768683	total: 2m 30s	remaining: 3m 6s
447:	learn: 17570.1891305	total: 2m 30s	remaining: 3m 6s
448:	learn: 17567.2680524	total: 2m 31s	remaining: 3m 5s
449:	learn: 17562.3635990	total: 2m 31s	remaining: 3m 5s
450:	learn: 17560.2472554	to

576:	learn: 16683.0718624	total: 3m 17s	remaining: 2m 25s
577:	learn: 16682.6851955	total: 3m 18s	remaining: 2m 24s
578:	learn: 16680.5270966	total: 3m 18s	remaining: 2m 24s
579:	learn: 16679.9993447	total: 3m 18s	remaining: 2m 23s
580:	learn: 16679.1575437	total: 3m 19s	remaining: 2m 23s
581:	learn: 16677.4169746	total: 3m 19s	remaining: 2m 23s
582:	learn: 16671.3095001	total: 3m 19s	remaining: 2m 22s
583:	learn: 16669.2220027	total: 3m 20s	remaining: 2m 22s
584:	learn: 16667.8940332	total: 3m 20s	remaining: 2m 22s
585:	learn: 16667.4259888	total: 3m 20s	remaining: 2m 21s
586:	learn: 16666.7163558	total: 3m 21s	remaining: 2m 21s
587:	learn: 16664.4845094	total: 3m 21s	remaining: 2m 21s
588:	learn: 16664.0345968	total: 3m 21s	remaining: 2m 20s
589:	learn: 16662.5032756	total: 3m 22s	remaining: 2m 20s
590:	learn: 16613.9529964	total: 3m 22s	remaining: 2m 20s
591:	learn: 16608.9173979	total: 3m 22s	remaining: 2m 19s
592:	learn: 16607.5042253	total: 3m 23s	remaining: 2m 19s
593:	learn: 16

719:	learn: 16160.8179947	total: 4m 7s	remaining: 1m 36s
720:	learn: 16160.1381270	total: 4m 7s	remaining: 1m 35s
721:	learn: 16158.6281250	total: 4m 8s	remaining: 1m 35s
722:	learn: 16157.3023972	total: 4m 8s	remaining: 1m 35s
723:	learn: 16157.1469856	total: 4m 8s	remaining: 1m 34s
724:	learn: 16156.5841451	total: 4m 9s	remaining: 1m 34s
725:	learn: 16155.1673779	total: 4m 9s	remaining: 1m 34s
726:	learn: 16154.6322889	total: 4m 9s	remaining: 1m 33s
727:	learn: 16150.6708413	total: 4m 10s	remaining: 1m 33s
728:	learn: 16150.2779139	total: 4m 10s	remaining: 1m 33s
729:	learn: 16149.9251818	total: 4m 10s	remaining: 1m 32s
730:	learn: 16149.5505057	total: 4m 11s	remaining: 1m 32s
731:	learn: 16149.1209233	total: 4m 11s	remaining: 1m 32s
732:	learn: 16136.4142892	total: 4m 11s	remaining: 1m 31s
733:	learn: 16135.7281777	total: 4m 12s	remaining: 1m 31s
734:	learn: 16135.4841035	total: 4m 12s	remaining: 1m 31s
735:	learn: 16132.4040010	total: 4m 13s	remaining: 1m 30s
736:	learn: 16132.0365

862:	learn: 15685.2413804	total: 4m 57s	remaining: 47.2s
863:	learn: 15676.5463505	total: 4m 57s	remaining: 46.9s
864:	learn: 15664.4729825	total: 4m 58s	remaining: 46.5s
865:	learn: 15662.6437670	total: 4m 58s	remaining: 46.2s
866:	learn: 15662.4545164	total: 4m 58s	remaining: 45.8s
867:	learn: 15661.3880559	total: 4m 59s	remaining: 45.5s
868:	learn: 15661.0019410	total: 4m 59s	remaining: 45.1s
869:	learn: 15660.7073757	total: 4m 59s	remaining: 44.8s
870:	learn: 15659.9061981	total: 5m	remaining: 44.4s
871:	learn: 15659.8629657	total: 5m	remaining: 44.1s
872:	learn: 15659.1608057	total: 5m	remaining: 43.8s
873:	learn: 15658.7604240	total: 5m 1s	remaining: 43.4s
874:	learn: 15656.2389269	total: 5m 1s	remaining: 43.1s
875:	learn: 15651.1145222	total: 5m 1s	remaining: 42.7s
876:	learn: 15650.8611195	total: 5m 2s	remaining: 42.4s
877:	learn: 15648.3358290	total: 5m 2s	remaining: 42s
878:	learn: 15641.4261321	total: 5m 2s	remaining: 41.7s
879:	learn: 15641.1997582	total: 5m 3s	remaining: 4

In [23]:
# Keep only features whose importance score exceeds the threshold.
# The summed score is printed as a percentage below, so 0.1 means 0.1%
# (the previous comment said 1%).
# NOTE(review): confirm 0.1 is the intended cut-off.
MIN_IMPORTANCE_SCORE = 0.1

fi = feature_importance[feature_importance.Score > MIN_IMPORTANCE_SCORE]
print(len(X_train.columns), '->', len(fi.index), 'non zero important features:', np.round(fi.Score.sum(),1), '%')
fi.sort_values('Score', ascending = False)

417 -> 107 non zero important features: 96.9 %


Unnamed: 0,Feature,Score
130,sum_OverallQual_log_PoolQC,11.016924
122,multiply_OverallQual_GarageCars,6.332999
197,multiply_GrLivArea_GarageCars,4.974448
112,division_OverallQual_YearRemodAdd,4.651936
195,multiply_GrLivArea_FullBath,3.668617
121,multiply_OverallQual_GarageYrBlt,3.607248
115,multiply_OverallQual_TotalBsmtSF,3.174275
140,division_YearRemodAdd_GrLivArea,3.125485
180,multiply_1stFlrSF_TotRmsAbvGrd,2.780085
126,multiply_OverallQual_log_LotArea,2.649034


In [37]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, LogisticRegression

# Object columns with fewer distinct values than this are one-hot encoded;
# object columns at or above it are dropped without encoding.
MAX_DUMMY_LEVELS = 20

# Linear estimator that RFE uses to rank features.
if target_type == 'binary':
    model = LogisticRegression(solver='liblinear')
elif target_type == 'interval':
    model = LinearRegression()
else:
    # Fail loudly instead of silently feeding a stale `model` into RFE.
    raise ValueError("Unsupported target_type: %r (expected 'binary' or 'interval')" % (target_type,))

# NOTE: X_train is rebound to the reduced, encoded frame below, so this cell
# is not idempotent -- re-run the train/test split cell before re-running it.
X_selected = X_train[fi.Feature.tolist()]

# Encode every object column in one pass. Resulting column order: remaining
# non-object columns first, then the dummy columns in source-column order.
object_columns = [name for name in X_selected.columns if X_selected[name].dtype == object]
dummy_frames = [
    pd.get_dummies(X_selected[name], prefix=str(name) + '_dummie')
    for name in object_columns
    if X_selected[name].nunique() < MAX_DUMMY_LEVELS
]
X_train = pd.concat([X_selected.drop(object_columns, axis=1)] + dummy_frames, axis=1)

selector = RFE(model)
selector = selector.fit(X_train, y_train)

In [42]:
# Collect the RFE outcome per feature: the boolean selection mask, the
# elimination rank, and the column name the two refer to.
rfe_columns = dict(
    zip(
        ['support', 'rank', 'Feature'],
        [selector.support_, selector.ranking_, X_train.columns],
    )
)
rfe_score = pd.DataFrame(rfe_columns)

[ True  True False  True False False False  True False  True False False
 False  True  True False False False False False  True  True False  True
 False  True  True False False False False False False  True  True  True
  True False  True  True  True False False  True False  True  True False
 False  True  True False  True False  True False False  True  True False
 False False  True  True  True False  True  True False  True  True False
 False  True False False False  True False False  True  True  True  True
 False  True False False False False  True  True  True False False  True
  True False  True False  True False False False  True  True  True  True
  True  True  True False False False  True  True False False  True False
  True  True False False  True False  True  True False  True False False
 False  True False False False  True  True  True  True  True  True False]
[ 1  1 50  1 53 51 57  1 59  1 65 28 20  1  1 56  5 66 45 68  1  1  9  1
 40  1  1 55 58 60 49 29  7  1  1  1  1 52  1  1  

In [46]:
# Order by RFE rank (ascending), then keep only the rows with rank 1.
rfe_score = rfe_score.sort_values(by='rank')
is_top_rank = rfe_score['rank'].eq(1)
fi = rfe_score.loc[is_top_rank]

In [47]:
# Write `fi` to <KAGGLE_DIR><KAGGLE_PREFIX>_important_columns.csv without the index column.
important_columns_path = KAGGLE_DIR + KAGGLE_PREFIX + '_important_columns.csv'
fi.to_csv(important_columns_path, index=False)