In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import SelectKBest, RFE, chi2
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the "bad" and "good" contract tables and stack them into one frame.
bad_cntr = pd.read_csv('../data/yarobl_bad_contracts_plus.csv')
good_cntr = pd.read_csv('../data/yaroblmz_good_contracts.csv')
# DataFrame.append was removed in pandas 2.0; pd.concat with default arguments
# is the equivalent call (original row indices of both frames are kept).
data = pd.concat([bad_cntr, good_cntr])
# Drop every row that contains at least one missing value.
data = data.dropna()
print(data.shape)

(11862, 15)


In [3]:
# Feature matrix: every column except the first and the last one.
# NOTE(review): presumably the skipped columns are an identifier and the
# target — confirm against the CSV schema.
X = data.iloc[:, 1:-1]
column_names = X.columns
# Target vector, read from the `cntr_result` column as a NumPy array.
y = data['cntr_result'].values

### Выбор признаков

In [4]:
# Score each feature against the target with the chi-squared statistic.
# NOTE(review): only `scores_` is read below, so `k=4` has no effect on `result`.
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, y)
# Build (feature name, score truncated to int) pairs, highest score first.
result = sorted(
    {name: int(score) for name, score in zip(column_names, fit.scores_)}.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

In [5]:
# Show the (feature, chi2 score) pairs, sorted by descending score
# (assumes the chi2 scoring cell was the last one to assign `result`).
result

[('sup_cntr_avg_price', 8892454),
 ('org_cntr_num', 544717),
 ('sup_cntr_num', 1063),
 ('price_higher_pmp', 430),
 ('sup_okpd_exp', 274),
 ('sup_sim_price', 39),
 ('sup_1s_org_sev', 27),
 ('price_too_low', 11),
 ('sup_1s_sev', 5),
 ('org_1s_sev', 0),
 ('org_1s_sup_sev', 0),
 ('sup_cntr_avg_penalty', 0),
 ('sup_no_pnl_share', 0)]

In [6]:
# Print the names of the first three entries of `result`, comma-separated.
print('Самые важные признаки: {}'.format(', '.join(name for name, _ in result[:3])))

Самые важные признаки: sup_cntr_avg_price, org_cntr_num, sup_cntr_num


In [7]:
# Rank features with recursive feature elimination (RFE) wrapped around a
# logistic regression with default settings.
# NOTE(review): X is passed unscaled; RFE ranks by coefficient magnitude, so
# the ranking may depend on feature scale — consider standardising first.
model = LogisticRegression()

NUM_OF_BEST_FEATURES = 3
# `n_features_to_select` must be passed by keyword: it is keyword-only in
# scikit-learn >= 1.0, where the positional form raises TypeError.
fit = RFE(model, n_features_to_select=NUM_OF_BEST_FEATURES).fit(X, y)

# `ranking_` gives rank 1 to the selected features; sort ascending so the
# selected features come first.
result = sorted(
    dict(zip(column_names, fit.ranking_)).items(),
    key=lambda a: a[1]
)

In [8]:
# Show the (feature, RFE rank) pairs, sorted by ascending rank
# (assumes the RFE cell was the last one to assign `result`).
result

[('sup_no_pnl_share', 1),
 ('sup_okpd_exp', 1),
 ('sup_1s_org_sev', 1),
 ('price_higher_pmp', 2),
 ('sup_1s_sev', 3),
 ('price_too_low', 4),
 ('org_1s_sev', 5),
 ('sup_sim_price', 6),
 ('sup_cntr_avg_penalty', 7),
 ('sup_cntr_num', 8),
 ('org_cntr_num', 9),
 ('sup_cntr_avg_price', 10),
 ('org_1s_sup_sev', 11)]