In [6]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import SelectKBest, RFE, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils import shuffle

random_seed = 42

In [2]:
# Считывание хороших и плохих контрактов
bad_cntr = pd.read_csv('../data/2-yarobl_bad_contracts.csv')
good_cntr = pd.read_csv('../data/2-yarobl_good_contracts.csv')
data = bad_cntr.append(good_cntr)
data = data.dropna()
data = shuffle(data, random_state=random_seed)
data.head(5)

Unnamed: 0,cntrID,org_cntr_num,org_1s_sev,org_1s_sup_sev,sup_cntr_num,sup_cntr_avg_price,sup_cntr_avg_penalty,sup_no_pnl_share,sup_okpd_exp,sup_1s_sev,sup_1s_org_sev,sup_sim_price,supType,orgForm,price_higher_pmp,price_too_low,price,pmp,okpd,cntr_result
1701,1489889,2643,0.0,0.0,3,221400.0,0.0,1.0,1.0,0,0.0,0.66667,3,0,0,1,249600.0,484210.0,0,0
3446,1410055,64,0.0,0.0,8,269412.0,0.0,1.0,1.0,0,0.0,0.5,3,13,0,0,379600.0,500000.0,0,1
3388,1394632,37,0.0,0.0,6,611850.0,0.0,1.0,1.0,0,0.0,0.16667,3,13,0,0,280000.0,280000.0,0,1
673,1461932,2697,0.0,0.0,69,158718.0,0.0,1.0,1.0,0,0.0,0.08696,3,13,0,0,144561.1,199395.0,0,0
4047,1547772,33,0.0,0.0,28,137404.0,0.0,1.0,0.60714,0,0.0,0.07143,2,8,0,0,28000.0,39421.44,0,0


In [3]:
data.iloc[:,:].corr()

Unnamed: 0,cntrID,org_cntr_num,org_1s_sev,org_1s_sup_sev,sup_cntr_num,sup_cntr_avg_price,sup_cntr_avg_penalty,sup_no_pnl_share,sup_okpd_exp,sup_1s_sev,sup_1s_org_sev,sup_sim_price,supType,orgForm,price_higher_pmp,price_too_low,price,pmp,okpd,cntr_result
cntrID,1.0,-0.088172,0.005213,-0.01269,-0.128972,0.002064,0.038342,0.014783,-0.633469,0.011902,0.015638,-0.03046,-0.382673,-0.033345,-0.029965,0.0561,0.003186,0.01777,0.469746,-0.808774
org_cntr_num,-0.088172,1.0,-0.060729,-0.020177,-0.063449,-0.01552,0.037064,0.010245,0.084858,-0.003865,-0.02293,-0.080055,0.113871,-0.003155,0.04425,0.03222,-0.014104,-0.01223,-0.021136,0.075292
org_1s_sev,0.005213,-0.060729,1.0,-0.007043,-0.022742,0.008688,-0.001244,0.010904,0.026591,-0.002581,0.304587,0.042401,0.009673,-0.001691,0.004583,-0.006875,0.009176,0.011841,-0.013372,-0.031655
org_1s_sup_sev,-0.01269,-0.020177,-0.007043,1.0,-0.01372,0.004981,-0.003338,0.005327,0.013089,0.131367,-0.003092,-0.001485,0.013366,0.018344,-0.002858,0.006298,0.006817,0.009548,0.008079,0.019088
sup_cntr_num,-0.128972,-0.063449,-0.022742,-0.01372,1.0,-0.022074,-0.008391,0.005746,0.046278,-0.011289,-0.029232,-0.321372,0.088339,-0.088433,0.00473,-0.044209,-0.022864,-0.023912,-0.091306,0.176932
sup_cntr_avg_price,0.002064,-0.01552,0.008688,0.004981,-0.022074,1.0,-0.001945,0.004711,0.01863,0.013041,0.01092,0.066863,0.013453,0.034422,-0.001424,-0.01053,0.994991,0.781411,0.029381,-0.013065
sup_cntr_avg_penalty,0.038342,0.037064,-0.001244,-0.003338,-0.008391,-0.001945,1.0,-0.082227,-0.049119,-0.001223,-0.003299,-0.029966,0.016746,0.017347,-0.00305,0.019354,-0.002336,-0.002001,0.032603,0.00212
sup_no_pnl_share,0.014783,0.010245,0.010904,0.005327,0.005746,0.004711,-0.082227,1.0,0.073737,0.002055,0.005542,-0.004076,-0.037295,-0.054847,-0.138502,-0.06705,0.00473,0.004809,0.017018,-0.00411
sup_okpd_exp,-0.633469,0.084858,0.026591,0.013089,0.046278,0.01863,-0.049119,0.073737,1.0,0.009201,0.023465,0.236808,0.380569,0.082026,0.001669,-0.029842,0.017082,0.016322,-0.408439,0.419791
sup_1s_sev,0.011902,-0.003865,-0.002581,0.131367,-0.011289,0.013041,-0.001223,0.002055,0.009201,1.0,-0.001133,0.033766,-0.000266,-0.014125,-0.001047,0.010153,0.012947,0.019813,-0.006414,-0.020591


Корреляция отражает линейную зависимость. Наибольшую корреляции с целевой переменной (`cntr_result`) имеют `sup_okpd_exp` (0.42), `supType` (0.33) и `sup_cntr_num` (0.18).

In [4]:
column_names = data.loc[:,data.columns != 'cntr_result'].columns

X = data.loc[:,data.columns != 'cntr_result']
X = X.iloc[:,1:]
y = data.cntr_result.values

In [7]:
model1 = LogisticRegression()
model2 = GradientBoostingClassifier()

NUM_OF_BEST_FEATURES = 5

fit1 = RFE(model1, NUM_OF_BEST_FEATURES).fit(X, y)
fit2 = RFE(model2, NUM_OF_BEST_FEATURES).fit(X, y)

result1 = sorted(
    dict(zip(column_names, fit1.ranking_)).items(),
    key=lambda a: a[1]
)
result2 = sorted(
    dict(zip(column_names, fit2.ranking_)).items(),
    key=lambda a: a[1]
)

print('НАИБОЛЕЕ важные переменные для LogisticRegression:')
print([var[0] for var in result1[:NUM_OF_BEST_FEATURES]])
print('НАИБОЛЕЕ важные переменные для GradientBoostingClassifier:')
print([var[0] for var in result2[:NUM_OF_BEST_FEATURES]])

print('\nНАИМЕНЕЕ важные переменные для LogisticRegression:')
print([var[0] for var in result1[-NUM_OF_BEST_FEATURES:]])
print('НАИМЕНЕЕ важные переменные для GradientBoostingClassifier:')
print([var[0] for var in result2[-NUM_OF_BEST_FEATURES:]])

НАИБОЛЕЕ важные переменные для LogisticRegression:
['cntrID', 'sup_cntr_num', 'price_too_low', 'price', 'pmp']
НАИБОЛЕЕ важные переменные для GradientBoostingClassifier:
['cntrID', 'org_1s_sup_sev', 'sup_cntr_num', 'sup_no_pnl_share', 'supType']

НАИМЕНЕЕ важные переменные для LogisticRegression:
['price_higher_pmp', 'sup_okpd_exp', 'sup_cntr_avg_price', 'org_cntr_num', 'org_1s_sev']
НАИМЕНЕЕ важные переменные для GradientBoostingClassifier:
['org_1s_sev', 'sup_cntr_avg_price', 'orgForm', 'price_higher_pmp', 'sup_okpd_exp']


### Выбор признаков

In [4]:
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, y)
result = sorted(
    dict(zip(data.iloc[:,1:-1].columns, [int(e) for e in list(fit.scores_)])).items(),
    key=lambda a: a[1],
    reverse=True
)

In [5]:
result

[('sup_cntr_avg_price', 8892454),
 ('org_cntr_num', 544717),
 ('sup_cntr_num', 1063),
 ('price_higher_pmp', 430),
 ('sup_okpd_exp', 274),
 ('sup_sim_price', 39),
 ('sup_1s_org_sev', 27),
 ('price_too_low', 11),
 ('sup_1s_sev', 5),
 ('org_1s_sev', 0),
 ('org_1s_sup_sev', 0),
 ('sup_cntr_avg_penalty', 0),
 ('sup_no_pnl_share', 0)]

In [6]:
print('Самые важные признаки: {}'.format(', '.join(elem[0] for elem in result[:3])))

Самые важные признаки: sup_cntr_avg_price, org_cntr_num, sup_cntr_num


In [7]:
model = LogisticRegression()

NUM_OF_BEST_FEATURES = 3
fit = RFE(model, NUM_OF_BEST_FEATURES).fit(X, y)

result = sorted(
    dict(zip(column_names, fit.ranking_)).items(),
    key=lambda a: a[1]
)

In [8]:
result

[('sup_no_pnl_share', 1),
 ('sup_okpd_exp', 1),
 ('sup_1s_org_sev', 1),
 ('price_higher_pmp', 2),
 ('sup_1s_sev', 3),
 ('price_too_low', 4),
 ('org_1s_sev', 5),
 ('sup_sim_price', 6),
 ('sup_cntr_avg_penalty', 7),
 ('sup_cntr_num', 8),
 ('org_cntr_num', 9),
 ('sup_cntr_avg_price', 10),
 ('org_1s_sup_sev', 11)]