In [1]:
import pandas as pd
from gpalib import preprocessing

In [2]:
"""Display settings"""
# Let pandas render up to 300 columns so the wide frames below are shown in full.
pd.set_option('display.max_columns', 300)

### Step 1. Adding column names

In [3]:
%%time
# Load the raw export; header=None because the file carries no header row.
data = pd.read_csv('../data/russia-16-19-v2.csv', sep=';', header=None)

# Borrow the column names from another dataset and append the two extra
# territory columns. nrows=0 parses only the header line, so the reference
# file is not loaded into memory just to read its column names.
# NOTE(review): 'kakiningrad' looks like a typo for 'kaliningrad'; the path is
# left untouched because it must match the file on disk -- confirm and rename.
reference_columns = pd.read_csv('../data/kakiningrad-16-19.csv', sep=';', nrows=0).columns
data.columns = list(reference_columns) + ['org_ter', 'sup_ter']

# Persist the named dataset (without the index) as the input of the next step.
data.to_csv('../data/russia-16-19-v2.1.csv', index=False)

CPU times: user 29.6 s, sys: 970 ms, total: 30.6 s
Wall time: 32.3 s


In [4]:
# Print (rows, columns) of the named dataset, then show its first five rows.
print((len(data.index), len(data.columns)))
data.head(n=5)

(572066, 45)


Unnamed: 0,valID,cntrID,supID,orgID,okpdID,cntr_reg_num,sup_cntr_num,sup_running_cntr_num,sup_good_cntr_num,sup_fed_cntr_num,sup_sub_cntr_num,sup_mun_cntr_num,sup_cntr_avg_price,sup_cntr_avg_penalty_share,sup_no_pnl_share,sup_1s_sev,sup_1s_org_sev,sup_okpd_cntr_num,sup_sim_price_share,org_cntr_num,org_running_cntr_num,org_good_cntr_num,org_fed_cntr_num,org_sub_cntr_num,org_mun_cntr_num,org_cntr_avg_price,org_1s_sev,org_1s_sup_sev,org_sim_price_share,cntr_num_together,org_type,okpd_cntr_num,okpd_good_cntr_num,okpd,price,pmp,cntr_lvl,sign_date,exec_date,purch_type,price_higher_pmp,price_too_low,cntr_result,org_ter,sup_ter
0,58119014,39509208,10474056,370745,157035,2644600050617000034,339,66,339,1,338,0,262835,0,1,0,0,134,239,269,164,265,0,269,0,123302,0,0,149,2,16,1548.0,2295,620111000,79577,214550,2,20170203,20170630,1,0,1,0,67600.0,
1,58803716,40604578,10399429,430466,151412,2622903819617000075,47,25,45,5,42,0,188341,0,1,0,0,3,106,217,175,183,0,217,0,447034,0,0,97,1,16,26670.0,41325,202014000,435176,486235,2,20170427,20180131,3,0,0,1,52109.0,
2,58862275,39272060,10242364,414245,155877,3463002813017000005,547,503,196,84,120,343,139761,0,1,0,0,26,113,19,14,3,0,0,19,147610,0,0,105,1,16,3630.0,6679,360020110,24686,41343,3,20170120,20170228,6,0,1,1,52598.0,
3,58862275,39272060,10242364,414245,155879,3463002813017000005,547,503,196,84,120,343,139761,0,1,0,0,71,113,19,14,3,0,0,19,147610,0,0,105,1,16,20298.0,37070,360020130,24686,41343,3,20170120,20170228,6,0,1,1,52598.0,
4,59321033,39527446,11038878,465551,156057,1254001576717000022,1,0,0,1,0,0,123280928,0,1,0,0,1,1,215,81,210,215,0,0,1152392,0,0,5,1,30,5030.0,8097,412040000,123280928,123280928,1,20170207,20171231,3,0,0,1,70162.0,


### Step 2. Aggregating data by shortened OKPD code

In [5]:
# Number of leading OKPD code characters kept when codes are shortened.
OKPD_SYM_TO_SAVE = 2
# Name of the column that holds the shortened code (here: 'okpd2').
OKPD_COLUMN_NAME = f'okpd{OKPD_SYM_TO_SAVE}'

In [6]:
%%time
# Reload the step-1 checkpoint and aggregate it by the shortened OKPD code;
# the helper logs which columns it adds and aggregates.
data = (
    pd.read_csv('../data/russia-16-19-v2.1.csv')
    .pipe(preprocessing.aggregate_data_by_shortened_okpd, okpd_sym=OKPD_SYM_TO_SAVE)
)
# Checkpoint the aggregated frame for step 3.
data.to_csv('../data/russia-16-19-v2.2.csv', index=False)

New column `okpd2` is added
Data for columns `okpd_cntr_num` and `okpd_good_cntr_num` is aggregated
Data for column `sup_okpd_cntr_num` is aggregated

CPU times: user 52.1 s, sys: 2.36 s, total: 54.5 s
Wall time: 46.7 s


In [7]:
# Print (rows, columns) after OKPD aggregation, then show the first five rows.
print((len(data.index), len(data.columns)))
data.head(n=5)

(572066, 46)


Unnamed: 0,valID,cntrID,supID,orgID,okpdID,cntr_reg_num,sup_cntr_num,sup_running_cntr_num,sup_good_cntr_num,sup_fed_cntr_num,sup_sub_cntr_num,sup_mun_cntr_num,sup_cntr_avg_price,sup_cntr_avg_penalty_share,sup_no_pnl_share,sup_1s_sev,sup_1s_org_sev,sup_okpd_cntr_num,sup_sim_price_share,org_cntr_num,org_running_cntr_num,org_good_cntr_num,org_fed_cntr_num,org_sub_cntr_num,org_mun_cntr_num,org_cntr_avg_price,org_1s_sev,org_1s_sup_sev,org_sim_price_share,cntr_num_together,org_type,okpd_cntr_num,okpd_good_cntr_num,okpd,price,pmp,cntr_lvl,sign_date,exec_date,purch_type,price_higher_pmp,price_too_low,cntr_result,org_ter,sup_ter,okpd2
0,58119014,39509208,10474056,370745,157035,2644600050617000034,339,66,339,1,338,0,262835,0,1,0,0,216,239,269,164,265,0,269,0,123302,0,0,149,2,16,20014.0,31162,620111000,79577,214550,2,20170203,20170630,1,0,1,0,67600.0,,62
1,58803716,40604578,10399429,430466,151412,2622903819617000075,47,25,45,5,42,0,188341,0,1,0,0,20,106,217,175,183,0,217,0,447034,0,0,97,1,16,400901.0,634822,202014000,435176,486235,2,20170427,20180131,3,0,0,1,52109.0,,20
2,58862275,39272060,10242364,414245,155877,3463002813017000005,547,503,196,84,120,343,139761,0,1,0,0,167,113,19,14,3,0,0,19,147610,0,0,105,1,16,47875.0,85361,360020110,24686,41343,3,20170120,20170228,6,0,1,1,52598.0,,36
3,58862275,39272060,10242364,414245,155879,3463002813017000005,547,503,196,84,120,343,139761,0,1,0,0,167,113,19,14,3,0,0,19,147610,0,0,105,1,16,47875.0,85361,360020130,24686,41343,3,20170120,20170228,6,0,1,1,52598.0,,36
4,59321033,39527446,11038878,465551,156057,1254001576717000022,1,0,0,1,0,0,123280928,0,1,0,0,1,1,215,81,210,215,0,0,1152392,0,0,5,1,30,11457.0,18634,412040000,123280928,123280928,1,20170207,20171231,3,0,0,1,70162.0,,41


### Step 3. Flattening data: one contract = one row

In [8]:
%%time
# Reload the step-2 checkpoint and flatten it so that one contract = one row;
# debug=True makes the helper log each stage it completes.
data = (
    pd.read_csv('../data/russia-16-19-v2.2.csv')
    .pipe(preprocessing.flatten_data, OKPD_COLUMN_NAME, debug=True)
)
# Checkpoint the flattened frame for step 4.
data.to_csv('../data/russia-16-19-v2.3.csv', index=False)

New variable `okpd_num` was created
Dummy variables for `okpd2` and `sup_okpd_contract_share` were created
Data was flattened: one contract = one row
New variables (min, mean, max) instead of `okpd_good_cntr_share` were created
`socs_` variables were updated

CPU times: user 2min 52s, sys: 8.78 s, total: 3min 1s
Wall time: 2min 56s


In [9]:
# Print (rows, columns) of the flattened data, then show the first five rows.
print((len(data.index), len(data.columns)))
data.head(n=5)

(308273, 213)


Unnamed: 0,valID,cntrID,supID,orgID,okpdID,cntr_reg_num,sup_cntr_num,sup_running_cntr_num,sup_good_cntr_num,sup_fed_cntr_num,sup_sub_cntr_num,sup_mun_cntr_num,sup_cntr_avg_price,sup_cntr_avg_penalty_share,sup_no_pnl_share,sup_1s_sev,sup_1s_org_sev,sup_okpd_cntr_num,sup_sim_price_share,org_cntr_num,org_running_cntr_num,org_good_cntr_num,org_fed_cntr_num,org_sub_cntr_num,org_mun_cntr_num,org_cntr_avg_price,org_1s_sev,org_1s_sup_sev,org_sim_price_share,cntr_num_together,org_type,okpd_good_cntr_num,okpd_cntr_num,okpd,price,pmp,cntr_lvl,sign_date,exec_date,purch_type,price_higher_pmp,price_too_low,cntr_result,org_ter,sup_ter,okpd2,sup_okpd_cntr_share,okpd_num,okpd2_0,okpd2_10,okpd2_11,okpd2_12,okpd2_13,okpd2_14,okpd2_15,okpd2_16,okpd2_17,okpd2_18,okpd2_19,okpd2_20,okpd2_21,okpd2_22,okpd2_23,okpd2_24,okpd2_25,okpd2_26,okpd2_27,okpd2_28,okpd2_29,okpd2_30,okpd2_31,okpd2_32,okpd2_33,okpd2_35,okpd2_36,okpd2_37,okpd2_38,okpd2_39,okpd2_41,okpd2_42,okpd2_43,okpd2_45,okpd2_46,okpd2_47,okpd2_49,okpd2_50,okpd2_51,okpd2_52,okpd2_53,okpd2_55,okpd2_56,okpd2_58,okpd2_59,okpd2_60,okpd2_61,okpd2_62,okpd2_63,okpd2_64,okpd2_65,okpd2_66,okpd2_68,okpd2_69,okpd2_70,okpd2_71,okpd2_72,okpd2_73,okpd2_74,okpd2_75,okpd2_77,okpd2_78,okpd2_79,okpd2_80,okpd2_81,okpd2_82,okpd2_84,okpd2_85,okpd2_86,okpd2_87,okpd2_88,okpd2_89,okpd2_90,okpd2_91,okpd2_93,okpd2_94,okpd2_95,okpd2_96,okpd2_97,okpd2_98,okpd2_99,socs_0,socs_10,socs_11,socs_12,socs_13,socs_14,socs_15,socs_16,socs_17,socs_18,socs_19,socs_20,socs_21,socs_22,socs_23,socs_24,socs_25,socs_26,socs_27,socs_28,socs_29,socs_30,socs_31,socs_32,socs_33,socs_35,socs_36,socs_37,socs_38,socs_39,socs_41,socs_42,socs_43,socs_45,socs_46,socs_47,socs_49,socs_50,socs_51,socs_52,socs_53,socs_55,socs_56,socs_58,socs_59,socs_60,socs_61,socs_62,socs_63,socs_64,socs_65,socs_66,socs_68,socs_69,socs_70,socs_71,socs_72,socs_73,socs_74,socs_75,socs_77,socs_78,socs_79,socs_80,socs_81,socs_82,socs_84,socs_85,socs_86,socs_87,socs_88,socs_89,socs_90,socs_91,socs_93,socs_94,so
cs_95,socs_96,socs_97,socs_98,socs_99,okpd_good_share_min,okpd_good_share_mean,okpd_good_share_max
0,368554944,42093110,10321078,99737,151706,2540412116917000316,10,9,10,1,9,0,154856,0,1,0,0,8,2,917,326,907,0,917,0,552096,0,0,8,1,16,645600.0,1093576,212010134,113118,0,2,20170830,20171231,3,0,0,0,67162.0,,21,0.8,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.590357,0.590357,0.590357
1,368560631,40756905,11013129,101618,150380,3542010069517000022,60,91,60,1,30,29,54153,0,1,0,0,128,1,60,36,56,0,0,60,299688,0,0,15,1,30,225244.0,363730,171214119,58329,0,3,20170511,20171231,3,0,0,0,67162.0,46302.0,17,2.133333,21,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.133333,0.0,0.0,0.0,0.0,0.966667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.833333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.616669,0.630042,0.650133
2,430593247,39611403,4015905,491784,147660,3615405795317000010,141,96,96,0,38,103,84338,0,1,0,0,101,57,167,154,146,0,0,167,233252,0,0,114,9,16,276547.0,537141,101111110,96004,0,3,20170214,20171230,4,0,0,0,52550.0,52550.0,10,0.716312,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.716312,0.070922,0.0,0.0,0.021277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.51485,0.51485,0.51485
3,430649018,43087270,10403158,492691,159312,3612200351417000152,10,6,8,0,3,7,3108779,0,1,0,0,6,1,316,141,209,0,0,316,293409,0,0,66,2,16,5282.0,8539,960119115,214400,0,3,20171103,20171231,3,0,0,1,52550.0,46302.0,96,0.6,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.618574,0.618574,0.618574
4,380526264,39521403,11024951,210669,156853,3230602107217000005,13,5,12,0,1,12,490886,0,1,0,0,12,231,5,9,1,0,0,5,265952,0,0,2,1,16,7369.0,13565,562920120,310086,0,3,20170125,20171229,3,0,0,1,67175.0,,56,0.923077,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.923077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.543236,0.543236,0.543236


### Step 4. Adding new features and dropping unneeded columns

In [10]:
%%time
# Reload the step-3 checkpoint and run basic feature engineering; the helper
# logs the variables it creates and deletes.
data = (
    pd.read_csv('../data/russia-16-19-v2.3.csv')
    .pipe(preprocessing.basic_feature_engineering, OKPD_COLUMN_NAME)
)
# Save the final dataset of this notebook.
data.to_csv('../data/russia-16-19-v2.4.csv', index=False)

4 new variables were created: `plan_cntr_len`, `day_price`, `sign_month`, `sign_quarter`. Data shape: (308273, 217)
8 new variables were created: `sup_good_cntr_share`, `sup_fed_cntr_share`, `sup_sub_cntr_share` and `sup_mun_cntr_share` and the same for customer. Data shape: (308273, 225)
20 variables were deleted: `valID`, `supID`, `orgID`, `okpdID`, `cntr_reg_num`, `okpd`, `okpd2`, `sup_good_cntr_num`, `sup_fed_cntr_num`, `sup_sub_cntr_num`, `sup_mun_cntr_num`, `sup_okpd_cntr_num`, `sup_okpd_cntr_share`, `org_good_cntr_num`, `org_fed_cntr_num`, `org_sub_cntr_num`, `org_mun_cntr_num`, `okpd_good_cntr_num`, `sign_date`, `exec_date`. Data shape: (308273, 205)
Columns were reordered

CPU times: user 1min 7s, sys: 5.25 s, total: 1min 12s
Wall time: 1min 9s


In [11]:
# Print (rows, columns) after feature engineering, then show the first five rows.
print((len(data.index), len(data.columns)))
data.head(n=5)

(308273, 205)


Unnamed: 0,cntrID,sup_cntr_num,sup_running_cntr_num,sup_cntr_avg_price,sup_cntr_avg_penalty_share,sup_no_pnl_share,sup_1s_sev,sup_1s_org_sev,sup_sim_price_share,sup_ter,sup_good_cntr_share,sup_fed_cntr_share,sup_sub_cntr_share,sup_mun_cntr_share,org_cntr_num,org_running_cntr_num,org_cntr_avg_price,org_1s_sev,org_1s_sup_sev,org_sim_price_share,org_type,org_ter,org_good_cntr_share,org_fed_cntr_share,org_sub_cntr_share,org_mun_cntr_share,cntr_num_together,okpd_cntr_num,okpd_good_share_min,okpd_good_share_mean,okpd_good_share_max,sign_month,sign_quarter,cntr_okpd_num,plan_cntr_len,day_price,purch_type,cntr_lvl,price_higher_pmp,price_too_low,pmp,price,okpd2_0,okpd2_10,okpd2_11,okpd2_12,okpd2_13,okpd2_14,okpd2_15,okpd2_16,okpd2_17,okpd2_18,okpd2_19,okpd2_20,okpd2_21,okpd2_22,okpd2_23,okpd2_24,okpd2_25,okpd2_26,okpd2_27,okpd2_28,okpd2_29,okpd2_30,okpd2_31,okpd2_32,okpd2_33,okpd2_35,okpd2_36,okpd2_37,okpd2_38,okpd2_39,okpd2_41,okpd2_42,okpd2_43,okpd2_45,okpd2_46,okpd2_47,okpd2_49,okpd2_50,okpd2_51,okpd2_52,okpd2_53,okpd2_55,okpd2_56,okpd2_58,okpd2_59,okpd2_60,okpd2_61,okpd2_62,okpd2_63,okpd2_64,okpd2_65,okpd2_66,okpd2_68,okpd2_69,okpd2_70,okpd2_71,okpd2_72,okpd2_73,okpd2_74,okpd2_75,okpd2_77,okpd2_78,okpd2_79,okpd2_80,okpd2_81,okpd2_82,okpd2_84,okpd2_85,okpd2_86,okpd2_87,okpd2_88,okpd2_89,okpd2_90,okpd2_91,okpd2_93,okpd2_94,okpd2_95,okpd2_96,okpd2_97,okpd2_98,okpd2_99,socs_0,socs_10,socs_11,socs_12,socs_13,socs_14,socs_15,socs_16,socs_17,socs_18,socs_19,socs_20,socs_21,socs_22,socs_23,socs_24,socs_25,socs_26,socs_27,socs_28,socs_29,socs_30,socs_31,socs_32,socs_33,socs_35,socs_36,socs_37,socs_38,socs_39,socs_41,socs_42,socs_43,socs_45,socs_46,socs_47,socs_49,socs_50,socs_51,socs_52,socs_53,socs_55,socs_56,socs_58,socs_59,socs_60,socs_61,socs_62,socs_63,socs_64,socs_65,socs_66,socs_68,socs_69,socs_70,socs_71,socs_72,socs_73,socs_74,socs_75,socs_77,socs_78,socs_79,socs_80,socs_81,socs_82,socs_84,socs_85,socs_86,socs_87,socs_88,socs_89,socs_90,socs_91,socs_93,socs_94,socs_95,so
cs_96,socs_97,socs_98,socs_99,cntr_result
0,42093110,10,9,154856,0.0,1,0,0,0.2,,1.0,0.1,0.9,0.0,917,326,552096,0,0,0.08,16,67162.0,0.989095,0.0,1.0,0.0,1,1093576,0.590357,0.590357,0.590357,8,3,1,123,919.658537,3,2,0,0,0,113118,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,40756905,60,91,54153,0.0,1,0,0,0.1,46302.0,1.0,0.016667,0.5,0.483333,60,36,299688,0,0,0.15,30,67162.0,0.933333,0.0,0.0,1.0,1,363730,0.616669,0.630042,0.650133,5,2,21,234,249.269231,3,3,0,0,0,58329,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.133333,0.0,0.0,0.0,0.0,0.966667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.833333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,39611403,141,96,84338,0.0,1,0,0,0.057,52550.0,0.680851,0.0,0.269504,0.730496,167,154,233252,0,0,0.114,16,52550.0,0.874251,0.0,0.0,1.0,9,537141,0.51485,0.51485,0.51485,2,1,3,319,300.952978,4,3,0,0,0,96004,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.716312,0.070922,0.0,0.0,0.021277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,43087270,10,6,3108779,0.0,1,0,0,0.1,46302.0,0.8,0.0,0.3,0.7,316,141,293409,0,0,0.066,16,52550.0,0.661392,0.0,0.0,1.0,2,8539,0.618574,0.618574,0.618574,11,4,1,58,3696.551724,3,3,0,0,0,214400,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,1
4,39521403,13,5,490886,0.0,1,0,0,0.231,,0.923077,0.0,0.076923,0.923077,5,9,265952,0,0,0.2,16,67175.0,0.2,0.0,0.0,1.0,1,13565,0.543236,0.543236,0.543236,1,1,1,338,917.414201,3,3,0,0,0,310086,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.923077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


### Dataset naming
- ...-v2.csv - initial raw dataset (no header row)
- ...-v2.1.csv - dataset with column names added
- ...-v2.2.csv - dataset with data aggregated by the shortened OKPD code
- ...-v2.3.csv - flattened dataset: one contract is encoded by one observation (one row)
- ...-v2.4.csv - dataset after basic feature engineering