In [1]:
import pandas as pd
from gpalib import preprocessing
from gpalib import analysis

In [2]:
"""Display settings"""
# Floats are rendered with thousands separators and three decimals;
# up to 300 columns are shown when a frame is displayed.
pd.set_option('display.float_format', '{:,.3f}'.format)
pd.set_option('display.max_columns', 300)

### Dataset naming
- `v2` - initial dataset
- `v2.1` - dataset with column names
- `v2.2` - dataset with grouped data after shortening OKPD
- `v2.3` - one contract is encoded by one observation (one row)
- `v2.4` - dataset after basic feature engineering
- `v2.5` - dataset with preprocessing based on EDA (exploratory-data-analysis.ipynb)

### Step 1. Adding column names

In [3]:
%%time
# Reading data
data = pd.read_csv('../data/russia-16-19-v2.csv', sep=';', header=None)

# Adding columns names from another dataset
data.columns = list(pd.read_csv('../data/kakiningrad-16-19.csv', sep=';').columns) + ['org_ter', 'sup_ter']

# Saving new dataset
data.to_csv('../data/russia-16-19-v2.1.csv', index=False)

CPU times: user 28.3 s, sys: 1.05 s, total: 29.3 s
Wall time: 30 s


In [4]:
# Report the frame's dimensions, then preview its first five rows.
print('({}, {})'.format(*data.shape))
data.head(5)

(572066, 45)


Unnamed: 0,valID,cntrID,supID,orgID,okpdID,cntr_reg_num,sup_cntr_num,sup_running_cntr_num,sup_good_cntr_num,sup_fed_cntr_num,sup_sub_cntr_num,sup_mun_cntr_num,sup_cntr_avg_price,sup_cntr_avg_penalty_share,sup_no_pnl_share,sup_1s_sev,sup_1s_org_sev,sup_okpd_cntr_num,sup_sim_price_share,org_cntr_num,org_running_cntr_num,org_good_cntr_num,org_fed_cntr_num,org_sub_cntr_num,org_mun_cntr_num,org_cntr_avg_price,org_1s_sev,org_1s_sup_sev,org_sim_price_share,cntr_num_together,org_type,okpd_cntr_num,okpd_good_cntr_num,okpd,price,pmp,cntr_lvl,sign_date,exec_date,purch_type,price_higher_pmp,price_too_low,cntr_result,org_ter,sup_ter
0,58119014,39509208,10474056,370745,157035,2644600050617000034,339,66,339,1,338,0,262835,0,1,0,0,134,239,269,164,265,0,269,0,123302,0,0,149,2,16,1548.0,2295,620111000,79577,214550,2,20170203,20170630,1,0,1,0,67600.0,
1,58803716,40604578,10399429,430466,151412,2622903819617000075,47,25,45,5,42,0,188341,0,1,0,0,3,106,217,175,183,0,217,0,447034,0,0,97,1,16,26670.0,41325,202014000,435176,486235,2,20170427,20180131,3,0,0,1,52109.0,
2,58862275,39272060,10242364,414245,155877,3463002813017000005,547,503,196,84,120,343,139761,0,1,0,0,26,113,19,14,3,0,0,19,147610,0,0,105,1,16,3630.0,6679,360020110,24686,41343,3,20170120,20170228,6,0,1,1,52598.0,
3,58862275,39272060,10242364,414245,155879,3463002813017000005,547,503,196,84,120,343,139761,0,1,0,0,71,113,19,14,3,0,0,19,147610,0,0,105,1,16,20298.0,37070,360020130,24686,41343,3,20170120,20170228,6,0,1,1,52598.0,
4,59321033,39527446,11038878,465551,156057,1254001576717000022,1,0,0,1,0,0,123280928,0,1,0,0,1,1,215,81,210,215,0,0,1152392,0,0,5,1,30,5030.0,8097,412040000,123280928,123280928,1,20170207,20171231,3,0,0,1,70162.0,


### Step 2. Aggregating data for shortened OKPD

In [5]:
# Value passed as `okpd_sym` to the OKPD-shortening step, and the name of the
# column that holds the shortened code ('okpd' followed by that number).
OKPD_SYM_TO_SAVE = 2
OKPD_COLUMN_NAME = 'okpd' + str(OKPD_SYM_TO_SAVE)

In [6]:
%%time
data = pd.read_csv('../data/russia-16-19-v2.1.csv')
data = preprocessing.aggregate_data_by_shortened_okpd(data, okpd_sym=OKPD_SYM_TO_SAVE)
data.to_csv('../data/russia-16-19-v2.2.csv', index=False)

New column `okpd2` is added
Data for columns `okpd_cntr_num` and `okpd_good_cntr_num` is aggregated
Data for column `sup_okpd_cntr_num` is aggregated

CPU times: user 52.8 s, sys: 2.9 s, total: 55.7 s
Wall time: 48 s


In [7]:
# Report the frame's dimensions, then preview its first five rows.
print('({}, {})'.format(*data.shape))
data.head(5)

(572066, 46)


Unnamed: 0,valID,cntrID,supID,orgID,okpdID,cntr_reg_num,sup_cntr_num,sup_running_cntr_num,sup_good_cntr_num,sup_fed_cntr_num,sup_sub_cntr_num,sup_mun_cntr_num,sup_cntr_avg_price,sup_cntr_avg_penalty_share,sup_no_pnl_share,sup_1s_sev,sup_1s_org_sev,sup_okpd_cntr_num,sup_sim_price_share,org_cntr_num,org_running_cntr_num,org_good_cntr_num,org_fed_cntr_num,org_sub_cntr_num,org_mun_cntr_num,org_cntr_avg_price,org_1s_sev,org_1s_sup_sev,org_sim_price_share,cntr_num_together,org_type,okpd_cntr_num,okpd_good_cntr_num,okpd,price,pmp,cntr_lvl,sign_date,exec_date,purch_type,price_higher_pmp,price_too_low,cntr_result,org_ter,sup_ter,okpd2
0,58119014,39509208,10474056,370745,157035,2644600050617000034,339,66,339,1,338,0,262835,0,1,0,0,216,239,269,164,265,0,269,0,123302,0,0,149,2,16,20014.0,31162,620111000,79577,214550,2,20170203,20170630,1,0,1,0,67600.0,,62
1,58803716,40604578,10399429,430466,151412,2622903819617000075,47,25,45,5,42,0,188341,0,1,0,0,20,106,217,175,183,0,217,0,447034,0,0,97,1,16,400901.0,634822,202014000,435176,486235,2,20170427,20180131,3,0,0,1,52109.0,,20
2,58862275,39272060,10242364,414245,155877,3463002813017000005,547,503,196,84,120,343,139761,0,1,0,0,167,113,19,14,3,0,0,19,147610,0,0,105,1,16,47875.0,85361,360020110,24686,41343,3,20170120,20170228,6,0,1,1,52598.0,,36
3,58862275,39272060,10242364,414245,155879,3463002813017000005,547,503,196,84,120,343,139761,0,1,0,0,167,113,19,14,3,0,0,19,147610,0,0,105,1,16,47875.0,85361,360020130,24686,41343,3,20170120,20170228,6,0,1,1,52598.0,,36
4,59321033,39527446,11038878,465551,156057,1254001576717000022,1,0,0,1,0,0,123280928,0,1,0,0,1,1,215,81,210,215,0,0,1152392,0,0,5,1,30,11457.0,18634,412040000,123280928,123280928,1,20170207,20171231,3,0,0,1,70162.0,,41


### Step 3. Flattening data: one contract = one row

In [8]:
%%time
data = pd.read_csv('../data/russia-16-19-v2.2.csv')
data = preprocessing.flatten_data(data, OKPD_COLUMN_NAME, debug=True)
data.to_csv('../data/russia-16-19-v2.3.csv', index=False)

New variable `okpd_num` was created
Dummy variables for `okpd2` and `sup_okpd_contract_share` were created
Data was flattened: one contract = one row
New variables (min, mean, max) instead of `okpd_good_cntr_share` were created
`socs_` variables were updated

CPU times: user 3min 2s, sys: 10.3 s, total: 3min 12s
Wall time: 3min 12s


In [9]:
# Report the frame's dimensions, then preview its first five rows.
print('({}, {})'.format(*data.shape))
data.head(5)

(308273, 213)


Unnamed: 0,valID,cntrID,supID,orgID,okpdID,cntr_reg_num,sup_cntr_num,sup_running_cntr_num,sup_good_cntr_num,sup_fed_cntr_num,sup_sub_cntr_num,sup_mun_cntr_num,sup_cntr_avg_price,sup_cntr_avg_penalty_share,sup_no_pnl_share,sup_1s_sev,sup_1s_org_sev,sup_okpd_cntr_num,sup_sim_price_share,org_cntr_num,org_running_cntr_num,org_good_cntr_num,org_fed_cntr_num,org_sub_cntr_num,org_mun_cntr_num,org_cntr_avg_price,org_1s_sev,org_1s_sup_sev,org_sim_price_share,cntr_num_together,org_type,okpd_good_cntr_num,okpd_cntr_num,okpd,price,pmp,cntr_lvl,sign_date,exec_date,purch_type,price_higher_pmp,price_too_low,cntr_result,org_ter,sup_ter,okpd2,sup_okpd_cntr_share,okpd_num,okpd2_0,okpd2_10,okpd2_11,okpd2_12,okpd2_13,okpd2_14,okpd2_15,okpd2_16,okpd2_17,okpd2_18,okpd2_19,okpd2_20,okpd2_21,okpd2_22,okpd2_23,okpd2_24,okpd2_25,okpd2_26,okpd2_27,okpd2_28,okpd2_29,okpd2_30,okpd2_31,okpd2_32,okpd2_33,okpd2_35,okpd2_36,okpd2_37,okpd2_38,okpd2_39,okpd2_41,okpd2_42,okpd2_43,okpd2_45,okpd2_46,okpd2_47,okpd2_49,okpd2_50,okpd2_51,okpd2_52,okpd2_53,okpd2_55,okpd2_56,okpd2_58,okpd2_59,okpd2_60,okpd2_61,okpd2_62,okpd2_63,okpd2_64,okpd2_65,okpd2_66,okpd2_68,okpd2_69,okpd2_70,okpd2_71,okpd2_72,okpd2_73,okpd2_74,okpd2_75,okpd2_77,okpd2_78,okpd2_79,okpd2_80,okpd2_81,okpd2_82,okpd2_84,okpd2_85,okpd2_86,okpd2_87,okpd2_88,okpd2_89,okpd2_90,okpd2_91,okpd2_93,okpd2_94,okpd2_95,okpd2_96,okpd2_97,okpd2_98,okpd2_99,socs_0,socs_10,socs_11,socs_12,socs_13,socs_14,socs_15,socs_16,socs_17,socs_18,socs_19,socs_20,socs_21,socs_22,socs_23,socs_24,socs_25,socs_26,socs_27,socs_28,socs_29,socs_30,socs_31,socs_32,socs_33,socs_35,socs_36,socs_37,socs_38,socs_39,socs_41,socs_42,socs_43,socs_45,socs_46,socs_47,socs_49,socs_50,socs_51,socs_52,socs_53,socs_55,socs_56,socs_58,socs_59,socs_60,socs_61,socs_62,socs_63,socs_64,socs_65,socs_66,socs_68,socs_69,socs_70,socs_71,socs_72,socs_73,socs_74,socs_75,socs_77,socs_78,socs_79,socs_80,socs_81,socs_82,socs_84,socs_85,socs_86,socs_87,socs_88,socs_89,socs_90,socs_91,socs_93,socs_94,so
cs_95,socs_96,socs_97,socs_98,socs_99,okpd_good_share_min,okpd_good_share_mean,okpd_good_share_max
0,381534324,41348717,11145067,415619,151867,1461300463617000025,11,7,11,9,2,0,38392,0,1,0,0,1,182,117,13,117,117,0,0,914575,0,0,12,1,18,244018.0,388994,221950000,61097,0,1,20170717,20171201,4,0,0,0,52598.0,,22,0.091,9,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091,0.0,0.0,0.0,0.0,2.636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.627,0.634,0.641
1,348786979,40390205,11108197,423787,157580,2770727805017000068,2,3,2,0,2,0,13290,0,1,0,0,2,5,221,208,220,0,221,0,661040,0,0,59,1,16,42465.0,69131,854193000,4500,0,2,20170329,20170430,6,0,0,0,46302.0,46302.0,85,1.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.614,0.614,0.614
2,381081415,39521901,7728660,603873,151741,2245600229417000161,6535,3301,6366,130,6403,2,271847,14,0,0,0,2355,64,584,492,580,0,584,0,157481,0,0,113,147,16,645600.0,1093576,212010191,40260,0,2,20170207,20180331,3,0,0,0,71678.0,,21,0.36,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.59,0.59,0.59
3,378353115,44319014,11439915,352966,153275,1772905090118000163,2,0,2,1,0,1,55380,0,1,0,0,2,5,20,17,20,20,0,0,153836,0,0,25,1,16,171891.0,270539,262021120,45615,0,1,20180330,20180430,3,0,0,0,52596.0,52571.0,26,1.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.635,0.635,0.635
4,367692852,39052240,9488348,292228,151687,1501000335217000045,21,4,13,8,13,0,398882,0,1,0,0,21,286,222,96,94,222,0,0,455357,0,0,9,4,16,645600.0,1093576,211060196,670486,0,1,20170112,20171231,3,0,0,1,52601.0,46302.0,21,1.0,14,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.048,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.59,0.59,0.59


### Step 4. Adding some features and deleting unneeded ones

In [10]:
%%time
data = pd.read_csv('../data/russia-16-19-v2.3.csv')
data = preprocessing.basic_feature_engineering(data, OKPD_COLUMN_NAME)
data.to_csv('../data/russia-16-19-v2.4.csv', index=False)

4 new variables were created: `plan_cntr_len`, `day_price`, `sign_month`, `sign_quarter`. Data shape: (308273, 217)
8 new variables were created: `sup_good_cntr_share`, `sup_fed_cntr_share`, `sup_sub_cntr_share` and `sup_mun_cntr_share` and the same for customer. Data shape: (308273, 225)
20 variables were deleted: `valID`, `supID`, `orgID`, `okpdID`, `cntr_reg_num`, `okpd`, `okpd2`, `sup_good_cntr_num`, `sup_fed_cntr_num`, `sup_sub_cntr_num`, `sup_mun_cntr_num`, `sup_okpd_cntr_num`, `sup_okpd_cntr_share`, `org_good_cntr_num`, `org_fed_cntr_num`, `org_sub_cntr_num`, `org_mun_cntr_num`, `okpd_good_cntr_num`, `sign_date`, `exec_date`. Data shape: (308273, 205)
Columns were reordered

CPU times: user 1min 9s, sys: 5.97 s, total: 1min 15s
Wall time: 1min 13s


In [11]:
# Report the frame's dimensions, then preview its first five rows.
print('({}, {})'.format(*data.shape))
data.head(5)

(308273, 205)


Unnamed: 0,cntrID,sup_cntr_num,sup_running_cntr_num,sup_cntr_avg_price,sup_cntr_avg_penalty_share,sup_no_pnl_share,sup_1s_sev,sup_1s_org_sev,sup_sim_price_share,sup_ter,sup_good_cntr_share,sup_fed_cntr_share,sup_sub_cntr_share,sup_mun_cntr_share,org_cntr_num,org_running_cntr_num,org_cntr_avg_price,org_1s_sev,org_1s_sup_sev,org_sim_price_share,org_type,org_ter,org_good_cntr_share,org_fed_cntr_share,org_sub_cntr_share,org_mun_cntr_share,cntr_num_together,okpd_cntr_num,okpd_good_share_min,okpd_good_share_mean,okpd_good_share_max,sign_month,sign_quarter,cntr_okpd_num,plan_cntr_len,day_price,purch_type,cntr_lvl,price_higher_pmp,price_too_low,pmp,price,okpd2_0,okpd2_10,okpd2_11,okpd2_12,okpd2_13,okpd2_14,okpd2_15,okpd2_16,okpd2_17,okpd2_18,okpd2_19,okpd2_20,okpd2_21,okpd2_22,okpd2_23,okpd2_24,okpd2_25,okpd2_26,okpd2_27,okpd2_28,okpd2_29,okpd2_30,okpd2_31,okpd2_32,okpd2_33,okpd2_35,okpd2_36,okpd2_37,okpd2_38,okpd2_39,okpd2_41,okpd2_42,okpd2_43,okpd2_45,okpd2_46,okpd2_47,okpd2_49,okpd2_50,okpd2_51,okpd2_52,okpd2_53,okpd2_55,okpd2_56,okpd2_58,okpd2_59,okpd2_60,okpd2_61,okpd2_62,okpd2_63,okpd2_64,okpd2_65,okpd2_66,okpd2_68,okpd2_69,okpd2_70,okpd2_71,okpd2_72,okpd2_73,okpd2_74,okpd2_75,okpd2_77,okpd2_78,okpd2_79,okpd2_80,okpd2_81,okpd2_82,okpd2_84,okpd2_85,okpd2_86,okpd2_87,okpd2_88,okpd2_89,okpd2_90,okpd2_91,okpd2_93,okpd2_94,okpd2_95,okpd2_96,okpd2_97,okpd2_98,okpd2_99,socs_0,socs_10,socs_11,socs_12,socs_13,socs_14,socs_15,socs_16,socs_17,socs_18,socs_19,socs_20,socs_21,socs_22,socs_23,socs_24,socs_25,socs_26,socs_27,socs_28,socs_29,socs_30,socs_31,socs_32,socs_33,socs_35,socs_36,socs_37,socs_38,socs_39,socs_41,socs_42,socs_43,socs_45,socs_46,socs_47,socs_49,socs_50,socs_51,socs_52,socs_53,socs_55,socs_56,socs_58,socs_59,socs_60,socs_61,socs_62,socs_63,socs_64,socs_65,socs_66,socs_68,socs_69,socs_70,socs_71,socs_72,socs_73,socs_74,socs_75,socs_77,socs_78,socs_79,socs_80,socs_81,socs_82,socs_84,socs_85,socs_86,socs_87,socs_88,socs_89,socs_90,socs_91,socs_93,socs_94,socs_95,so
cs_96,socs_97,socs_98,socs_99,cntr_result
0,41348717,11,7,38392,0.0,1,0,0,0.182,,1.0,0.818,0.182,0.0,117,13,914575,0,0,0.12,18,52598.0,1.0,1.0,0.0,0.0,1,388994,0.627,0.634,0.641,7,3,9,137,445.964,4,1,0,0,0,61097,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091,0.0,0.0,0.0,0.0,2.636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,40390205,2,3,13290,0.0,1,0,0,0.5,46302.0,1.0,0.0,1.0,0.0,221,208,661040,0,0,0.059,16,46302.0,0.995,0.0,1.0,0.0,1,69131,0.614,0.614,0.614,3,1,1,32,140.625,6,2,0,0,0,4500,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,39521901,6535,3301,271847,0.014,0,0,0,0.064,,0.974,0.02,0.98,0.0,584,492,157481,0,0,0.113,16,71678.0,0.993,0.0,1.0,0.0,147,1093576,0.59,0.59,0.59,2,1,1,417,96.547,3,2,0,0,0,40260,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,44319014,2,0,55380,0.0,1,0,0,0.5,52571.0,1.0,0.5,0.0,0.5,20,17,153836,0,0,0.25,16,52596.0,1.0,1.0,0.0,0.0,1,270539,0.635,0.635,0.635,3,1,1,31,1471.452,3,1,0,0,0,45615,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,39052240,21,4,398882,0.0,1,0,0,0.286,46302.0,0.619,0.381,0.619,0.0,222,96,455357,0,0,0.09,16,52601.0,0.423,1.0,0.0,0.0,4,1093576,0.59,0.59,0.59,1,1,14,353,1899.394,3,1,0,0,0,670486,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.048,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


### Step 5. Preprocessing based on EDA
- Preprocessing NaN values
- Dropping useless variables
- Dropping correlating variables
- Applying logarithmic transformation for quantitative variables

In [12]:
%%time
data = pd.read_csv('../data/russia-16-19-v2.4.csv')
num_var01, num_var, cat_bin_var, cat_var = analysis.group_variables(data)
data = preprocessing.preprocess_data_after_eda(data, num_var01, num_var, cat_bin_var, cat_var)
data.to_csv('../data/russia-16-19-v2.4.csv', index=False)

CPU times: user 1min 2s, sys: 5.33 s, total: 1min 7s
Wall time: 1min 9s


In [13]:
# Report the frame's dimensions, then preview its first five rows.
print('({}, {})'.format(*data.shape))
data.head(5)

(308273, 187)


Unnamed: 0,sup_running_cntr_num,sup_cntr_avg_price,sup_cntr_avg_penalty_share,sup_no_pnl_share,sup_sim_price_share,sup_good_cntr_share,sup_fed_cntr_share,sup_sub_cntr_share,sup_mun_cntr_share,org_cntr_num,org_cntr_avg_price,org_sim_price_share,org_type,org_ter,org_good_cntr_share,cntr_num_together,okpd_cntr_num,okpd_good_share_min,sign_month,cntr_okpd_num,plan_cntr_len,day_price,purch_type,cntr_lvl,okpd2_0,okpd2_10,okpd2_11,okpd2_12,okpd2_13,okpd2_14,okpd2_15,okpd2_16,okpd2_17,okpd2_18,okpd2_19,okpd2_20,okpd2_21,okpd2_22,okpd2_23,okpd2_24,okpd2_25,okpd2_26,okpd2_27,okpd2_28,okpd2_29,okpd2_30,okpd2_31,okpd2_32,okpd2_33,okpd2_35,okpd2_36,okpd2_37,okpd2_38,okpd2_39,okpd2_41,okpd2_42,okpd2_43,okpd2_45,okpd2_46,okpd2_47,okpd2_49,okpd2_50,okpd2_51,okpd2_52,okpd2_53,okpd2_55,okpd2_56,okpd2_58,okpd2_59,okpd2_60,okpd2_61,okpd2_62,okpd2_63,okpd2_64,okpd2_65,okpd2_66,okpd2_68,okpd2_69,okpd2_70,okpd2_71,okpd2_72,okpd2_73,okpd2_74,okpd2_75,okpd2_77,okpd2_78,okpd2_79,okpd2_80,okpd2_81,okpd2_82,okpd2_84,okpd2_85,okpd2_86,okpd2_87,okpd2_88,okpd2_89,okpd2_90,okpd2_91,okpd2_93,okpd2_94,okpd2_95,okpd2_96,okpd2_97,okpd2_98,okpd2_99,socs_0,socs_10,socs_11,socs_12,socs_13,socs_14,socs_15,socs_16,socs_17,socs_18,socs_19,socs_20,socs_21,socs_22,socs_23,socs_24,socs_25,socs_26,socs_27,socs_28,socs_29,socs_30,socs_31,socs_32,socs_33,socs_35,socs_36,socs_37,socs_38,socs_39,socs_41,socs_42,socs_43,socs_45,socs_46,socs_47,socs_49,socs_50,socs_51,socs_52,socs_53,socs_55,socs_56,socs_58,socs_59,socs_60,socs_61,socs_62,socs_63,socs_64,socs_65,socs_66,socs_68,socs_69,socs_70,socs_71,socs_72,socs_73,socs_74,socs_75,socs_77,socs_78,socs_79,socs_80,socs_81,socs_82,socs_84,socs_85,socs_86,socs_87,socs_88,socs_89,socs_90,socs_91,socs_93,socs_94,socs_95,socs_96,socs_97,socs_98,socs_99,cntr_result
0,1.946,10.556,0.0,1,0.182,1.0,0.818,0.182,0.0,4.762,13.726,0.12,18,52598,1.0,0.0,12.871,0.627,7,2.197,4.92,6.1,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091,0.0,0.0,0.0,0.0,2.636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1.099,9.495,0.0,1,0.5,1.0,0.0,1.0,0.0,5.398,13.402,0.059,16,46302,0.995,0.0,11.144,0.614,3,0.0,3.466,4.946,6,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,8.102,12.513,0.014,0,0.064,0.974,0.02,0.98,0.0,6.37,11.967,0.113,16,71678,0.993,4.99,13.905,0.59,2,0.0,6.033,4.57,3,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,10.922,0.0,1,0.5,1.0,0.5,0.0,0.5,2.996,11.944,0.25,16,52596,1.0,0.0,12.508,0.635,3,0.0,3.434,7.294,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,1.386,12.896,0.0,1,0.286,0.619,0.381,0.619,0.0,5.403,13.029,0.09,16,52601,0.423,1.386,13.905,0.59,1,2.639,5.866,7.549,3,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.048,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
