In [1]:
import numpy as np
import pandas as pd
import re
from transliterate import translit
import lightgbm as lgb
from sklearn.model_selection import train_test_split

pd.options.display.float_format = '{:.1f}'.format
pd.set_option('display.max_columns', None)

### Читаем таблицу за 2019 год, фильтруем столбцы, которые мы можем взять из налоговой 
Количество столбцов можно расширить

In [2]:
# 2019 data: the CSV's first column ('Unnamed: 0') holds the row index.
df_2019 = pd.read_csv('agents2019.csv', index_col='Unnamed: 0')


def _latin_identifier(header):
    """Transliterate a Russian header to Latin and keep only [A-Za-z0-9_]."""
    latin = translit(header, 'ru', reversed=True)
    # The character class also strips dots, spaces and other punctuation.
    return re.sub('[^A-Za-z0-9_]+', '', latin)


# Normalised source header -> short name used by the rest of the notebook.
# 'pdz' is the target; 'a', 'b', 'c' are the model features.
column_aliases = {
    'SredPDZza2019goddnej': 'pdz',
    '2018UstavnyjkapitalRUB': 'a',
    '2018PribylubytokdonalogooblozhenijaRUB': 'b',
    '2018PribylubytokotprodazhiRUB': 'c',
}

df_2019 = (
    df_2019
    .rename(columns=_latin_identifier)
    [list(column_aliases)]
    .rename(columns=column_aliases)
)

display(df_2019)

Unnamed: 0_level_0,pdz,a,b,c
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.0,629400000.0,746182802.5,871619108.3
1,0.0,1345859.9,44800636.9,51357324.8
2,5.5,19644586.0,85529936.3,-92989808.9
3,0.0,20382165.6,6840634394.9,7869977070.1
4,2.0,595541.4,45936061783.4,53075239490.4
...,...,...,...,...
526,0.0,6369.4,6176433.1,6782165.6
527,0.0,240643312.1,80433758.0,64995541.4
528,0.0,0.0,0.0,0.0
529,0.0,7006.4,172095541.4,205289808.9


### Обучаем lgbm для предсказания скора поставщика. В качестве скора сейчас берем средний ПДЗ

In [3]:
# Features are every column except the target 'pdz' (average PDZ for 2019 in
# days, per the source column name).
X = df_2019.drop(columns=['pdz'])
y = df_2019['pdz']

# A fixed random_state keeps the 75/25 split identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

lgb_train = lgb.Dataset(X_train, y_train)
# NOTE(review): the hold-out split doubles as the early-stopping validation
# set, so the metrics reported on it are optimistic.
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# Training configuration.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # A list, not a set: set iteration order for strings changes with hash
    # randomisation, so the metric order (and potentially which metric trips
    # early stopping first) could differ between kernel sessions.
    'metric': ['l1', 'l2'],
    'num_leaves': 10,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'verbose': 0
}

# Train up to 2000 rounds, stopping once the validation metrics have not
# improved for 20 consecutive rounds.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=2000,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=20)])


You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[18]	valid_0's l1: 12.2963	valid_0's l2: 2831.37


### Сохраняем LGBM модель

In [4]:
# Persist the trained booster to disk; rank_companies reloads the model from
# this same path, so the two literals must stay in sync.
gbm.save_model('ranker.1txt')

<lightgbm.basic.Booster at 0x1c361fcd250>

### Используем модель для ранжирования

In [5]:
def rank_companies(pd):
    """Score companies with the saved LightGBM model and sort them by score.

    Parameters
    ----------
    pd : pandas.DataFrame
        Table containing the feature columns 'a', 'b' and 'c', which are
        "Ustavnyj kapital", "Pribyl ubytok do nalogooblozhenija" and
        "Pribyl ubytok ot prodazhi". Any other columns (including a 'pdz'
        column left by an earlier call) are ignored when predicting.
        NOTE: the parameter name shadows the usual pandas alias inside this
        function; it is kept so existing keyword callers keep working.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, modified in place: a 'pdz'
        column holds the predicted score and rows are sorted by it in
        ascending order.

    Raises
    ------
    KeyError
        If any of the feature columns is missing from the table.
    """
    # Local import keeps the function usable when lifted out of the notebook.
    import lightgbm as lgb

    # Select the features by name, in the order the model was trained on, so
    # extra columns or a different column order cannot shift the inputs.
    feature_columns = ['a', 'b', 'c']
    features = pd[feature_columns]

    gbm = lgb.Booster(model_file='ranker.1txt')
    pd['pdz'] = gbm.predict(features, num_iteration=gbm.best_iteration)
    pd.sort_values('pdz', inplace=True)

    return pd


# rank_companies adds a 'pdz' column to its argument and sorts it in place,
# so pass a copy to leave X_test untouched.
test_table = X_test.copy()

ranked_table = rank_companies(test_table)
print(ranked_table)

                      a               b             c  pdz
Unnamed: 0                                                
89              12738.9     123889808.9   275915923.6  3.7
502              6369.4     143459872.6   217403184.7  3.7
72              50955.4      30266242.0    90570700.6  4.1
376        1687910828.0      36182802.5   233578980.9  4.1
15            3143312.1      42890445.9   129052229.3  4.2
...                 ...             ...           ...  ...
167        1913510828.0    -167102547.8    -2213375.8 15.6
78          185414012.7    -222906369.4   105183439.5 15.6
361          98254140.1    -711054140.1  -545214012.7 15.6
360          63694267.5     -41189808.9   -79257324.8 15.6
195         229988535.0 -131630508280.3 76496135668.8 16.4

[133 rows x 4 columns]
