In [1]:
import numpy as np
import pandas as pd
import re
from transliterate import translit
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Notebook-wide display settings: render floats with one decimal place and
# never elide columns when a wide frame is shown.
pd.set_option('display.float_format', '{:.1f}'.format)
pd.options.display.max_columns = None

### Читаем таблицу за 2019 год и оставляем только те столбцы, которые можно получить из данных налоговой
При необходимости набор столбцов можно расширить.

In [5]:
# 2019 data: the CSV column 'Unnamed: 0' becomes the row index.
agents_csv = 'agents2019.csv'
df_2019 = pd.read_csv(agents_csv, index_col='Unnamed: 0')

# List the available headers before choosing which ones to keep.
print(df_2019.columns)

Index(['Наименование ДП', 'Макс. ПДЗ за 2019 год, дней',
       'Сред. ПДЗ за 2019 год, дней',
       'Кол-во просрочек свыше 5-ти дней за 2019 год, шт.',
       'Общая сумма ПДЗ свыше 5-ти дней за 2019 год, руб.',
       'Кол-во раз ПДЗ за 2019 год, шт.', '2016, Нематериальные активы, RUB',
       '2017, Нематериальные активы, RUB', '2018, Нематериальные активы, RUB',
       '2016, Основные средства , RUB', '2017, Основные средства , RUB',
       '2018, Основные средства , RUB', '2016, Внеоборотные активы, RUB',
       '2017, Внеоборотные активы, RUB', '2018, Внеоборотные активы, RUB',
       '2016, Дебиторская задолженность, RUB',
       '2017, Дебиторская задолженность, RUB',
       '2018, Дебиторская задолженность, RUB', '2016, Оборотные активы, RUB',
       '2017, Оборотные активы, RUB', '2018, Оборотные активы, RUB',
       '2016, Уставный капитал , RUB', '2017, Уставный капитал , RUB',
       '2018, Уставный капитал , RUB', '2016, Капитал и резервы, RUB',
       '2017, Капитал и

In [6]:


def _ascii_identifier(header):
    """Transliterate a Russian header to Latin and keep only [A-Za-z0-9_].

    The regex already removes dots, spaces and commas, so a separate
    pass that strips '.' is not needed.
    """
    return re.sub('[^A-Za-z0-9_]+', '', translit(header, 'ru', reversed=True))


# Transliterated source header -> short name used by the models.
# NOTE: 'income' / 'outcome' are historical names; the underlying columns are
# the 2018 accounts receivable (Debitorskaja zadolzhennost) and accounts
# payable (Kreditorskaja zadolzhennost), not revenue or expenses.
selected_columns = {
    'SredPDZza2019goddnej': 'ml_score',
    '2018UstavnyjkapitalRUB': 'capital',
    '2018DebitorskajazadolzhennostRUB': 'income',
    '2018KreditorskajazadolzhennostRUB': 'outcome',
}

# One non-inplace chain: normalise every header, keep the four columns of
# interest (in the order listed above) and give them their short names.
# Avoiding inplace=True on the column subset also avoids pandas'
# chained-assignment warnings.
df_2019 = (
    df_2019
    .rename(columns=_ascii_identifier)
    .loc[:, list(selected_columns)]
    .rename(columns=selected_columns)
)


display(df_2019)

Unnamed: 0_level_0,ml_score,capital,income,outcome
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.0,629400000.0,3475461146.5,5476906369.4
1,0.0,1345859.9,156347770.7,66164968.2
2,5.5,19644586.0,228049681.5,123771337.6
3,0.0,20382165.6,5362189808.9,2522167515.9
4,2.0,595541.4,11246701273.9,3855060509.6
...,...,...,...,...
526,0.0,6369.4,25662420.4,49657961.8
527,0.0,240643312.1,82774522.3,137135031.8
528,0.0,0.0,0.0,0.0
529,0.0,7006.4,79496178.3,529396815.3


### Обучаем LightGBM для предсказания скора поставщика. В качестве скора сейчас берём среднюю ПДЗ за 2019 год (в днях)

In [8]:
# Features are every column except the target; 'ml_score' is the target.
X = df_2019.drop(columns=['ml_score'])
y = df_2019['ml_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

lgb_train = lgb.Dataset(X_train, y_train)
# NOTE: the hold-out split is also the early-stopping validation set below,
# so the metrics reported on it are optimistic.
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# LightGBM training configuration.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # A list (not a set) keeps the metric order identical across kernels;
    # set iteration order for strings depends on per-process hashing.
    'metric': ['l2', 'l1'],
    'num_leaves': 10,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'verbose': 0
}

# valid_sets is documented as a list of Datasets, so pass it as one.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=2000,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=20)])


You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	valid_0's l1: 12.4013	valid_0's l2: 2917.25


### Сохраняем LGBM модель

In [9]:
# Persist the trained booster; rank_companies reads it back from this path.
model_path = 'ranker.1txt'
gbm.save_model(model_path)

<lightgbm.basic.Booster at 0x294a6434a90>

### Используем модель для ранжирования

In [11]:
def rank_companies(pd):
    """Score companies with the saved LightGBM model and sort them by score.

    Parameters
    ----------
    pd : pandas.DataFrame
        Table containing the columns the model was trained on (in this
        notebook: 'capital', 'income', 'outcome'). Extra columns, such as an
        'ml_score' left over from an earlier call, are kept in the result but
        are not fed to the model. The parameter name is kept for
        compatibility with existing callers; it shadows the usual pandas
        alias only inside this function.

    Returns
    -------
    pandas.DataFrame
        A new frame: the input plus an 'ml_score' column with the
        predictions, sorted by 'ml_score' in ascending order. The caller's
        frame is left unmodified.
    """
    # Local import keeps the function self-contained when copied elsewhere.
    import lightgbm as lgb

    gbm = lgb.Booster(model_file='ranker.1txt')

    # LightGBM matches DataFrame columns by position, not by name, so pick
    # the training features explicitly (right columns, right order). If the
    # stored names are not all present, fall back to passing the frame as-is.
    feature_names = gbm.feature_name()
    if set(feature_names).issubset(pd.columns):
        features = pd[feature_names]
    else:
        features = pd

    scores = gbm.predict(features, num_iteration=gbm.best_iteration)

    # assign() returns a copy, so the input frame is not mutated.
    return pd.assign(ml_score=scores).sort_values('ml_score')


# Rank a copy of the hold-out features so X_test itself stays unchanged.
test_table = X_test.copy()

# The returned frame carries the predicted 'ml_score' and is sorted by it.
ranked_table = rank_companies(test_table)
print(ranked_table)

              capital        income       outcome  ml_score
Unnamed: 0                                                 
360        63694267.5    39092356.7   125649044.6       6.4
182        27354140.1    72092993.6    95193630.6       6.4
155         2165605.1    20031847.1   154303184.7       6.4
489        31847133.8    21989172.0   143550955.4       6.4
444         3375796.2    80597452.2   146349044.6       6.4
...               ...           ...           ...       ...
30          2325477.7 22601311465.0 10992786624.2       7.5
446            6369.4   799566242.0  1035837579.6       7.5
347          191082.8  1478921656.1  3255275159.2       7.5
89            12738.9  1525047133.8   613883439.5       7.5
10          5335668.8 24092142675.2 25141844586.0       7.5

[133 rows x 4 columns]


## Raspberry Pi ranker version

Используем сильно упрощённую модель — линейную регрессию — для развёртывания на Raspberry Pi

In [13]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import pickle

# Baseline model: ordinary least squares on the same training split.
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

# NOTE: despite its name, rmse_test is computed on the TRAINING data (see the
# message printed below); the name is kept so later cells that read it still work.
y_pred = regr.predict(X_train)
rmse_test = mean_squared_error(y_train, y_pred) ** 0.5
print(f'\nThe RMSE of train prediction is: {rmse_test}')


filename = 'ranker.sav'
# The context manager closes (and therefore flushes) the file deterministically;
# a bare open() passed to pickle.dump leaves the handle to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(regr, model_file)


The RMSE of train prediction is: 17.873572318327966


Используем модель для ранжирования на Raspberry Pi

In [14]:
def rank_companies_pi(pd):
    """Score companies with the pickled linear model and sort them by score.

    Parameters
    ----------
    pd : pandas.DataFrame
        Table containing the columns the model was fitted on (in this
        notebook: 'capital', 'income', 'outcome'). Extra columns, such as a
        'pdz' left over from an earlier call, are kept in the result but are
        not fed to the model. The parameter name is kept for compatibility
        with existing callers; it shadows the usual pandas alias only inside
        this function.

    Returns
    -------
    pandas.DataFrame
        A new frame: the input plus a 'pdz' column with the predictions,
        sorted by 'pdz' in ascending order. The caller's frame is left
        unmodified.
    """
    import pickle

    # SECURITY: pickle.load can execute arbitrary code from the file. Only
    # load a 'ranker.sav' that you produced yourself.
    # Unpickling imports the estimator's class on its own, so no explicit
    # sklearn import is needed here (the package must still be installed).
    with open('ranker.sav', 'rb') as model_file:
        regr = pickle.load(model_file)

    # Estimators fitted on a DataFrame may expose the training column names;
    # when they are available and present, select exactly those columns in
    # that order. Otherwise pass the frame through unchanged.
    feature_names = getattr(regr, 'feature_names_in_', None)
    if feature_names is not None and set(feature_names).issubset(pd.columns):
        features = pd[list(feature_names)]
    else:
        features = pd

    y_pred = regr.predict(features)

    # assign() returns a copy, so the input frame is not mutated.
    return pd.assign(pdz=y_pred).sort_values('pdz')


# Rank a copy of the hold-out features so X_test itself stays unchanged.
test_table = X_test.copy()

# The returned frame carries the predicted 'pdz' and is sorted by it.
ranked_table = rank_companies_pi(test_table)
print(ranked_table)

               capital        income       outcome  pdz
Unnamed: 0                                             
79         979530573.2  4138320382.2   954787898.1  4.9
272        636942675.2    27233121.0    23952866.2  5.0
421             5095.5   137837579.6     8417834.4  5.0
176         44598726.1    96370063.7     7832484.1  5.0
149             6369.4   245917834.4    53379617.8  5.0
...                ...           ...           ...  ...
210          6369426.8  9301021019.1 15657389808.9 22.1
10           5335668.8 24092142675.2 25141844586.0 29.7
84         100792993.6 39324887261.1 32781867515.9 34.8
369         25165605.1 32246499363.1 35160301910.8 40.0
195        229988535.0 34087083439.5 42629309554.1 48.8

[133 rows x 4 columns]
