In [50]:
import pandas as pd
import numpy as np
import networkx as nx
import sqlite3
import dgl
import torch as th

In [3]:
con = sqlite3.connect("data/okved_20230329.sqlite3")
cur = con.cursor()

def read(sql):
    return pd.read_sql(sql, con)

sql = '''
    SELECT * FROM legal_entity
'''

comps = read(sql)

sql = '''
    SELECT * FROM supplier
'''

sups = read(sql)

sql = '''
    SELECT * FROM procurement
'''

proc = read(sql)

sql = '''
    SELECT * FROM product
'''

prod = read(sql)

In [4]:
comps = comps[~comps.address.isna()]
comps['emb'] = np.load('data/emb_np.npy', allow_pickle=True)
comps_clean = comps[~comps['emb'].isna()]

In [None]:
customer_name = []
executor_name = []
customer_emb = []
executor_emb = []
for i in range(len(proc)):
    inns = proc.iloc[i, :][['customer_inn', 'executor_inn']]
    c_inn = inns[0]
    e_inn = inns[1]
    c_info = comps_clean[comps_clean['inn'] == c_inn]
    e_info = comps_clean[comps_clean['inn'] == e_inn]
    if len(c_info) != 0 and\
       len(e_info) != 0:
        customer_name.append(c_info.name.values[0])
        customer_emb.append(c_info.emb.values[0])
        executor_name.append(e_info.name.values[0])
        executor_emb.append(e_info.emb.values[0])
        

In [6]:
from dgl.data.utils import load_graphs
glist, label_dict = load_graphs("data/graph.bin")
g = glist[0]

In [8]:
g.nodes()

tensor([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
         28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
         42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
         56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
         70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
         84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
         98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
        126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
        140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
        154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
        168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 1

In [9]:
import pickle
filename = 'data/encoder.sav'
le = pickle.load(open(filename, 'rb')) 

In [12]:
names_in_train_graph = le.inverse_transform(g.nodes())

In [16]:
comps_for_pred = comps_clean[~comps_clean['name'].isin(names_in_train_graph)].sample(20)

In [30]:
label_df = pd.DataFrame([np.arange(1000, 1020), comps_for_pred['name']]).T.rename(columns = {0: 'id', 1: 'name'})
label_df

Unnamed: 0,id,name
0,1000,"ООО ""БЕЛДОРСТРОЙ"""
1,1001,"МУНИЦИПАЛЬНОЕ БЮДЖЕТНОЕ\nУЧРЕЖДЕНИЕ ""БЛАГОУСТР..."
2,1002,"ООО ""СК РЕМСТРОЙСЕРВИС"""
3,1003,"ООО ""ОЗТП"""
4,1004,"ООО ""ВМПАВТО"""
5,1005,"ООО ""ЛЕНКОМСТРОЙ"""
6,1006,"ООО ""ГС ГРУПП"""
7,1007,"ООО ""ГД РАША"""
8,1008,"ООО ""ТТЦ ""ФОЛИУМ"""
9,1009,"ООО ""ПК ""СТЕКЛОКОМПОЗИТ"""


In [45]:
label_df.to_csv('data/label_df.csv', index = False)

In [33]:
ids = label_df['id'].values

In [34]:
ids

array([1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010,
       1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019], dtype=object)

In [37]:
embs_c = comps_for_pred['emb'].values

In [42]:
posts = g.edges()[1].unique()

In [43]:
len(posts)

784

In [44]:
posts

tensor([  0,   1,   2,   3,   4,   5,   6,   7,  11,  12,  14,  15,  16,  17,
         18,  19,  21,  22,  24,  26,  27,  28,  29,  31,  32,  33,  34,  35,
         36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  50,
         51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  76,  77,  78,  79,
         80,  81,  82,  83,  84,  85,  87,  88,  89,  90,  91,  92,  93,  94,
         95,  96,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 109, 110,
        111, 112, 113, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
        126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
        140, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154,
        155, 156, 157, 158, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
        170, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184,
        185, 186, 187, 188, 189, 190, 191, 193, 194, 195, 196, 1

In [47]:
g.ndata['emb'][posts]

tensor([[7.2311e-03, 2.2477e-03, 1.2390e-02,  ..., 8.7761e-06, 5.1598e-07,
         5.9595e-04],
        [3.3183e-03, 2.1806e-03, 1.2581e-02,  ..., 3.1027e-06, 4.9520e-07,
         7.5370e-07],
        [3.8771e-03, 3.1313e-03, 1.2746e-02,  ..., 6.0926e-06, 1.0632e-06,
         2.3831e-07],
        ...,
        [1.8487e-03, 4.7437e-03, 1.6954e-03,  ..., 1.3278e-08, 5.6658e-09,
         1.2766e-08],
        [2.2966e-03, 2.7134e-03, 4.0731e-03,  ..., 2.5404e-04, 4.7168e-07,
         7.5876e-06],
        [4.8825e-03, 3.9515e-03, 4.3272e-03,  ..., 2.1078e-05, 9.8021e-06,
         4.3235e-05]])

In [93]:
# comps 0-19
# post 20 - len(posts) + 20
comps_id = np.arange(0, 20)
posts_id = np.arange(20, len(posts) + 20)

In [94]:
n_comp = []
for i in comps_id:
    n_comp += [i] * len(posts_id)
n_comp = th.tensor(n_comp)

In [97]:
n_post = th.tensor(np.hstack([posts_id] * len(comps_id)))

In [98]:
t_g = dgl.graph((n_comp, n_post.int()))

In [119]:
embs_data = th.vstack([th.tensor(np.stack(comps_for_pred['emb'].values)), g.ndata['emb'][posts]])
labels_ = th.hstack([th.tensor(ids.astype(int)), posts])

In [121]:
t_g.ndata['emb'] = embs_data
t_g.ndata['label'] = labels_

In [123]:
from dgl.data.utils import save_graphs
save_graphs("data/graph_pred.bin", t_g)

In [124]:
t_g.ndata['label']

tensor([1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011,
        1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019,    0,    1,    2,    3,
           4,    5,    6,    7,   11,   12,   14,   15,   16,   17,   18,   19,
          21,   22,   24,   26,   27,   28,   29,   31,   32,   33,   34,   35,
          36,   37,   38,   39,   40,   41,   42,   43,   44,   45,   46,   47,
          48,   50,   51,   52,   53,   54,   55,   56,   57,   58,   59,   60,
          61,   62,   63,   64,   65,   66,   67,   68,   69,   70,   71,   72,
          73,   74,   76,   77,   78,   79,   80,   81,   82,   83,   84,   85,
          87,   88,   89,   90,   91,   92,   93,   94,   95,   96,   98,   99,
         100,  101,  102,  103,  104,  105,  106,  107,  109,  110,  111,  112,
         113,  115,  116,  117,  118,  119,  120,  121,  122,  123,  124,  125,
         126,  127,  128,  129,  130,  131,  132,  133,  134,  135,  136,  137,
         138,  139,  140,  142,  143,  1

In [126]:
comps_for_pred.to_csv('data/comps_for_pred.csv')