In [150]:
import itertools
import os

os.environ["DGLBACKEND"] = "pytorch"

import dgl
import dgl.data
import numpy as np
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.nn.functional as F

In [151]:
from dgl.data.utils import load_graphs
glist, label_dict = load_graphs("data/graph.bin")
g = glist[0]

In [152]:
u, v = g.edges()

eids = np.arange(g.num_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids) * 0.1)
train_size = g.num_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())), shape=(g.num_nodes(), g.num_nodes()))
adj_neg = 1 - adj.todense() - np.eye(g.num_nodes())
neg_u, neg_v = np.where(adj_neg != 0)

neg_eids = np.random.choice(len(neg_u), g.num_edges())
test_neg_u, test_neg_v = (
    neg_u[neg_eids[:test_size]],
    neg_v[neg_eids[:test_size]],
)
train_neg_u, train_neg_v = (
    neg_u[neg_eids[test_size:]],
    neg_v[neg_eids[test_size:]],
)

In [153]:
train_g = dgl.remove_edges(g, eids[:test_size])

In [154]:
from dgl.nn import SAGEConv
from dgl.nn import GraphConv
from dgl.nn import TAGConv

In [155]:
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, "mean")
        self.conv2 = SAGEConv(h_feats, h_feats, "mean")

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [156]:
class GraphCon(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphCon, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats, allow_zero_in_degree=True)
        self.conv2 = GraphConv(h_feats, h_feats, allow_zero_in_degree=True)

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [157]:
class TAGCon(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(TAGCon, self).__init__()
        self.conv1 = TAGConv(in_feats, h_feats)
        self.conv2 = TAGConv(h_feats, h_feats)

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [158]:
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.num_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.num_nodes())

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.num_nodes())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.num_nodes())

In [159]:
import dgl.function as fn


class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata["h"] = h
            g.apply_edges(fn.u_dot_v("h", "h", "score"))

            return g.edata["score"][:, 0]

In [160]:
model = GraphSAGE(train_g.ndata["emb"].shape[1], 16)
pred = DotPredictor()


def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]
    )
    return F.binary_cross_entropy_with_logits(scores, labels)


def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]
    ).numpy()
    return roc_auc_score(labels, scores)

In [161]:

optimizer = torch.optim.Adam(
    itertools.chain(model.parameters(), pred.parameters()), lr=0.01
)


all_logits = []
for e in range(100):

    h = model(train_g, train_g.ndata["emb"])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print("In epoch {}, loss: {}".format(e, loss))

from sklearn.metrics import roc_auc_score

with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print("AUC", compute_auc(pos_score, neg_score))



In epoch 0, loss: 0.7025274038314819
In epoch 5, loss: 0.6891184449195862
In epoch 10, loss: 0.6711958050727844
In epoch 15, loss: 0.6347291469573975
In epoch 20, loss: 0.6017602682113647
In epoch 25, loss: 0.5935764908790588
In epoch 30, loss: 0.5786384344100952
In epoch 35, loss: 0.562750518321991
In epoch 40, loss: 0.5508080124855042
In epoch 45, loss: 0.5407263040542603
In epoch 50, loss: 0.5311369895935059
In epoch 55, loss: 0.5224345326423645
In epoch 60, loss: 0.5143236517906189
In epoch 65, loss: 0.5061624646186829
In epoch 70, loss: 0.4979434013366699
In epoch 75, loss: 0.4902074337005615
In epoch 80, loss: 0.48249560594558716
In epoch 85, loss: 0.47404178977012634
In epoch 90, loss: 0.46579375863075256
In epoch 95, loss: 0.45714643597602844
AUC 0.7216037155931971


In [162]:
model.eval()

GraphSAGE(
  (conv1): SAGEConv(
    (feat_drop): Dropout(p=0.0, inplace=False)
    (fc_neigh): Linear(in_features=2000, out_features=16, bias=False)
    (fc_self): Linear(in_features=2000, out_features=16, bias=True)
  )
  (conv2): SAGEConv(
    (feat_drop): Dropout(p=0.0, inplace=False)
    (fc_neigh): Linear(in_features=16, out_features=16, bias=False)
    (fc_self): Linear(in_features=16, out_features=16, bias=True)
  )
)

In [163]:
model2 = GraphCon(train_g.ndata["emb"].shape[1], 16)

optimizer = torch.optim.Adam(
    itertools.chain(model2.parameters(), pred.parameters()), lr=0.01
)


all_logits = []
for e in range(100):

    h = model2(train_g, train_g.ndata["emb"])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)


    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print("In epoch {}, loss: {}".format(e, loss))


from sklearn.metrics import roc_auc_score

with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print("AUC", compute_auc(pos_score, neg_score))



In epoch 0, loss: 0.6931450963020325
In epoch 5, loss: 0.6886709332466125
In epoch 10, loss: 0.6843879222869873
In epoch 15, loss: 0.6799452304840088
In epoch 20, loss: 0.6751540899276733
In epoch 25, loss: 0.6694871187210083
In epoch 30, loss: 0.6640515923500061
In epoch 35, loss: 0.6610668301582336
In epoch 40, loss: 0.6585425138473511
In epoch 45, loss: 0.6563031077384949
In epoch 50, loss: 0.654927134513855
In epoch 55, loss: 0.6536017656326294
In epoch 60, loss: 0.6524854302406311
In epoch 65, loss: 0.6513131260871887
In epoch 70, loss: 0.6500329375267029
In epoch 75, loss: 0.6486531496047974
In epoch 80, loss: 0.6471423506736755
In epoch 85, loss: 0.6453968286514282
In epoch 90, loss: 0.6433255076408386
In epoch 95, loss: 0.6407319903373718
AUC 0.5450788880540947


In [164]:
model2.eval()

GraphCon(
  (conv1): GraphConv(in=2000, out=16, normalization=both, activation=None)
  (conv2): GraphConv(in=16, out=16, normalization=both, activation=None)
)

In [165]:
model3 = TAGCon(train_g.ndata["emb"].shape[1], 16)

optimizer = torch.optim.Adam(
    itertools.chain(model3.parameters(), pred.parameters()), lr=0.01
)


all_logits = []
for e in range(100):
    
    h = model3(train_g, train_g.ndata["emb"])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)


    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print("In epoch {}, loss: {}".format(e, loss))


from sklearn.metrics import roc_auc_score

with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print("AUC", compute_auc(pos_score, neg_score))


In epoch 0, loss: 0.6966722011566162
In epoch 5, loss: 0.6638185381889343
In epoch 10, loss: 0.6165475845336914
In epoch 15, loss: 0.5894635915756226
In epoch 20, loss: 0.5572670102119446
In epoch 25, loss: 0.545167863368988
In epoch 30, loss: 0.5274288058280945
In epoch 35, loss: 0.5133000612258911
In epoch 40, loss: 0.4966680109500885
In epoch 45, loss: 0.4836604595184326
In epoch 50, loss: 0.4713948965072632
In epoch 55, loss: 0.4578739404678345
In epoch 60, loss: 0.44528281688690186
In epoch 65, loss: 0.4332646429538727
In epoch 70, loss: 0.4220663607120514
In epoch 75, loss: 0.41084206104278564
In epoch 80, loss: 0.3990665078163147
In epoch 85, loss: 0.3871052861213684
In epoch 90, loss: 0.3750072717666626
In epoch 95, loss: 0.36377424001693726
AUC 0.705143091318899


In [166]:
model3.eval()

TAGCon(
  (conv1): TAGConv(
    (lin): Linear(in_features=6000, out_features=16, bias=True)
  )
  (conv2): TAGConv(
    (lin): Linear(in_features=48, out_features=16, bias=True)
  )
)

In [167]:
glist, label_dict = load_graphs("data/graph_pred.bin")
g_pred = glist[0]

In [168]:
g_pred

Graph(num_nodes=804, num_edges=15680,
      ndata_schemes={'emb': Scheme(shape=(2000,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={})

In [186]:
h = model(g_pred, g_pred.ndata["emb"])
p = pred(g_pred, h)

In [187]:
p.shape

torch.Size([15680])

In [188]:
probs = torch.sigmoid(p)

In [189]:
probs = probs.reshape(20, -1)

In [190]:
edge_ids = probs.argmax(dim=1)

In [191]:
comp_prob_post = []

In [192]:
emb_post = []

In [193]:
for i in range(len(edge_ids)):
    comp_label = g_pred.ndata['label'][i].item()
    post_id = g_pred.edges()[1][edge_ids[i]].item()
    post_label = g_pred.ndata['label'][post_id].item()
    porb = probs[i][edge_ids[i]].item()
    cpp = [comp_label, porb, post_label]
    comp_prob_post.append(cpp)
    emb_post.append(g.ndata['emb'][post_id].numpy())
comp_prob_post = np.array(comp_prob_post)

In [194]:
comp_prob_post

array([[1.00000000e+03, 9.64171469e-01, 4.91000000e+02],
       [1.00100000e+03, 9.64041710e-01, 6.01000000e+02],
       [1.00200000e+03, 8.67682517e-01, 4.91000000e+02],
       [1.00300000e+03, 9.62050617e-01, 1.95000000e+02],
       [1.00400000e+03, 7.27043450e-01, 1.46000000e+02],
       [1.00500000e+03, 7.11575150e-01, 4.50000000e+02],
       [1.00600000e+03, 9.51568127e-01, 4.91000000e+02],
       [1.00700000e+03, 8.44812691e-01, 4.91000000e+02],
       [1.00800000e+03, 7.78629184e-01, 6.17000000e+02],
       [1.00900000e+03, 9.81292665e-01, 6.01000000e+02],
       [1.01000000e+03, 8.25508296e-01, 4.91000000e+02],
       [1.01100000e+03, 9.45416510e-01, 1.95000000e+02],
       [1.01200000e+03, 9.63764906e-01, 4.91000000e+02],
       [1.01300000e+03, 9.09833670e-01, 4.91000000e+02],
       [1.01400000e+03, 9.46924806e-01, 4.91000000e+02],
       [1.01500000e+03, 9.62319434e-01, 1.95000000e+02],
       [1.01600000e+03, 9.55627382e-01, 4.91000000e+02],
       [1.01700000e+03, 6.99023

In [195]:
import pandas as pd

In [196]:
label_df = pd.read_csv('data/label_df.csv')

In [197]:
label_df

Unnamed: 0,id,name
0,1000,"ООО ""БЕЛДОРСТРОЙ"""
1,1001,"МУНИЦИПАЛЬНОЕ БЮДЖЕТНОЕ\nУЧРЕЖДЕНИЕ ""БЛАГОУСТР..."
2,1002,"ООО ""СК РЕМСТРОЙСЕРВИС"""
3,1003,"ООО ""ОЗТП"""
4,1004,"ООО ""ВМПАВТО"""
5,1005,"ООО ""ЛЕНКОМСТРОЙ"""
6,1006,"ООО ""ГС ГРУПП"""
7,1007,"ООО ""ГД РАША"""
8,1008,"ООО ""ТТЦ ""ФОЛИУМ"""
9,1009,"ООО ""ПК ""СТЕКЛОКОМПОЗИТ"""


In [198]:
import pickle
filename = 'data/encoder.sav'
le = pickle.load(open(filename, 'rb')) 

In [199]:
comp_name = label_df['name'].values
post_name = le.inverse_transform(comp_prob_post[:,2].astype(int))

In [200]:
comp_name

array(['ООО "БЕЛДОРСТРОЙ"',
       'МУНИЦИПАЛЬНОЕ БЮДЖЕТНОЕ\nУЧРЕЖДЕНИЕ "БЛАГОУСТРОЙСТВО"',
       'ООО "СК РЕМСТРОЙСЕРВИС"', 'ООО "ОЗТП"', 'ООО "ВМПАВТО"',
       'ООО "ЛЕНКОМСТРОЙ"', 'ООО "ГС ГРУПП"', 'ООО "ГД РАША"',
       'ООО "ТТЦ "ФОЛИУМ"', 'ООО "ПК "СТЕКЛОКОМПОЗИТ"',
       'ОАНО ВО "МОСТЕХ"', 'ООО "АГРО ИНВЕСТ"',
       'ОБЩЕСТВО С ОГРАНИЧЕННОЙ\nОТВЕТСТВЕННОСТЬЮ "АСПЕКТ"',
       'ФЕДЕРАЛЬНОЕ ГОСУДАРСТВЕННОЕ БЮДЖЕТНОЕ УЧРЕЖДЕНИЕ "ПОЛИКЛИНИКА №1" УПРАВЛЕНИЯ ДЕЛАМИ ПРЕЗИДЕНТА РОССИЙСКОЙ ФЕДЕРАЦИИ',
       'АО "ГЕНЕРИУМ"', 'ООО "СОНЗ РУС"',
       'ОБЩЕСТВО С ОГРАНИЧЕННОЙ\nОТВЕТСТВЕННОСТЬЮ "ИНТАКА"', 'ООО "РПС"',
       'ООО "ТАЛАЙ"', 'ООО "КСК МК"'], dtype=object)

In [201]:
post_name

array(['ООО "БАРИОН"', 'ООО "М-ТРАСТ"', 'ООО "БАРИОН"', 'АО "ТАЙФУН"',
       'АО "НПО СТЕКЛОПЛАСТИК"',
       'ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ "ЖИЛКОМСЕРВИС Г.ПЕТРОДВОРЦА"',
       'ООО "БАРИОН"', 'ООО "БАРИОН"', 'ООО "МИНИМАКС"', 'ООО "М-ТРАСТ"',
       'ООО "БАРИОН"', 'АО "ТАЙФУН"', 'ООО "БАРИОН"', 'ООО "БАРИОН"',
       'ООО "БАРИОН"', 'АО "ТАЙФУН"', 'ООО "БАРИОН"', 'АО ОЭС',
       'ООО "БАРИОН"', 'ООО "БАРИОН"'], dtype='<U256')

In [204]:
res_df = pd.DataFrame([comp_name, post_name, comp_prob_post[:, 1]]).T\
        .rename(columns = {0: 'Заказчик', 1: 'Исполнитель', 2: 'Вероятность'})
res_df

Unnamed: 0,Заказчик,Исполнитель,Вероятность
0,"ООО ""БЕЛДОРСТРОЙ""","ООО ""БАРИОН""",0.964171
1,"МУНИЦИПАЛЬНОЕ БЮДЖЕТНОЕ\nУЧРЕЖДЕНИЕ ""БЛАГОУСТР...","ООО ""М-ТРАСТ""",0.964042
2,"ООО ""СК РЕМСТРОЙСЕРВИС""","ООО ""БАРИОН""",0.867683
3,"ООО ""ОЗТП""","АО ""ТАЙФУН""",0.962051
4,"ООО ""ВМПАВТО""","АО ""НПО СТЕКЛОПЛАСТИК""",0.727043
5,"ООО ""ЛЕНКОМСТРОЙ""","ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ЖИЛК...",0.711575
6,"ООО ""ГС ГРУПП""","ООО ""БАРИОН""",0.951568
7,"ООО ""ГД РАША""","ООО ""БАРИОН""",0.844813
8,"ООО ""ТТЦ ""ФОЛИУМ""","ООО ""МИНИМАКС""",0.778629
9,"ООО ""ПК ""СТЕКЛОКОМПОЗИТ""","ООО ""М-ТРАСТ""",0.981293


In [380]:
torch.save(model, 'data/model')

In [381]:
torch.load('data/model')

GraphSAGE(
  (conv1): SAGEConv(
    (feat_drop): Dropout(p=0.0, inplace=False)
    (fc_neigh): Linear(in_features=2000, out_features=16, bias=False)
    (fc_self): Linear(in_features=2000, out_features=16, bias=True)
  )
  (conv2): SAGEConv(
    (feat_drop): Dropout(p=0.0, inplace=False)
    (fc_neigh): Linear(in_features=16, out_features=16, bias=False)
    (fc_self): Linear(in_features=16, out_features=16, bias=True)
  )
)