## Графовое представление данных

In [10]:
import dgl
from dgl.data import DGLDataset
import torch
import pandas as pd

In [11]:
class OneMetricDataset(DGLDataset):
    def __init__(self, metric):
        if isinstance(metric, str):
            super().__init__(name=metric + ' dataset')
            self.metric = metric
        else:
            raise TypeError('"metric" should be a string with metric name')

    def process(self):
        edges = pd.read_csv('edges_ltv.csv')
        properties = pd.read_csv('properties_ltv.csv')
        one_hot_enc = pd.read_csv('one_hot_ltv.csv')

        self.graphs = []
        self.labels = []

    
        label_dict = {}
        num_nodes_dict = {}
        for _, row in properties.iterrows():
            label_dict[row["graph_id"]] = row["label"]
            num_nodes_dict[row["graph_id"]] = row["num_nodes"]

        edges_group = edges.groupby("graph_id")

        # For each graph ID...
        for graph_id in edges_group.groups:
            edges_of_id = edges_group.get_group(graph_id)
            src = edges_of_id["src"].to_numpy()
            dst = edges_of_id["dst"].to_numpy()
            num_nodes = num_nodes_dict[graph_id]
            label = label_dict[graph_id]

            g = dgl.graph((src, dst), num_nodes=num_nodes)

            ohe_of_id = one_hot_enc[one_hot_enc['graph_id'] == graph_id]

            ohe_data = []
            for _, ohe_row in ohe_of_id.iterrows():
                ohe_string = ohe_row['one_hot_vector']
                ohe_list = [float(num) for num in ohe_string.split(',')]
                ohe_data.append(ohe_list)
            ohe_tensor = torch.tensor(ohe_data, dtype=torch.float32)
            g.ndata['OHE'] = ohe_tensor
            g = dgl.add_self_loop(g)
            
            self.graphs.append(g)
            self.labels.append(label)

        self.labels = torch.LongTensor(self.labels)

    def __getitem__(self, i):
        return self.graphs[i], self.labels[i]

    def __len__(self):
        return len(self.graphs)

In [12]:
dataset = OneMetricDataset('LTV')
graph, label = dataset[0]
print(graph, label)

Graph(num_nodes=352, num_edges=735,
      ndata_schemes={'OHE': Scheme(shape=(352,), dtype=torch.float32)}
      edata_schemes={}) tensor(0)


### Следующие задачи
1. Как задать тензор для каждого графа? Насколько я понимаю, с помощью тензора задаются фичи в узлах, впоследствии я задам эти фичи с помощью Word2Vec для каждого названия таблицы и колонки (+)
2. Далее генерируем схемы базы данных, преобразуем их в занумерованные графы, фичи узлов -- названия колонок и таблиц, закодированные с помощью Word2Vec (+)
3. Как обучать модель? (+)
3. Обучить несколько моделей и сравнить результаты на этих данных 
Всё вышеперечисленное будет работать для одной метрики, такую последовательность надо проделать для всех метрик

In [14]:
from dgl.dataloading import GraphDataLoader
from torch.utils.data.sampler import SubsetRandomSampler

num_examples = len(dataset)
num_train = int(num_examples * 0.8)

train_sampler = SubsetRandomSampler(torch.arange(num_train))
test_sampler = SubsetRandomSampler(torch.arange(num_train, num_examples))

train_dataloader = GraphDataLoader(
    dataset, sampler=train_sampler, batch_size=5, drop_last=False)
test_dataloader = GraphDataLoader(
    dataset, sampler=test_sampler, batch_size=5, drop_last=False)

In [15]:
it = iter(train_dataloader)
batch = next(it)
print(batch)

[Graph(num_nodes=1474, num_edges=3040,
      ndata_schemes={'OHE': Scheme(shape=(352,), dtype=torch.float32)}
      edata_schemes={}), tensor([0, 1, 1, 1, 1])]


In [16]:
batched_graph, labels = batch
print('Number of nodes for graphs in the batch:', batched_graph.batch_num_nodes())
print('Number of edges for graphs in the batch:', batched_graph.batch_num_edges())

graphs = dgl.unbatch(batched_graph)
print('The original graphs in the minibatch:')
print(graphs)

Number of nodes for each graph element in the batch: tensor([352, 253, 340, 255, 274])
Number of edges for each graph element in the batch: tensor([735, 519, 708, 517, 561])
The original graphs in the minibatch:
[Graph(num_nodes=352, num_edges=735,
      ndata_schemes={'OHE': Scheme(shape=(352,), dtype=torch.float32)}
      edata_schemes={}), Graph(num_nodes=253, num_edges=519,
      ndata_schemes={'OHE': Scheme(shape=(352,), dtype=torch.float32)}
      edata_schemes={}), Graph(num_nodes=340, num_edges=708,
      ndata_schemes={'OHE': Scheme(shape=(352,), dtype=torch.float32)}
      edata_schemes={}), Graph(num_nodes=255, num_edges=517,
      ndata_schemes={'OHE': Scheme(shape=(352,), dtype=torch.float32)}
      edata_schemes={}), Graph(num_nodes=274, num_edges=561,
      ndata_schemes={'OHE': Scheme(shape=(352,), dtype=torch.float32)}
      edata_schemes={})]


In [17]:
from dgl.nn import GraphConv
import torch
import torch.nn as nn
import torch.nn.functional as F

class GCN(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, num_classes)

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        g.ndata['h'] = h
        return dgl.mean_nodes(g, 'h')

In [20]:
model = GCN(352, 16, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(20):
    for batched_graph, labels in train_dataloader:
        pred = model(batched_graph, batched_graph.ndata['OHE'].float())
        loss = F.cross_entropy(pred, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

num_correct = 0
num_tests = 0
for batched_graph, labels in test_dataloader:
    pred = model(batched_graph, batched_graph.ndata['OHE'].float())
    num_correct += (pred.argmax(1) == labels).sum().item()
    num_tests += len(labels)

print('Test accuracy:', num_correct / num_tests)

Test accuracy: 0.3
