# Exam: MA-INF4316 Graph Representation Learning

- Wintersemester 2021/2022
- Exam: 1
- Date 2022-02-21
- Examiner: Dr. Pascal Welke

### To be filled by the student
- Name: Aleksei
- Given Name: Zhuravlev
- Matriculation number:
- Course of Studies: MSc Computer Science

(Please enter your data here)

# Task 1

## Task 1.1

In [None]:
import igraph

# Load the Twitch graph from the pickle file in the working directory.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
g = igraph.Graph.Read_Pickle('twitch.pickle')

# Report the size of the graph.
print('vertices', g.vcount())
print('edges', g.ecount())

vertices 34118
edges 429113


In [None]:
import numpy as np

# Average vertex degree over the whole graph.
print('mean degree', np.asarray(g.degree(g.vs)).mean())

mean degree 25.154639779588486


## Task 1.2

In [None]:
def compute_sane_density(vertex, graph):
    """Return the edge density of the subgraph induced by the neighbours of ``vertex``.

    The density is the number of edges between the neighbours divided by the
    number of neighbour pairs, ``d * (d - 1) / 2`` for ``d`` neighbours.

    Args:
        vertex: Vertex identifier accepted by ``graph.neighbors``.
        graph: Graph object providing ``neighbors`` and ``induced_subgraph``.

    Returns:
        The integer ``0`` when the vertex has at most one neighbour (no pair
        exists), otherwise the density as a float.
    """
    neighbourhood = graph.neighbors(vertex)
    size = len(neighbourhood)

    # Guard: with fewer than two neighbours the denominator would be zero.
    if size < 2:
        return 0

    edge_count = len(graph.induced_subgraph(neighbourhood).es)
    ordered_pairs = size * (size - 1)
    return 2 * edge_count / ordered_pairs

In [None]:
# Sample vertices whose neighbourhood density is reported.
vertex_ids = [42, 123, 11024, 11585, 12280, 34117]

# Print "<vertex id> <density>" for each sample vertex.
for vertex in vertex_ids:
    print(vertex, compute_sane_density(vertex, g))

42 0.12323232323232323
123 0.11956521739130435
11024 0
11585 0.06159420289855073
12280 0
34117 0.20512820512820512


## Task 1.3

In [None]:
# Per-vertex statistics; every list is aligned with the vertex order of g.vs.
degrees = g.degree(g.vs)
pageranks = g.pagerank()
degeneracies = g.coreness()

# Neighbourhood density is computed vertex by vertex with the helper above.
sane_densities = list(map(lambda vertex: compute_sane_density(vertex, g), g.vs))

In [None]:
# One row per vertex: [degree, PageRank, coreness, neighbourhood density].
feature_vectors = np.column_stack((degrees, pageranks, degeneracies, sane_densities))
# Attach each row to its vertex as the 'features' attribute.
g.vs['features'] = feature_vectors
# Display the feature vectors of the sample vertices.
g.vs[vertex_ids]['features']

[array([4.50000000e+01, 4.04561421e-05, 3.00000000e+01, 1.23232323e-01]),
 array([2.40000000e+01, 2.96387338e-05, 1.50000000e+01, 1.19565217e-01]),
 array([1.00000000e+00, 7.06692425e-06, 1.00000000e+00, 0.00000000e+00]),
 array([2.40000000e+01, 6.10208326e-05, 1.10000000e+01, 6.15942029e-02]),
 array([1.00000000e+00, 6.69281948e-06, 1.00000000e+00, 0.00000000e+00]),
 array([1.30000000e+01, 2.18730179e-05, 1.20000000e+01, 2.05128205e-01])]

## Task 1.4

In [None]:
# Split the graph by the vertices' 'lang' attribute: DE/ENGB/FR/RU for training,
# ES for validation and PTBR for testing. Each split is an induced subgraph,
# so it keeps only the edges between vertices of that split.
train_g = g.induced_subgraph(g.vs.select(lang_in=['DE','ENGB', 'FR','RU']))
val_g = g.induced_subgraph(g.vs.select(lang_eq='ES'))
test_g = g.induced_subgraph(g.vs.select(lang_eq='PTBR'))

# Report the size of every split.
print(f'train, vertices: {len(train_g.vs)}, edges: {len(train_g.es)}')
print(f'validation, vertices: {len(val_g.vs)}, edges: {len(val_g.es)}')
print(f'test, vertices: {len(test_g.vs)}, edges: {len(test_g.es)}')

train, vertices: 27558, edges: 338432
validation, vertices: 4648, edges: 59382
test, vertices: 1912, edges: 31299


## Task 1.5

In [None]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Hyper-parameter grid: every kernel is combined with every value of C.
parameters = {'kernel': ('linear', 'rbf'), 'C': [0.001, 0.01, 0.1]}

scores = []

for kernel in parameters['kernel']:
    for C in parameters['C']:
        # Fit on the training graph, score on the validation graph.
        svc = svm.SVC(kernel=kernel, C=C)
        svc.fit(train_g.vs['features'], train_g.vs['label'])
        y_pred = svc.predict(val_g.vs['features'])
        score = accuracy_score(val_g.vs['label'], y_pred)

        # Record the run and echo it immediately as progress output.
        result = {'kernel': kernel, 'C': C, 'accuracy': score}
        scores.append(result)
        print(result)

# All runs, best validation accuracy first.
print(sorted(scores, key=lambda x: x['accuracy'], reverse=True))

{'kernel': 'linear', 'C': 0.001, 'accuracy': 0.6693201376936316}
{'kernel': 'linear', 'C': 0.01, 'accuracy': 0.6701807228915663}
{'kernel': 'linear', 'C': 0.1, 'accuracy': 0.6699655765920827}
{'kernel': 'rbf', 'C': 0.001, 'accuracy': 0.7074010327022375}
{'kernel': 'rbf', 'C': 0.01, 'accuracy': 0.6805077452667814}
{'kernel': 'rbf', 'C': 0.1, 'accuracy': 0.6688898450946644}
[{'kernel': 'rbf', 'C': 0.001, 'accuracy': 0.7074010327022375}, {'kernel': 'rbf', 'C': 0.01, 'accuracy': 0.6805077452667814}, {'kernel': 'linear', 'C': 0.01, 'accuracy': 0.6701807228915663}, {'kernel': 'linear', 'C': 0.1, 'accuracy': 0.6699655765920827}, {'kernel': 'linear', 'C': 0.001, 'accuracy': 0.6693201376936316}, {'kernel': 'rbf', 'C': 0.1, 'accuracy': 0.6688898450946644}]


## Task 1.6

In [None]:
# Refit the configuration with the best validation accuracy (RBF kernel,
# C=0.001) on the union of training and validation data.
svc_best = svm.SVC(kernel='rbf', C=0.001)
X_train = np.concatenate([train_g.vs['features'], val_g.vs['features']])
y_train = np.concatenate([train_g.vs['label'], val_g.vs['label']])

# Fit and predict with `svc_best`. The variable `svc` still holds the last
# model created by the grid-search loop, which is not the selected one.
# NOTE: re-run this cell; output recorded earlier came from `svc`.
svc_best.fit(X_train, y_train)
y_pred = svc_best.predict(test_g.vs['features'])
score = accuracy_score(test_g.vs['label'], y_pred)

# First ten (prediction, ground truth) pairs, then the test accuracy.
print(list(zip(y_pred[0:10], test_g.vs['label'][0:10])))
print(score)

[(-1, -1), (-1, -1), (1, -1), (1, -1), (-1, -1), (-1, -1), (1, 1), (-1, -1), (-1, 1), (-1, 1)]
0.6427824267782427


## Task 1.7

In [None]:
# look at what class dominates
# Class balance of the combined train + validation labels: one row per class,
# columns are (label, number of occurrences).
classes, counts = np.unique(y_train, return_counts=True)
print(np.column_stack((classes, counts)))

[[   -1 17725]
 [    1 14481]]


In [None]:
# Baseline: always predict the most frequent class. Take the largest count
# instead of assuming that the majority class is the first entry of `counts`.
# NOTE(review): `counts` is derived from y_train (train + validation labels),
# so this is the baseline on those labels, not on the test split - confirm
# that this is the intended comparison.
print('accuracy of a majority vote classifier: ', counts.max() / counts.sum())

accuracy of a majority vote classifier:  0.5503632863441594


Is your result from Task 1.6 satisfactory?

- By using vertex features we gained roughly 9 percentage points in accuracy over the majority-vote baseline (about 0.64 vs. 0.55), so it was at least worth the effort. But an accuracy of about 64% is still quite low, and we should work on improving our model, e.g. by adding more features.

# Task 2

## Task 2.1

In [None]:
from tqdm import tqdm

def update(v, upd, agg, rk, r_0):
    """Compute the next label of vertex ``v`` in one message-passing round.

    Args:
        v: Vertex object exposing ``neighbors()`` and ``index``.
        upd: Update function called as ``upd(own_label, aggregated, r_0)``.
        agg: Aggregation function applied to the list of neighbour labels.
        rk: Labels of the previous round, indexable by vertex index.
        r_0: Initial label of ``v``, passed through to ``upd``.

    Returns:
        Whatever ``upd`` returns for this vertex.
    """
    # Collect the previous-round labels of all neighbours (a multiset, so
    # duplicates are kept).
    neighbour_labels = []
    for neighbour in v.neighbors():
        neighbour_labels.append(rk[neighbour.index])

    combined = agg(neighbour_labels)
    own_label = rk[v.index]
    return upd(own_label, combined, r_0)

def mpf(g, r0, k, upd, agg):
    """Run ``k`` rounds of message passing over all vertices of ``g``.

    Args:
        g: Graph whose ``vs`` attribute iterates over its vertices.
        r0: Function returning the initial label of a vertex.
        k: Number of message-passing rounds.
        upd: Update function forwarded to ``update``.
        agg: Aggregation function forwarded to ``update``.

    Returns:
        A list with ``k + 1`` entries. Entry ``h`` holds the labels r_h(v) of
        all vertices, i.e. ``result[0][0]`` is r_0 of the first vertex. Entry 0
        is a plain list; all later entries are numpy arrays.
    """
    # Round 0: the initial labels.
    history = [[r0(vertex) for vertex in g.vs]]

    for _ in range(k):
        previous_round = history[-1]
        current_round = [
            update(vertex, upd, agg, previous_round, r0(vertex))
            for vertex in g.vs
        ]
        history.append(np.array(current_round))

    return history

In [None]:
def agg_func(multiset):
    """Aggregate neighbour labels by adding them up, starting from 0."""
    total = 0
    for label in multiset:
        total = total + label
    return total

def upd_func(previous, aggregation, r_0):
    """Return the new label: the initial label plus the aggregated neighbour labels.

    ``previous`` (the vertex's label from the last round) is part of the
    signature but is not used by this update rule.
    """
    refreshed = r_0 + aggregation
    return refreshed

## Task 2.2

All vertices $v$ should have the same color at the beginning, so we initialize $r_0(v) = 1$ for every vertex $v$.

## Task 2.3

In [None]:
def r_0(v):
    """Return the initial label 1 for vertex ``v`` (the argument is not used)."""
    return 1

# Run three rounds of message passing. np.array stacks the per-round labels
# (rounds 0..3) so they can be indexed as messages[round, vertex].
messages = np.array(mpf(g, r_0, 3, upd_func, agg_func))

vertex_ids = [42, 123, 11024, 11585, 12280, 34117]
# Print the labels of the sample vertices after rounds 0, 1 and 3.
for i in [0, 1, 3]:
    print(messages[i, vertex_ids])

[1 1 1 1 1 1]
[46 25  2 25  2 14]
[1210751  182653     469   33511    3429  145136]


## Task 2.4

# Task 3

In [None]:
# Install PyTorch Geometric and its companion packages, using the wheel index for torch 1.13.0 + cu117.
pip install pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.13.0+cu117.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.13.0+cu117.html
Collecting pyg-lib
  Downloading https://data.pyg.org/whl/torch-1.13.0%2Bcu117/pyg_lib-0.1.0%2Bpt113cu117-cp38-cp38-linux_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-1.13.0%2Bcu117/torch_scatter-2.1.0%2Bpt113cu117-cp38-cp38-linux_x86_64.whl (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m94.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-1.13.0%2Bcu117/torch_sparse-0.6.16%2Bpt113cu117-cp38-cp38-linux_x86_64.whl (4.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollect

In [None]:
import torch
import torch.nn.functional as F
from torch.nn import Sequential, Linear, ReLU
from torch_geometric.datasets import TUDataset
from torch_geometric.data import DataLoader
from torch_geometric.nn import GINConv, global_max_pool, MLP
import numpy as np

# Load the DHFR dataset through TUDataset, stored under /tmp/DHFR.
dataset = TUDataset(root='/tmp/DHFR',  name='DHFR')
# The first tenth of the dataset is held out for testing, the rest is used for
# training.
# NOTE(review): the dataset is sliced in its stored order, without shuffling
# before the split - confirm that both splits have a sensible class balance.
test_dataset = dataset[:len(dataset) // 10]
train_dataset = dataset[len(dataset) // 10:]
# Mini-batches of 128 graphs; no `shuffle=` argument is passed to either loader.
test_loader = DataLoader(test_dataset, batch_size=128)
train_loader = DataLoader(train_dataset, batch_size=128)



In [None]:
def train(epoch, model, optimizer):
    """Train ``model`` for one epoch on the global ``train_loader``.

    Args:
        epoch: 1-based epoch number; it only drives the learning-rate schedule.
        model: Network called as ``model(x, edge_index, batch)`` and returning
            log-probabilities suitable for ``F.nll_loss``.
        optimizer: Optimizer whose parameter groups hold the learning rate.

    Returns:
        The training loss of this epoch, averaged over all graphs in the
        global ``train_dataset``.
    """
    model.train()

    # Step schedule: the learning rate is halved at the start of epoch 51 and
    # again at the start of epoch 76.
    if epoch in (51, 76):
        for group in optimizer.param_groups:
            group['lr'] = 0.5 * group['lr']

    running_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        log_probs = model(batch.x, batch.edge_index, batch.batch)
        batch_loss = F.nll_loss(log_probs, batch.y)
        batch_loss.backward()
        # Weight the batch loss by the number of graphs in the batch, so the
        # final division yields a per-graph average.
        running_loss += batch_loss.item() * batch.num_graphs
        optimizer.step()
    return running_loss / len(train_dataset)


def test(loader, model):
    """Return the classification accuracy of ``model`` on the graphs in ``loader``.

    Args:
        loader: Data loader yielding batches with ``x``, ``edge_index``,
            ``batch`` and ``y``; its ``dataset`` attribute gives the number of
            graphs used as the denominator.
        model: Network called as ``model(x, edge_index, batch)`` and returning
            one score per class for every graph.

    Returns:
        Fraction of graphs whose highest-scoring class equals the label.
    """
    model.eval()

    correct = 0
    # Evaluation needs no gradients; disabling autograd avoids recording a
    # computation graph for every batch.
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            output = model(data.x, data.edge_index, data.batch)
            # Index of the largest score along the class dimension.
            pred = output.max(dim=1)[1]
            correct += pred.eq(data.y).sum().item()
    return correct / len(loader.dataset)

## Task 3.1

In [None]:
class Net(torch.nn.Module):
    """GIN graph classifier with a configurable number of GINConv layers.

    Layer ``k`` (0-based) maps to ``2 ** (8 - k)`` channels through an MLP
    with one hidden layer of width 32. Node embeddings are max-pooled per
    graph and classified by a final MLP. Input and output sizes are read from
    the global ``dataset``.
    """

    def __init__(self, num_layers):
        """Build ``num_layers`` GINConv layers followed by the readout MLP."""
        super().__init__()

        self.convs = torch.nn.ModuleList()

        in_dim = dataset.num_node_features
        for layer_idx in range(num_layers):
            out_dim = 2 ** (8 - layer_idx)
            self.convs.append(
                GINConv(nn=MLP([in_dim, 32, out_dim]), train_eps=False)
            )
            # The next layer consumes what this one produces.
            in_dim = out_dim

        self.mlp = MLP([in_dim, 32, dataset.num_classes])

    def forward(self, x, edge_index, batch):
        """Return per-graph log-probabilities over the classes."""
        hidden = x
        for conv in self.convs:
            hidden = conv(hidden, edge_index).relu()
        pooled = global_max_pool(hidden, batch)
        logits = self.mlp(pooled)
        return F.log_softmax(logits, dim=1)

## Task 3.2

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): `verbose` is not read by any code visible here - confirm
# whether it is still needed.
verbose = False

def train_GIN(n_layers):
    """Train a fresh ``Net`` with ``n_layers`` GIN layers for 100 epochs.

    Uses Adam with an initial learning rate of 0.005 and the global
    ``device``, ``train_loader`` and ``test_loader``.

    Returns:
        Tuple ``(train_loss, train_acc, test_acc)``: the loss of the last
        epoch and the accuracies measured after training.
    """
    model = Net(n_layers).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

    for epoch in range(1, 101):
        train_loss = train(epoch, model, optimizer)

    # Accuracies are measured once, after the final epoch.
    return train_loss, test(train_loader, model), test(test_loader, model)


# Train one GIN per depth and report its last-epoch loss and both accuracies.
for n_layers in [2,3,4,5]:
    train_loss, train_acc, test_acc = train_GIN(n_layers)
    print(f'model = {n_layers} layers')
    print('Train Loss: {:.7f}, '
        'Train Acc: {:.7f}, Test Acc: {:.7f}'.format(train_loss,
                                                train_acc, test_acc))

model = 2 layers
Train Loss: 0.0292484, Train Acc: 0.9471366, Test Acc: 0.7466667
model = 3 layers
Train Loss: 0.0247370, Train Acc: 0.9603524, Test Acc: 0.7466667
model = 4 layers
Train Loss: 0.0833111, Train Acc: 0.9030837, Test Acc: 0.5866667
model = 5 layers
Train Loss: 0.1045515, Train Acc: 0.8928047, Test Acc: 0.4666667


Analysis: 
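We can see that the test accuracy is highest for the networks with 2 and 3 layers (both 0.747); of these two, the 3-layer network fits the train set slightly better (train accuracy 0.960 vs. 0.947). Both show a clear gap between train and test accuracy, i.e. they overfit the train set to some degree. For 4 and 5 layers, both the train and the test set accuracies fall.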

The best choice is the NN with 3 layers, because it ties for the best test set accuracy and has the highest train set accuracy and the lowest training loss.

## Task 3.3

In [None]:
# Accuracies collected over repeated, independently initialised training runs.
train_acc_list = []
test_acc_list = []

# Train the 3-layer GIN six times from scratch.
for _ in range(6):
    train_loss, train_acc, test_acc = train_GIN(3)
    train_acc_list.append(train_acc)
    test_acc_list.append(test_acc)

# Report mean and standard deviation of both accuracies across the runs.
print(f"""
Train accuracy
mean {np.mean(train_acc_list)}, std {np.std(train_acc_list)}
Test accuracy
mean {np.mean(test_acc_list)}, std {np.std(test_acc_list)}
"""
)



Train accuracy
mean 0.9280469897209985, std 0.02164791423151834
Test accuracy
mean 0.6377777777777778, std 0.06773222512945853



## Task 3.4

In [None]:
class NetLarge(torch.nn.Module):
    """Two-layer GIN whose MLPs each have one hidden layer of width 32.

    Both GINConv layers output 64 channels; node embeddings are max-pooled
    per graph and classified by an MLP with a hidden layer of width 32.
    Input and output sizes are read from the global ``dataset``.
    """

    def __init__(self):
        """Build the two GINConv layers and the readout MLP."""
        super().__init__()

        # Channel lists of the MLPs inside the two GINConv layers, in order.
        channel_specs = (
            [dataset.num_node_features, 32, 64],
            [64, 32, 64],
        )
        self.convs = torch.nn.ModuleList(
            [GINConv(nn=MLP(spec), train_eps=False) for spec in channel_specs]
        )

        self.mlp = MLP([64, 32, dataset.num_classes])

    def forward(self, x, edge_index, batch):
        """Return per-graph log-probabilities over the classes."""
        hidden = x
        for conv in self.convs:
            hidden = conv(hidden, edge_index).relu()
        pooled = global_max_pool(hidden, batch)
        logits = self.mlp(pooled)
        return F.log_softmax(logits, dim=1)


class NetSmall(torch.nn.Module):
    """Two-layer GIN whose MLPs have no hidden layer.

    Both GINConv layers output 64 channels; node embeddings are max-pooled
    per graph and classified by an MLP built from the channel list
    ``[64, num_classes]``. Input and output sizes are read from the global
    ``dataset``.
    """

    def __init__(self):
        """Build the two GINConv layers and the readout MLP."""
        super().__init__()

        # Channel lists of the MLPs inside the two GINConv layers, in order.
        channel_specs = (
            [dataset.num_node_features, 64],
            [64, 64],
        )
        self.convs = torch.nn.ModuleList(
            [GINConv(nn=MLP(spec), train_eps=False) for spec in channel_specs]
        )

        self.mlp = MLP([64, dataset.num_classes])

    def forward(self, x, edge_index, batch):
        """Return per-graph log-probabilities over the classes."""
        hidden = x
        for conv in self.convs:
            hidden = conv(hidden, edge_index).relu()
        pooled = global_max_pool(hidden, batch)
        logits = self.mlp(pooled)
        return F.log_softmax(logits, dim=1)

In [None]:
# Train NetLarge and NetSmall with the same setup (Adam, lr 0.005, 100 epochs)
# and report the last-epoch loss and both accuracies for each.
for model, name in zip([NetLarge(), NetSmall()], ['NetLarge', 'NetSmall']):
    
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

    for epoch in range(1, 101):
        train_loss = train(epoch, model, optimizer)

    # Accuracies are measured once, after the final epoch.
    train_acc = test(train_loader, model)
    test_acc = test(test_loader, model)

    print(f'model = {name}')
    print('Train Loss: {:.7f}, '
        'Train Acc: {:.7f}, Test Acc: {:.7f}'.format(train_loss,
                                                train_acc, test_acc))

model = NetLarge
Train Loss: 0.0712441, Train Acc: 0.9162996, Test Acc: 0.6400000
model = NetSmall
Train Loss: 0.4026426, Train Acc: 0.8208517, Test Acc: 0.6533333


## Task 3.5

Analysis: the accuracies of NetLarge and NetSmall on the test set are nearly equal (0.640 vs. 0.653), while NetLarge fits the train set better (train accuracy 0.916 vs. 0.821). As a result, less sophisticated GINs can be as effective in practice as very complex GINs.

# Task 4

Done in a separate notebook

# Task 5

## Task 5.1

Let $\mathcal{G} = \{\text{graphlet}_1, \text{graphlet}_2, \ldots, \text{graphlet}_r\}$ be the set of size-$k$ graphlets.
Let also $f_G \in \mathbb{N}^r$ be a vector such that its $i$-th entry is equal to the frequency of occurrence of $\text{graphlet}_i$ in $G$, i.e. $f_{G,i} = \#(\text{graphlet}_i \sqsubseteq G)$.
$f_G$ is the representation we are looking for.

## Task 5.2

## Task 5.3

$r_X(G_1) = [2, 2]$, $r_X(G_2) = [1, 1]$

$k_X(G_1, G_2) = 2 \cdot 1 + 2 \cdot 1 = 4$