# Exam: MA-INF4316 Graph Representation Learning

- Wintersemester 2021/2022
- Exam: 1
- Date 2022-02-21
- Examiner: Dr. Pascal Welke

### To be filled by the student
- Name: Aleksei
- Given Name: Zhuravlev
- Matriculation number:
- Course of Studies: MSc Computer Science

(Please enter your data here)

# Task 1

## Task 1.1

In [1]:
import igraph

g = igraph.Graph.Read_Pickle('twitch.pickle')

print('vertices', len(g.vs))
print('edges', len(g.es))

vertices 34118
edges 429113


In [2]:
import numpy as np

print('mean degree', np.mean(g.degree(g.vs)))

mean degree 25.154639779588486


## Task 1.2

In [3]:
def compute_sane_density(vertex, graph):
    neighbors = graph.neighbors(vertex)
    degree = len(neighbors)
    
    if degree <= 1:
        return 0
    else:   
        subgraph = graph.induced_subgraph(neighbors)
        density = 2 * len(subgraph.es) / (degree * (degree - 1))
        return density

In [4]:
vertex_ids = [42, 123, 11024, 11585, 12280, 34117]

for vertex in vertex_ids:
    print(vertex, compute_sane_density(vertex, g))

42 0.12323232323232323
123 0.11956521739130435
11024 0
11585 0.06159420289855073
12280 0
34117 0.20512820512820512


## Task 1.3

In [5]:
degeneracies = g.coreness()
pageranks = g.pagerank()
degrees = g.degree(g.vs)

sane_densities = [compute_sane_density(vertex, g) for vertex in g.vs]    

In [6]:
feature_vectors = np.array(list(zip(degrees, pageranks, degeneracies, sane_densities)))
g.vs['features'] = feature_vectors
g.vs[vertex_ids]['features']

[array([4.50000000e+01, 4.04561421e-05, 3.00000000e+01, 1.23232323e-01]),
 array([2.40000000e+01, 2.96387338e-05, 1.50000000e+01, 1.19565217e-01]),
 array([1.00000000e+00, 7.06692425e-06, 1.00000000e+00, 0.00000000e+00]),
 array([2.40000000e+01, 6.10208326e-05, 1.10000000e+01, 6.15942029e-02]),
 array([1.00000000e+00, 6.69281948e-06, 1.00000000e+00, 0.00000000e+00]),
 array([1.30000000e+01, 2.18730179e-05, 1.20000000e+01, 2.05128205e-01])]

## Task 1.4

In [7]:
train_g = g.induced_subgraph(g.vs.select(lang_in=['DE','ENGB', 'FR','RU']))
val_g = g.induced_subgraph(g.vs.select(lang_eq='ES'))
test_g = g.induced_subgraph(g.vs.select(lang_eq='PTBR'))

print(f'train, vertices: {len(train_g.vs)}, edges: {len(train_g.es)}')
print(f'validation, vertices: {len(val_g.vs)}, edges: {len(val_g.es)}')
print(f'test, vertices: {len(test_g.vs)}, edges: {len(test_g.es)}')

train, vertices: 27558, edges: 338432
validation, vertices: 4648, edges: 59382
test, vertices: 1912, edges: 31299


## Task 1.5

In [8]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

parameters = {'kernel':('linear', 'rbf'), 'C':[0.001, 0.01, 0.1]}

scores = []

for kernel in ['linear', 'rbf']:
    for C in [0.001, 0.01, 0.1]:
        svc = svm.SVC(kernel=kernel, C=C)
        svc.fit(train_g.vs['features'], train_g.vs['label'])
        y_pred = svc.predict(val_g.vs['features'])
        score = accuracy_score(val_g.vs['label'], y_pred)
        scores.append({
            'kernel': kernel,
            'C': C,
            'accuracy': score
        })
        
        print({
            'kernel': kernel,
            'C': C,
            'accuracy': score
        })
        
print(
    sorted(scores, key=lambda x: x['accuracy'], reverse=True)
)

{'kernel': 'linear', 'C': 0.001, 'accuracy': 0.6693201376936316}
{'kernel': 'linear', 'C': 0.01, 'accuracy': 0.6701807228915663}
{'kernel': 'linear', 'C': 0.1, 'accuracy': 0.6699655765920827}
{'kernel': 'rbf', 'C': 0.001, 'accuracy': 0.7074010327022375}
{'kernel': 'rbf', 'C': 0.01, 'accuracy': 0.6805077452667814}
{'kernel': 'rbf', 'C': 0.1, 'accuracy': 0.6688898450946644}
[{'kernel': 'rbf', 'C': 0.001, 'accuracy': 0.7074010327022375}, {'kernel': 'rbf', 'C': 0.01, 'accuracy': 0.6805077452667814}, {'kernel': 'linear', 'C': 0.01, 'accuracy': 0.6701807228915663}, {'kernel': 'linear', 'C': 0.1, 'accuracy': 0.6699655765920827}, {'kernel': 'linear', 'C': 0.001, 'accuracy': 0.6693201376936316}, {'kernel': 'rbf', 'C': 0.1, 'accuracy': 0.6688898450946644}]


## Task 1.6

In [10]:
svc_best = svm.SVC(kernel='rbf', C=0.001)
X_train = np.concatenate([train_g.vs['features'], val_g.vs['features']])
y_train = np.concatenate([train_g.vs['label'], val_g.vs['label']])

svc.fit(X_train, y_train)
y_pred = svc.predict(test_g.vs['features'])
score = accuracy_score(test_g.vs['label'], y_pred)

print(list(zip(y_pred[0:10], test_g.vs['label'][0:10])))
print(score)

[(-1, -1), (-1, -1), (1, -1), (1, -1), (-1, -1), (-1, -1), (1, 1), (-1, -1), (-1, 1), (-1, 1)]
0.6427824267782427


## Task 1.7

In [20]:
# look at what class dominates
classes, counts = np.unique(y_train, return_counts = True)
print(np.array([classes, counts]).T)

[[   -1 17725]
 [    1 14481]]


In [21]:
print('accuracy of a majority vote classifier: ', counts[0] / sum(counts))

accuracy of a majority vote classifier:  0.5503632863441594


Is your result from Task 1.6 satisfactory?

- By using vertex features we got a 10% increase in accuracy, so it was at least worth the effort. But 65% accuracy is still quite low and we should work on improving our model, e.g. by adding more features.

# Task 2

## Task 2.1

In [47]:
from tqdm import tqdm

def update(v, upd, agg, rk, r_0): 
    neighbors = v.neighbors() #return List of igraph vertices
    multiset = [rk[neighbor.index] for neighbor in neighbors]
    aggregation = agg(multiset)
    return upd(rk[v.index], aggregation, r_0)

def mpf(g, r0, k, upd, agg):
    #Contains for each h and aLL vertices aLL LabeLs r_h(v), i.e messages[0][0] = r_0(u_0)
    messages = []
    #do the first iteration with r0
    messages.append([r0(v) for v in g.vs])
    for i in range(1, k+1):
        messages.append(np.array([update(v, upd, agg, messages[-1], r0(v)) for v in g.vs]))
    return messages

In [46]:
def agg_func(multiset):
    return sum(multiset)

def upd_func(previous, aggregation, r_0):
    return r_0 + aggregation

## Task 2.2

All vertices v should have the same color at the beginning, so we initialize r_0 to a set of 1.

## Task 2.3

In [48]:
def r_0(v):
    return 1

messages = np.array(mpf(g, r_0, 3, upd_func, agg_func))

vertex_ids = [42, 123, 11024, 11585, 12280, 34117]
for i in [0, 1, 3]:
    print(messages[i, vertex_ids])

[1 1 1 1 1 1]
[46 25  2 25  2 14]
[1210751  182653     469   33511    3429  145136]


## Task 2.4

# Task 3

In [50]:
import torch
import torch.nn.functional as F
from torch.nn import Sequential, Linear, ReLU
from torch_geometric.datasets import TUDataset
from torch_geometric.data import DataLoader
from torch_geometric.nn import GINConv, global_max_pool
import numpy as np

dataset = TUDataset(root='/tmp/DHFR',  name='DHFR')
test_dataset = dataset[:len(dataset) // 10]
train_dataset = dataset[len(dataset) // 10:]
test_loader = DataLoader(test_dataset, batch_size=128)
train_loader = DataLoader(train_dataset, batch_size=128)

FileNotFoundError: Could not find module 'C:\ProgramData\Anaconda3\Lib\site-packages\torch_sparse\_convert_cuda.pyd' (or one of its dependencies). Try using the full path with constructor syntax.

In [None]:
def train(epoch):
    model.train()

    if epoch == 51:
        for param_group in optimizer.param_groups:
            param_group['lr'] = 0.5 * param_group['lr']
    
    if epoch == 76:
        for param_group in optimizer.param_groups:
            param_group['lr'] = 0.5 * param_group['lr']

    loss_all = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data.x, data.edge_index, data.batch)
        loss = F.nll_loss(output, data.y)
        loss.backward()
        loss_all += loss.item() * data.num_graphs
        optimizer.step()
    return loss_all / len(train_dataset)


def test(loader):
    model.eval()

    correct = 0
    for data in loader:
        data = data.to(device)
        output = model(data.x, data.edge_index, data.batch)
        pred = output.max(dim=1)[1]
        correct += pred.eq(data.y).sum().item()
    return correct / len(loader.dataset)

## Task 3.1

In [None]:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()

    def forward(self, x, edge_index, batch):
        pass

## Task 3.2

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
verbose = False

# obviously, the models in this list need to be defined
for model in [Net2(), Net3(), Net4(), Net5()]:
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

    for epoch in range(1, 101):
        train_loss = train(epoch)
        if verbose:
            train_acc = test(train_loader)
            test_acc = test(test_loader)
            print('Epoch: {:03d}, Train Loss: {:.7f}, '
                'Train Acc: {:.7f}, Test Acc: {:.7f}'.format(epoch, train_loss,
                                                        train_acc, test_acc))
    
    print(f'model = {model.__class__}')
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    print('Epoch: {:03d}, Train Loss: {:.7f}, '
        'Train Acc: {:.7f}, Test Acc: {:.7f}'.format(epoch, train_loss,
                                                train_acc, test_acc))


## Task 3.3

## Task 3.4

## Task 3.5

# Task 4

## Task 4.1

## Task 4.2

## Task 4.3

## Task 4.4

## Task 4.5

## Task 4.6

## Task 4.7

# Task 5

## Task 5.1

## Task 5.2

## Task 5.3