# Установка пакетов и получение данных

In [None]:
!git clone https://github.com/stellargraph/stellargraph.git

Cloning into 'stellargraph'...
remote: Enumerating objects: 22627, done.[K
remote: Counting objects: 100% (96/96), done.[K
remote: Compressing objects: 100% (75/75), done.[K
remote: Total 22627 (delta 37), reused 61 (delta 20), pack-reused 22531[K
Receiving objects: 100% (22627/22627), 93.44 MiB | 28.55 MiB/s, done.
Resolving deltas: 100% (14721/14721), done.


In [None]:
import sys

# Make the cloned StellarGraph sources importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
if '/content/stellargraph' not in sys.path:
    sys.path.append('/content/stellargraph')

In [None]:
!pip3 install kaggle



Перед выполнением следующей ячейки нужно получить свой токен Kaggle:
- зайдите на kaggle.com
- перейдите на страницу своего аккаунта
- прокрутите страницу до раздела API
- нажмите кнопку "Create New API Token", чтобы скачать ваш токен kaggle.json

In [None]:
from google.colab import files

# Bind the result to a name: as a bare last expression the returned dict, which
# contains the raw bytes of kaggle.json (the API key), would be echoed into the
# cell output and saved with the notebook.
_uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
%%time
# Download the dataset archive with the Kaggle CLI; per the recorded output it is
# saved as hateful-users-on-twitter.zip in /content.
!kaggle datasets download -d manoelribeiro/hateful-users-on-twitter

Downloading hateful-users-on-twitter.zip to /content
 99% 1.62G/1.63G [00:20<00:00, 156MB/s]
100% 1.63G/1.63G [00:20<00:00, 86.6MB/s]
CPU times: user 207 ms, sys: 54.2 ms, total: 261 ms
Wall time: 21.1 s


In [None]:
%%time
!unzip hateful-users-on-twitter.zip

Archive:  hateful-users-on-twitter.zip
  inflating: users.edges             
  inflating: users_clean.graphml     
  inflating: users_hate_all.content  
  inflating: users_hate_glove.content  
  inflating: users_neighborhood_anon.csv  
  inflating: users_suspended_all.content  
  inflating: users_suspended_glove.content  
CPU times: user 249 ms, sys: 33.9 ms, total: 283 ms
Wall time: 46.2 s


In [None]:
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
import stellargraph as sg
from tensorflow import keras
import matplotlib.pyplot as plt
from stellargraph import StellarGraph
# from sklearn.decomposition import PCA
from stellargraph.mapper import GraphSAGENodeGenerator, GraphSAGELinkGenerator
from stellargraph.layer import Node2Vec, link_classification, GraphSAGE
from stellargraph.data import BiasedRandomWalk, EdgeSplitter, UniformRandomWalk, UnsupervisedSampler
from stellargraph.mapper import Node2VecLinkGenerator, Node2VecNodeGenerator

# Загрузка данных

Сначала мы хотим загрузить данные о сети Twitter и о комментариях и посмотреть на них.

In [None]:
# Read the tab-separated, header-less table of per-user features and labels.
twdata = pd.read_csv('users_hate_glove.content', header=None, sep='\t')
# Column 301 holds the class label; show how many users fall into each class.
label_summary = np.unique(twdata[301], return_counts=True)
print(label_summary)

(array(['hateful', 'normal', 'other'], dtype=object), array([  544,  4427, 95415]))


In [None]:
# Load the user graph from the edge list (read_edgelist returns an undirected nx.Graph here).
twnet = nx.read_edgelist('users.edges')
# np.info() only dumps the nx.Graph class docstring and returns None;
# report the actual size of the loaded graph instead.
print('Nodes: {}, edges: {}'.format(twnet.number_of_nodes(), twnet.number_of_edges()))

Base class for undirected graphs.

A Graph stores nodes and edges with optional data, or attributes.

Graphs hold undirected edges.  Self loops are allowed but multiple
(parallel) edges are not.

Nodes can be arbitrary (hashable) Python objects with optional
key/value attributes, except that `None` is not allowed as a node.

Edges are represented as links between nodes with optional
key/value attributes.

Parameters
----------
incoming_graph_data : input graph (optional, default: None)
    Data to initialize graph. If None (default) an empty
    graph is created.  The data can be any format that is supported
    by the to_networkx_graph() function, currently including edge list,
    dict of dicts, dict of lists, NetworkX graph, 2D NumPy array, SciPy
    sparse matrix, or PyGraphviz graph.

attr : keyword arguments, optional (default= no attributes)
    Attributes to add to graph as key=value pairs.

See Also
--------
DiGraph
MultiGraph
MultiDiGraph

Examples
--------
Create an empty gr

# Задача

Мы хотим научиться предсказывать класс узла, используя одновременно информацию о взаимном расположении узлов в сети и информацию о комментариях.

**Задача** - двухклассовая классификация;    
**Целевая метка** - тип узла (нормальный или хейтер; узлы с меткой «другой» в классификации не участвуют);    
**Используемые данные** - эмбеддинг узла и GloVe-эмбеддинг комментариев.

Методы:
1. Node2Vec+GloVe;
2. GraphSage.

# Предобработка

Конвертируем граф в StellarGraph формат:

In [None]:
# Wrap the NetworkX graph loaded from users.edges in a StellarGraph object
# (no node features are attached at this point).
G = StellarGraph.from_networkx(twnet)

Вытаскиваем GloVe из таблицы:

In [None]:
# Columns 1..300 hold the GloVe vector of each user, column 301 the class label.
glove = twdata.iloc[:, 1:301].to_numpy()
Y = twdata[301].to_numpy()
print(glove.shape)

(100386, 300)


Оставляем только размеченные узлы (normal / hateful), отбрасывая класс other:

In [None]:
# Keep only the annotated users, i.e. rows whose label is not 'other'.
is_labelled = Y != 'other'
glove_nh = glove[is_labelled]
Y_nh = Y[is_labelled]
# Column 0 of twdata is taken as the node id of each remaining row.
node_names = twdata.loc[is_labelled, 0].to_numpy()
print(glove_nh.shape)

(4971, 300)


In [None]:
# Encode the string labels as integers: 'normal' -> 0, 'hateful' -> 1.
class_order = ['normal', 'hateful']
Y_nh = np.array(list(map(class_order.index, Y_nh)))
print(Y_nh)

[0 0 0 ... 0 1 0]


# Node2Vec+GloVe

In [None]:
# Hyperparameters of the Node2Vec stage.
batch_size = 64  # batch size handed to the Node2Vec link/node generators
epochs = 2  # number of epochs passed to model.fit
emb_size = 128  # embedding dimension passed to Node2Vec
walk_number = 10  # passed as n= to BiasedRandomWalk
walk_length = 5  # passed as length= to BiasedRandomWalk

In [None]:
# Biased random walker over G; p and q are passed straight through to BiasedRandomWalk.
rw = BiasedRandomWalk(G, n=walk_number, length=walk_length, p=0.5, q=2.0)
# Unsupervised sampler over all nodes of G, driven by the walker above
# (presumably it yields node pairs taken from the walks; see the StellarGraph docs).
unsupervised_samples = UnsupervisedSampler(G, nodes=list(G.nodes()), walker=rw)
# Batch generator and Node2Vec model with emb_size-dimensional embeddings.
generator = Node2VecLinkGenerator(G, batch_size)
node2vec = Node2Vec(emb_size, generator=generator)
# Keras input/output tensors of the model (two of each, see the cells that display them).
x_inp, x_out = node2vec.in_out_tensors()

In [None]:
# Display the model's input tensors (the notebook shows the value of the last expression).
x_inp

[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'input_1')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'input_2')>]

In [None]:
# Display the model's output tensors (the notebook shows the value of the last expression).
x_out

[<KerasTensor: shape=(None, 128) dtype=float32 (created by layer 'reshape')>,
 <KerasTensor: shape=(None, 128) dtype=float32 (created by layer 'reshape_1')>]

In [None]:
# Link-prediction head: combine the two node embeddings with the 'dot' method
# and squash the result through a sigmoid into a single output.
edge_scorer = link_classification(edge_embedding_method='dot', output_act='sigmoid', output_dim=1)
prediction = edge_scorer(x_out)
# Trainable model from the input tensors to the link score,
# optimised with Adam on binary cross-entropy.
model = keras.Model(outputs=prediction, inputs=x_inp)
model.compile(loss=keras.losses.binary_crossentropy, optimizer=keras.optimizers.Adam(learning_rate=0.001))

link_classification: using 'dot' method to combine node embeddings into edge embeddings


In [None]:
%%time
# Train the link model on batches produced from the unsupervised sampler;
# %%time reports how long the cell took.
history = model.fit(generator.flow(unsupervised_samples), epochs=epochs, verbose=1, shuffle=True)

In [None]:
# Print the layer-by-layer summary of the trained link model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 target_embedding (Embedding)   (None, 1, 128)       12849408    ['input_1[0][0]']                
                                                                                                  
 context_embedding (Embedding)  (None, 1, 128)       12849408    ['input_2[0][0]']                
                                                                                              

## Постобработка для Node2Vec + GloVe

In [None]:
# Reuse the first input/output tensor pair of the trained link model
# as a stand-alone model that maps a node to its embedding.
x_inp_src, x_out_src = x_inp[0], x_out[0]
embedding_model = keras.Model(outputs=x_out_src, inputs=x_inp_src)

In [None]:
# Show the ids of the labelled (non-'other') nodes selected during preprocessing.
print(node_names)

[     0     22     29 ... 100362 100380 100385]


In [None]:
# Compute Node2Vec embeddings for the labelled nodes; ids are passed as strings.
labelled_ids = list(map(str, node_names))
node_gen = Node2VecNodeGenerator(G, batch_size).flow(labelled_ids)
node_embeddings = embedding_model.predict(node_gen, verbose=1)
# Both feature matrices must have one row per labelled node.
for matrix in (node_embeddings, glove_nh):
    print(matrix.shape)

(4971, 128)
(4971, 300)


In [None]:
# Put the graph embedding and the GloVe vector of every node side by side (column-wise).
node_glove = np.hstack((node_embeddings, glove_nh))
print(node_glove.shape)

(4971, 428)


## Обучение и проверка классификатора

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, f1_score

In [None]:
# Stratified split of the row positions into train and test parts.
# A fixed seed makes the split, and therefore the reported scores, reproducible.
train_ix, test_ix = train_test_split(np.arange(Y_nh.shape[0]), stratify=Y_nh, random_state=42)

In [None]:
# Decision tree on the combined Node2Vec + GloVe features.
# random_state fixes the tie-breaking between equally good splits, so repeated runs
# produce the same tree.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(node_glove[train_ix], Y_nh[train_ix])

In [None]:
# Hard class predictions (0 = normal, 1 = hateful) of the tree for the held-out rows.
Y_hat_n2v = dt.predict(node_glove[test_ix])

In [None]:
# ROC-AUC needs a continuous score. With hard 0/1 predictions it collapses to
# balanced accuracy, so it is computed from the predicted probability of the
# positive class (1 = hateful). The other two metrics keep using the hard labels.
Y_true_test = Y_nh[test_ix]
Y_score_n2v = dt.predict_proba(node_glove[test_ix])[:, 1]
print('Balance accuracy score: {:.3f}'.format(balanced_accuracy_score(y_pred=Y_hat_n2v, y_true=Y_true_test)))
print('ROC-AUC score: {:.3f}'.format(roc_auc_score(y_score=Y_score_n2v, y_true=Y_true_test)))
print('F1 score: {:.3f}'.format(f1_score(y_pred=Y_hat_n2v, y_true=Y_true_test)))

Balance accuracy score: 0.649
ROC-AUC score: 0.649
F1 score: 0.382


# GraphSage

In [None]:
# Inspect the node features stored in G. G was built without node_features,
# so presumably this is empty (TODO confirm).
print(G.node_features())

In [None]:
# Index the GloVe features by the actual node ids (column 0 of twdata, the same
# column node_names is taken from) instead of assuming that row position equals
# node id. Ids are converted to strings, as in the node generators' flow() calls.
node_data = pd.DataFrame(glove, index=twdata[0].astype(str).to_numpy())

In [None]:
# Rebuild the StellarGraph from the same NetworkX graph, this time attaching
# node_data (the GloVe vectors) as node features.
G2 = StellarGraph.from_networkx(twnet, node_features=node_data)

In [None]:
# Hyperparameters of the GraphSAGE stage (batch_size and epochs overwrite the Node2Vec values).
batch_size = 64  # batch size handed to the GraphSAGE link/node generators
epochs = 1  # number of epochs passed to model.fit
number_of_walks = 1  # passed as number_of_walks= to UnsupervisedSampler
length = 5  # passed as length= to UnsupervisedSampler
num_samples = [10, 5]  # neighbour sample sizes handed to the generators (presumably one per hop)
layer_sizes = [50, 50]  # passed to GraphSAGE; the resulting node embeddings have 50 columns

In [None]:
# Unsupervised sampler over all nodes of G2, configured with the walk length and
# the number of walks set above (no explicit walker is passed here).
unsupervised_samples = UnsupervisedSampler(
    G2,
    nodes=list(G2.nodes()),
    length=length,
    number_of_walks=number_of_walks
)

In [None]:
# Link batch generator for GraphSAGE on the feature-carrying graph G2.
generator = GraphSAGELinkGenerator(G2, batch_size, num_samples)

In [None]:
# train_gen = generator.flow(unsupervised_samples)

In [None]:
# GraphSAGE model configured from layer_sizes and the link generator, with bias terms enabled.
graphsage = GraphSAGE(layer_sizes, generator=generator, bias=True)

In [None]:
# Keras input/output tensors of the GraphSAGE model; this rebinds x_inp / x_out
# from the Node2Vec section.
x_inp, x_out = graphsage.in_out_tensors()



In [None]:
# Link-prediction head: combine the node embeddings with the 'ip' method and
# apply a sigmoid to obtain a single output.
prediction = link_classification(output_dim=1, output_act='sigmoid', edge_embedding_method='ip')(x_out)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [None]:
# Trainable model from the GraphSAGE input tensors to the link score.
model = keras.Model(inputs=x_inp, outputs=prediction)

In [None]:
# Adam optimiser (learning rate 0.001) with binary cross-entropy loss.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss=keras.losses.binary_crossentropy)

In [None]:
%%time
# Train the GraphSAGE link model on batches produced from the unsupervised sampler.
# The recorded run took about 13.5 minutes of wall time for one epoch.
history = model.fit(generator.flow(unsupervised_samples), epochs=epochs, verbose=1, shuffle=True)

CPU times: user 14min 30s, sys: 14 s, total: 14min 44s
Wall time: 13min 34s


## Постобработка для GraphSage

In [None]:
# Build a node-embedding model by reusing tensors of the trained link model.
# x_inp[0::2] keeps every second input tensor and x_out[0] the first output;
# presumably these are the tensors belonging to the first node of each pair
# (TODO confirm against the StellarGraph documentation).
x_inp_src = x_inp[0::2]
x_out_src = x_out[0]
embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

In [None]:
# Node batch generator over the labelled nodes; ids are passed as strings.
node_gen = GraphSAGENodeGenerator(G2, batch_size, num_samples).flow([str(a) for a in node_names])

In [None]:
# Embed the labelled nodes batch by batch (b[0] is the input part of each batch)
# and stack the per-batch results row-wise. np.vstack replaces np.row_stack,
# an alias that is deprecated in NumPy 2.0.
node_embeddings = np.vstack([embedding_model.predict(b[0], verbose=0) for b in node_gen])
print(node_embeddings.shape)

(4971, 50)


## Обучение и проверка классификатора

In [None]:
# Stratified split of the row positions into train and test parts.
# A fixed seed makes the split, and therefore the reported scores, reproducible.
train_ix, test_ix = train_test_split(np.arange(Y_nh.shape[0]), stratify=Y_nh, random_state=42)

In [None]:
# Decision tree on the GraphSAGE node embeddings.
# random_state fixes the tie-breaking between equally good splits, so repeated runs
# produce the same tree.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(node_embeddings[train_ix], Y_nh[train_ix])

In [None]:
# Hard class predictions (0 = normal, 1 = hateful) of the tree for the held-out rows.
# NOTE(review): the name Y_hat_n2v is reused from the Node2Vec section although
# these are predictions on GraphSAGE embeddings.
Y_hat_n2v = dt.predict(node_embeddings[test_ix])

In [None]:
# ROC-AUC needs a continuous score. With hard 0/1 predictions it collapses to
# balanced accuracy, so it is computed from the predicted probability of the
# positive class (1 = hateful). Scores are printed with three decimals, matching
# the Node2Vec + GloVe section.
Y_true_test = Y_nh[test_ix]
Y_score_sage = dt.predict_proba(node_embeddings[test_ix])[:, 1]
print('Balance accuracy score: {:.3f}'.format(balanced_accuracy_score(y_pred=Y_hat_n2v, y_true=Y_true_test)))
print('ROC-AUC score: {:.3f}'.format(roc_auc_score(y_score=Y_score_sage, y_true=Y_true_test)))
print('F1 score: {:.3f}'.format(f1_score(y_pred=Y_hat_n2v, y_true=Y_true_test)))

Balance accuracy score: 0.7186354216483342
ROC-AUC score: 0.718635421648334
F1 score: 0.46451612903225803
