In [None]:
import os
import networkx as nx
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from stellargraph import StellarGraph
from stellargraph.data import BiasedRandomWalk
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import *
from functions import draw_graph

In [None]:
# Load the weighted projected graph (node ids are read as strings) and wrap it for StellarGraph.
# NOTE(review): `dir` is not imported in any visible cell, so it resolves to the builtin `dir`,
# which has no `graphs_dir` attribute — confirm which path-config module is meant to be imported.
fname = os.path.join(dir.graphs_dir, "projected_graph_with_integer_labels.edgelist")
g = nx.read_weighted_edgelist(path=fname, nodetype=str)
G = StellarGraph.from_networkx(g)

In [None]:
# Read the pickled ground-truth clustering (an iterable of node collections).
# NOTE(review): pickle.load can execute arbitrary code from the file; only load trusted data.
fname = os.path.join(dir.data_dir, "ground_truth_clusters_pickle")
with open(fname, 'rb') as fp:
    clusters = pickle.load(fp)

# Map every node to the position of the cluster that contains it
# (a node listed in several clusters keeps the last one seen).
ground_truth_label = {
    member: cluster_id
    for cluster_id, cluster in enumerate(clusters)
    for member in cluster
}

# Labels as a list ordered by integer node id 0..233.
ground_truth_labels = [ground_truth_label[node] for node in range(234)]

In [None]:
# Random-walk generator bound to the StellarGraph `G`; its `run` method samples the walks.
rw = BiasedRandomWalk(G)

### Evaluation for different values of p, q

In [None]:
# Grids of the random-walk bias parameters p and q to sweep over.
p = [0.25, 0.5, 1, 2, 4]
q = list(p)  # independent copy, so editing one grid cannot silently change the other

# Two-level (p, q) row index derived from the grids instead of being typed out by hand:
# every p is paired with every q, with p varying slowest. Deriving it keeps the index
# in sync with the values the sweep iterates over.
arrays = [
    np.repeat(np.asarray(p, dtype=float), len(q)),
    np.tile(np.asarray(q, dtype=float), len(p)),
]
eval_df = pd.DataFrame(index=arrays)

In [None]:
# Sweep the (p, q) grid: sample weighted biased walks, embed the nodes with Word2Vec,
# cluster the embeddings with k-means and score the clustering against the ground truth.
for u in p:
    for v in q:
        weighted_walks = rw.run(
            nodes=G.nodes(),  # root nodes
            length=100,  # maximum length of a random walk
            n=30,  # number of random walks per root node
            p=u,  # defines (unnormalised) probability, 1/p, of returning to source node
            q=v,  # defines (unnormalised) probability, 1/q, for moving away from source node
            weighted=True,  # for weighted random walks
            seed=2,
        )

        weighted_model = Word2Vec(weighted_walks, size=128, window=5, min_count=0, sg=1, workers=1, iter=1)

        # One embedding row per node, ordered by integer node id so that row i
        # lines up with ground_truth_labels[i].
        df = pd.DataFrame(
            [weighted_model.wv[str(n)] for n in range(234)],
            index=list(range(234)),
        )
        embeddings = df.values

        # Fixed random_state so the centroid initialisation, and therefore the saved
        # table, is reproducible across runs.
        km = KMeans(n_clusters=4, random_state=2)
        km.fit(embeddings)
        labels_pred = km.labels_

        # scikit-learn's signature is (labels_true, labels_pred). MI, NMI, AMI and Rand are
        # symmetric, but homogeneity and completeness trade places when the arguments are
        # swapped, so the ground truth must be passed first.
        scores = {
            "MI": mutual_info_score(ground_truth_labels, labels_pred),
            "NMI": normalized_mutual_info_score(ground_truth_labels, labels_pred),
            "AMI": adjusted_mutual_info_score(ground_truth_labels, labels_pred),
            "Rand": rand_score(ground_truth_labels, labels_pred),
            "Hom": homogeneity_score(ground_truth_labels, labels_pred),
            "Comp": completeness_score(ground_truth_labels, labels_pred),
        }
        for metric_name, value in scores.items():
            eval_df.loc[(u, v), metric_name] = value

In [None]:
# Persist the (p, q) sweep metrics as a CSV table.
fname = os.path.join(dir.tables_dir, "kmeans_metrics_different_p_q.csv")
eval_df.to_csv(path_or_buf=fname)

### p=q=0.25

In [None]:
# Walk settings for the p = q = 0.25 configuration.
walk_params = dict(
    length=100,     # maximum length of a random walk
    n=30,           # number of random walks per root node
    p=0.25,         # defines (unnormalised) probability, 1/p, of returning to source node
    q=0.25,         # defines (unnormalised) probability, 1/q, for moving away from source node
    weighted=True,  # for weighted random walks
    seed=2,
)

# Every node of the graph is used as a root node.
weighted_walks = rw.run(nodes=G.nodes(), **walk_params)

### Evaluation for different dimensions

In [None]:
# Empty results table with one row per embedding size to be evaluated.
embedding_sizes = [4, 8, 16, 32, 64, 128]
eval_df = pd.DataFrame(index=embedding_sizes)

In [None]:
# For each embedding size: train Word2Vec on the walks sampled above, cluster the
# embeddings with k-means and score the clustering against the ground truth.
for dimensions in [4, 8, 16, 32, 64, 128]:
    weighted_model = Word2Vec(weighted_walks, size=dimensions, window=5, min_count=0, sg=1, workers=1, iter=1)

    # One embedding row per node, ordered by integer node id so that row i
    # lines up with ground_truth_labels[i].
    df = pd.DataFrame(
        [weighted_model.wv[str(n)] for n in range(234)],
        index=list(range(234)),
    )
    embeddings = df.values

    # Fixed random_state so the centroid initialisation, and therefore the saved
    # table, is reproducible across runs.
    km = KMeans(n_clusters=4, random_state=2)
    km.fit(embeddings)
    labels_pred = km.labels_

    # scikit-learn's signature is (labels_true, labels_pred). MI, NMI, AMI and Rand are
    # symmetric, but homogeneity and completeness trade places when the arguments are
    # swapped, so the ground truth must be passed first.
    scores = {
        "MI": mutual_info_score(ground_truth_labels, labels_pred),
        "NMI": normalized_mutual_info_score(ground_truth_labels, labels_pred),
        "AMI": adjusted_mutual_info_score(ground_truth_labels, labels_pred),
        "Rand": rand_score(ground_truth_labels, labels_pred),
        "Hom": homogeneity_score(ground_truth_labels, labels_pred),
        "Comp": completeness_score(ground_truth_labels, labels_pred),
    }
    for metric_name, value in scores.items():
        eval_df.loc[dimensions, metric_name] = value

In [None]:
# Persist the per-dimension metrics as a CSV table.
fname = os.path.join(dir.tables_dir, "kmeans_metrics_different_dimensions.csv")
eval_df.to_csv(path_or_buf=fname)

### p=0.25, q=0.25, dimensions=8

### Evaluation for k=4 and k=5

In [None]:
# Train the 8-dimensional embedding model on the previously sampled walks.
weighted_model = Word2Vec(
    sentences=weighted_walks,
    size=8,
    window=5,
    min_count=0,
    sg=1,
    workers=1,
    iter=1,
)

In [None]:
# Stack the learned vectors into a table with one row per node, ordered by integer
# node id (the model is keyed by the string form of the id).
node_ids = range(234)
df = pd.DataFrame(
    [weighted_model.wv[str(node_id)] for node_id in node_ids],
    index=list(node_ids),
)

# Plain array view of the embeddings for clustering.
embeddings = df.values

### k=4 

In [None]:
# Cluster the embeddings into 4 groups. The random_state is fixed so the cluster
# assignment (and everything derived from it: metrics table, bar chart, graph colouring)
# is the same on every run.
km = KMeans(n_clusters=4, random_state=2)
km.fit(embeddings)
labels_pred_4_clusters = km.labels_

In [None]:
# Score the 4-cluster assignment against the ground truth.
# scikit-learn's signature is (labels_true, labels_pred). The first four metrics are
# symmetric, but homogeneity and completeness trade places when the arguments are
# swapped, so the ground truth must be passed first.
mi = mutual_info_score(ground_truth_labels, labels_pred_4_clusters)
nmi = normalized_mutual_info_score(ground_truth_labels, labels_pred_4_clusters)
ami = adjusted_mutual_info_score(ground_truth_labels, labels_pred_4_clusters)
rand = rand_score(ground_truth_labels, labels_pred_4_clusters)
hom = homogeneity_score(ground_truth_labels, labels_pred_4_clusters)
comp = completeness_score(ground_truth_labels, labels_pred_4_clusters)

In [None]:
# Single-row summary table holding the k=4 metrics.
eval_df_4_clusters = pd.DataFrame({
    'Clusters': [4],
    'MI': [mi],
    'NMI': [nmi],
    'AMI': [ami],
    'Rand': [rand],
    'Hom': [hom],
    'Comp': [comp],
})

### k=5

In [None]:
# Cluster the embeddings into 5 groups. The random_state is fixed so the cluster
# assignment, and the metrics derived from it, are the same on every run.
km = KMeans(n_clusters=5, random_state=2)
km.fit(embeddings)
labels_pred_5_clusters = km.labels_

In [None]:
# Score the 5-cluster assignment against the ground truth.
# scikit-learn's signature is (labels_true, labels_pred). The first four metrics are
# symmetric, but homogeneity and completeness trade places when the arguments are
# swapped, so the ground truth must be passed first.
mi = mutual_info_score(ground_truth_labels, labels_pred_5_clusters)
nmi = normalized_mutual_info_score(ground_truth_labels, labels_pred_5_clusters)
ami = adjusted_mutual_info_score(ground_truth_labels, labels_pred_5_clusters)
rand = rand_score(ground_truth_labels, labels_pred_5_clusters)
hom = homogeneity_score(ground_truth_labels, labels_pred_5_clusters)
comp = completeness_score(ground_truth_labels, labels_pred_5_clusters)

In [None]:
# Single-row summary table holding the k=5 metrics.
eval_df_5_clusters = pd.DataFrame({
    'Clusters': [5],
    'MI': [mi],
    'NMI': [nmi],
    'AMI': [ami],
    'Rand': [rand],
    'Hom': [hom],
    'Comp': [comp],
})

In [None]:
# Stack the k=4 and k=5 summaries (each row keeps its original index label)
# and write the combined table to CSV.
df = pd.concat(objs=[eval_df_4_clusters, eval_df_5_clusters])
fname = os.path.join(dir.tables_dir, "kmeans_metrics_different_k.csv")
df.to_csv(path_or_buf=fname)

In [None]:
# Flat lists of the six metric values for k=4 (a) and k=5 (b), in plotting order.
metric_columns = ['MI', 'NMI', 'AMI', 'Rand', 'Hom', 'Comp']
a = df.loc[df['Clusters'] == 4, metric_columns].to_numpy().ravel().tolist()
b = df.loc[df['Clusters'] == 5, metric_columns].to_numpy().ravel().tolist()

In [None]:
# Grouped bar chart comparing the clustering metrics for k=4 and k=5.
barWidth = 0.25
# plt.subplots returns a (figure, axes) pair; unpack it rather than storing the tuple as `fig`.
fig, ax = plt.subplots(figsize=(12, 8))

# x positions of the two bar series: k=4 at the integer positions, k=5 one bar width to the right.
br1 = np.arange(len(a))
br2 = [x + barWidth for x in br1]

ax.bar(br1, a, color='r', width=barWidth,
       edgecolor='grey', label='k=4')
ax.bar(br2, b, color='g', width=barWidth,
       edgecolor='grey', label='k=5')

# Put each tick midway between the two bars of a pair so the label sits under the group,
# not under the k=5 bar alone.
ax.set_xticks([r + barWidth / 2 for r in range(len(a))])
ax.set_xticklabels(['MI', 'NMI', 'AMI', 'Rand', 'Hom', 'Comp'])
ax.set_ylabel('Score')

fname = os.path.join(dir.plots_dir, "metrics_for_different_k.png")
ax.legend()
fig.savefig(fname, bbox_inches='tight', pad_inches=0)

### Graph Visualisation

In [None]:
# Number of nodes assigned to each of the 4 predicted clusters, keyed by cluster id.
party_length = {
    cluster_id: sum(1 for assigned in labels_pred_4_clusters if assigned == cluster_id)
    for cluster_id in range(4)
}

{0: 9, 1: 72, 2: 131, 3: 22}

In [None]:
# Assign one colour per cluster by size rank: smallest cluster black, then green, then red,
# largest blue. This reproduces the colouring the previously hard-coded sizes (9, 22, 72 and
# the remainder) gave, but keeps every cluster distinguishable when a run yields other sizes.
size_ranked_palette = ['k', 'g', 'r', 'b']

# Ties in size are broken by cluster id so the mapping is deterministic.
clusters_by_size = sorted(
    party_length,
    key=lambda cluster_id: (party_length[cluster_id], cluster_id),
)
colors = dict(zip(clusters_by_size, size_ranked_palette))

In [None]:
# Draw the graph with nodes labelled by ground-truth cluster and coloured by predicted cluster.
node_ids = list(G.nodes())

# Node ids are strings of integers; convert them to index the integer-keyed lookups.
labels = {node_id: ground_truth_label[int(node_id)] for node_id in node_ids}
# NOTE(review): colours follow the node order of the StellarGraph `G`, while the networkx
# graph `g` is what gets drawn — presumably both share the same node order; confirm in draw_graph.
color_map = [colors[labels_pred_4_clusters[int(node_id)]] for node_id in node_ids]

fname = os.path.join(dir.plots_dir, "kmeans_graph.png")
draw_graph(g, labels, color_map, fname)