In [22]:
with open("readme-Ego.txt") as f:
    print(f.read())

Files:

nodeId.edges : The edges in the ego network for the node 'nodeId'. Edges are undirected for facebook, and directed (a follows b) for twitter and gplus. The 'ego' node does not appear, but it is assumed that they follow every node id that appears in this file.

nodeId.circles : The set of circles for the ego node. Each line contains one circle, consisting of a series of node ids. The first entry in each line is the name of the circle.

nodeId.feat : The features for each of the nodes that appears in the edge file.

nodeId.egofeat : The features for the ego user.

nodeId.featnames : The names of each of the feature dimensions. Features are '1' if the user has this property in their profile, and '0' otherwise. This file has been anonymized for facebook users, since the names of the features would reveal private data.



In [1]:
import snap
Graph = snap.LoadEdgeList(snap.PNGraph,"facebook_combined.txt",0,1)

In [3]:
for node in Graph.Nodes():
    x = str(node.GetId())+'\t'+str(node.GetInDeg())
print(x)

4038	9


In [13]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [7]:
columns = ["edges","circles","feat","egofeat","featnames"]
dataframe = pd.DataFrame(columns=columns)

In [8]:
import os 
print(len(os.listdir("facebook")))

50


In [16]:
import glob
files = "facebook"
print("no of edges: ",len(glob.glob1(files,"*.edges")))
print("no of circles: ",len(glob.glob1(files,"*.circles")))
print("no of features: ",len(glob.glob1(files,"*.feat")))
print("no of ego features: ",len(glob.glob1(files,"*.egofeat")))
print("no of feature names: ",len(glob.glob1(files,"*.featnames")))

no of edges:  10
no of circles:  10
no of features:  10
no of ego features:  10
no of feature names:  10


In [30]:
edges = []
for i in range(0,10):
    edges.append(os.path.join(files,glob.glob1(files,"*.edges")[i]))
print(edges)

['facebook/414.edges', 'facebook/107.edges', 'facebook/348.edges', 'facebook/0.edges', 'facebook/3437.edges', 'facebook/1684.edges', 'facebook/686.edges', 'facebook/698.edges', 'facebook/3980.edges', 'facebook/1912.edges']


In [33]:
def get_circle_labels(G,circles):
    circle_labels = []
    for node in G.nodes():
        is_added = False 
        for i,j in enumerate(list(circles.values())):
            if node in j and not is_added:
                circle_labels.append(i)
                is_added = True 
        if not is_added:
            circle_labels.append(-1)
    return circle_labels

In [34]:
import networkx as nx 
from networkx.algorithms import approximation
G = nx.read_edgelist("facebook/0.edges",nodetype=int)

In [35]:
def read_circle(file_path):
    with open(file_path) as file:
        content = file.readlines()
        content = [line.replace("\n","") for line in content]
        content = [line.split("\t") for line in content]
    return {circle[0]:list(map(int,circle[1:])) for circle in content}

In [36]:
circles = read_circle("facebook/0.circles")

In [42]:
from sklearn.metrics.pairwise import cosine_similarity
from node2vec import Node2Vec
def node2vec_graph(G,D):
    node2vec_ = Node2Vec(G,dimensions=D)
    model = node2vec_.fit()
    embeddings = model.wv.vectors 
    return embeddings

In [46]:
print(len(circles.keys()))

24


In [47]:
embeddings = node2vec_graph(G,24)

Computing transition probabilities:   0%|          | 0/333 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:07<00:00,  1.28it/s]


In [49]:
print(len(embeddings))

333


In [54]:
cosine_similarity(embeddings,embeddings).shape

(333, 333)

In [59]:
df = pd.read_csv("facebook_combined.txt",sep=" ")
df.head()

Unnamed: 0,node1,node2
0,0,1
1,0,2
2,0,3
3,0,4
4,0,5


In [60]:
G = nx.from_pandas_edgelist(df,source="node1",target="node2")

In [64]:
print(G.number_of_nodes())
print(G.number_of_edges())

4039
88234


In [66]:
edge_list = list(zip(df['node1'],df['node2']))
print(len(edge_list))

88234


In [68]:
KG = nx.Graph(edge_list)
KG.number_of_edges(),KG.number_of_nodes()

(88234, 4039)

In [70]:
n2v_obj = Node2Vec(KG,dimensions=64,walk_length=5,num_walks=10,p=1,q=1,workers=1)
model = n2v_obj.fit(window=10,min_count=1,batch_words=4)

Computing transition probabilities:   0%|          | 0/4039 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:05<00:00,  1.79it/s]


In [73]:
pd.DataFrame(model.wv.get_vector("1")).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.109941,0.11437,0.260207,-0.204787,-0.033026,-0.309398,-0.127483,-0.016349,-0.269928,-0.107301,...,0.345343,-0.02656,-0.332942,-0.490678,-0.06564,0.132676,-0.239566,-0.350931,-0.200073,-0.161932


In [74]:
node_list = df.node1.unique()
node_str = []
for n in node_list:
    node_str.append(str(n))

In [75]:
embedding_df = pd.DataFrame()
for i in node_str:
    t1 = pd.DataFrame(model.wv.get_vector(i)).T 
    embedding_df = embedding_df.append(t1)
    embedding_df = embedding_df.reset_index(drop=True)
embedding_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,-0.141577,0.422342,0.741319,0.062844,-0.414088,-0.755713,-0.444874,0.453469,-0.451458,0.068631,...,0.79389,-0.516783,-0.372835,-0.721481,-0.136324,-0.285114,-0.166992,-0.352336,-0.199985,-0.323505
1,0.109941,0.11437,0.260207,-0.204787,-0.033026,-0.309398,-0.127483,-0.016349,-0.269928,-0.107301,...,0.345343,-0.02656,-0.332942,-0.490678,-0.06564,0.132676,-0.239566,-0.350931,-0.200073,-0.161932
2,-0.116622,-0.030876,0.13812,0.278028,-0.150184,-0.749703,-0.204092,0.303191,-0.063505,-0.260101,...,0.639503,-0.327163,-0.099542,-0.595329,-0.155395,0.004247,0.042324,-0.346758,-0.360339,0.166364
3,0.325891,0.044384,0.248671,-0.175599,0.043785,-0.314687,-0.118714,0.006751,-0.285527,-0.056257,...,0.402996,-0.016822,-0.172614,-0.541016,-0.090544,0.245925,-0.301373,-0.384895,-0.212159,-0.149183
4,-0.173682,0.297153,0.392437,-0.150563,0.015349,-0.564219,-0.360578,0.125311,-0.350296,0.155917,...,0.506387,-0.490707,-0.08123,-0.584529,-0.030725,0.037573,0.074391,-0.32074,-0.388316,-0.31825


In [79]:
embedding_df_ = pd.DataFrame(cosine_similarity(embedding_df))

In [80]:
from sklearn.decomposition import PCA
pcm = PCA(n_components=2)
principal_ = pcm.fit_transform(embedding_df_)
principal_df = pd.DataFrame(data=principal_,columns=["pca1","pca2"])
principal_df.head()

Unnamed: 0,pca1,pca2
0,-1.196518,-0.8705
1,-2.407748,-0.189391
2,-1.905012,-0.764189
3,-3.250059,-0.051262
4,-1.866975,-0.381853
