In [9]:
import networkx as nx
import pandas as pd

# Load the edge list and discard the 'Unnamed: 0' column before any features are added.
df = pd.read_csv('./data/subset_complete.csv').drop(columns=['Unnamed: 0'])

# Build a directed graph with one edge per (Source, Destination) row.
# NOTE(review): every row becomes an edge regardless of its `Class` value —
# confirm that is intended if the file ever contains more than one class.
g = nx.from_pandas_edgelist(
    df[['Source', 'Destination']],
    source='Source',
    target='Destination',
    create_using=nx.DiGraph(),
)
print(g)

DiGraph with 1000 nodes and 4297 edges


In [10]:
# Page Rank
# Score every node once, then look the score up for both endpoints of each row.
pr = nx.pagerank(g)
for endpoint, column in (('Source', 'Page_Rank_Src'), ('Destination', 'Page_Rank_Dst')):
    df[column] = df[endpoint].map(pr.get)

In [11]:
# Shortest Path
def get_shortest_path(x, y):
    """Return the directed shortest-path length from x to y in `g`, ignoring a direct x -> y edge.

    If the edge x -> y exists it is taken out of `g` for the duration of the
    search and always put back afterwards, so the result is the length of the
    best *alternative* route and `g` is left with the same edges it had before
    the call.

    Parameters
    ----------
    x, y : node
        Source and target node ids.

    Returns
    -------
    int
        Number of hops on the shortest path found, or -1 when no path exists
        or either node is not in `g`.
    """
    # Remember the edge (and any attributes) so it can be restored afterwards.
    edge_data = dict(g[x][y]) if g.has_edge(x, y) else None
    if edge_data is not None:
        g.remove_edge(x, y)
    try:
        return nx.shortest_path_length(g, source=x, target=y)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        return -1
    finally:
        # Restore the edge even when the search raised; otherwise every
        # "no alternative path" row would permanently delete an edge from `g`.
        if edge_data is not None:
            g.add_edge(x, y)
            g[x][y].update(edge_data)

# One shortest-path value per (Source, Destination) row; -1 marks "no path found".
shortest_paths = df.apply(
    lambda edge: get_shortest_path(edge['Source'], edge['Destination']),
    axis=1,
)
df['Shortest_Path'] = shortest_paths

In [12]:
# Preview the first ten rows to sanity-check the PageRank and shortest-path columns.
df.head(n=10)

Unnamed: 0,Source,Destination,Class,Page_Rank_Src,Page_Rank_Dst,Shortest_Path
0,63500,11402,1,0.000259,0.003184,5
1,63500,101884,1,0.000259,0.001268,5
2,63500,8593,1,0.000259,0.00604,4
3,63500,52828,1,0.000259,0.001212,5
4,63500,17364,1,0.000259,0.00152,4
5,16408,7962,1,0.001257,0.001644,5
6,16408,15547,1,0.001257,0.002545,4
7,16408,5648,1,0.001257,0.002208,4
8,16408,16767,1,0.001257,0.001641,5
9,16408,27660,1,0.001257,0.002124,5


In [13]:
# Follow Features
# For each (Source, Destination) row, record how many predecessors ("followers")
# and successors ("followees") each endpoint has in `g`, plus how many of each
# the two endpoints have in common.
followers_src, followers_dst, followees_src, followees_dst, int_followers, int_followees = [], [], [], [], [], []

# Iterate over the two id columns directly: iterrows() would build a Series per
# row and upcast the integer node ids to float once float columns exist.
for src, dst in zip(df['Source'], df['Destination']):
    # set() of an empty neighbour iterator is already an empty set, so no
    # fallback branch is needed and each set is built exactly once.
    pre_src = set(g.predecessors(src))
    suc_src = set(g.successors(src))

    pre_dst = set(g.predecessors(dst))
    suc_dst = set(g.successors(dst))

    followers_src.append(len(pre_src))
    followees_src.append(len(suc_src))

    followers_dst.append(len(pre_dst))
    followees_dst.append(len(suc_dst))

    int_followers.append(len(pre_src & pre_dst))
    int_followees.append(len(suc_src & suc_dst))

df['Followers_Src'] = followers_src
df['Followees_Src'] = followees_src
df['Followers_Dst'] = followers_dst
df['Followees_Dst'] = followees_dst
df['Int_Followers'] = int_followers
df['Int_Followees'] = int_followees

In [14]:
# Preview the first five rows to sanity-check the follower / followee columns.
df.head(n=5)

Unnamed: 0,Source,Destination,Class,Page_Rank_Src,Page_Rank_Dst,Shortest_Path,Followers_Src,Followees_Src,Followers_Dst,Followees_Dst,Int_Followers,Int_Followees
0,63500,11402,1,0.000259,0.003184,5,0,6,22,0,0,0
1,63500,101884,1,0.000259,0.001268,5,0,6,5,0,0,0
2,63500,8593,1,0.000259,0.00604,4,0,6,39,2,0,0
3,63500,52828,1,0.000259,0.001212,5,0,6,5,0,0,0
4,63500,17364,1,0.000259,0.00152,4,0,6,13,0,0,0


In [15]:
# Persist the feature table. The row index is written as an unnamed first
# column, which pandas labels 'Unnamed: 0' when the file is read back without
# index_col — readers of this file need to drop or ignore it.
df.to_csv('./data/final_dataset.csv', index=True)