In [51]:
import pickle
import copy
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import networkx as nx
import scipy
import math as math

In [52]:
path = 'data/Human_protein/'

In [53]:
fp = open(path+'missing_edges_final.p', "rb")
missing_edges = pickle.load(fp)


In [54]:
df_neg = pd.DataFrame(list(missing_edges), columns=['Source', 'Destination'])
print (df_neg.shape)
df_neg.head(5)

(6452, 2)


Unnamed: 0,Source,Destination
0,906,1123
1,466,558
2,1949,412
3,610,2000
4,2045,120


In [55]:
df_pos = pd.read_csv(path+'data.csv')
df_pos = df_pos.rename(columns = {'From':'Source', 'To':'Destination'})
df_pos = df_pos.drop_duplicates()
print (df_pos.shape)
df_pos.head(5)

(6452, 2)


Unnamed: 0,Source,Destination
0,1,2
1,3,4
2,5,6
3,1,7
4,8,9


In [56]:
df_pos['Class'] = 1
df_pos.head(5)

Unnamed: 0,Source,Destination,Class
0,1,2,1
1,3,4,1
2,5,6,1
3,1,7,1
4,8,9,1


In [57]:
df_neg['Class'] = 0
df_neg.head(5)

Unnamed: 0,Source,Destination,Class
0,906,1123,0
1,466,558,0
2,1949,412,0
3,610,2000,0
4,2045,120,0


In [58]:
frames = [df_pos[:91500], df_neg]
df = pd.concat(frames)
print(df.shape)
df.head(5)

(12904, 3)


Unnamed: 0,Source,Destination,Class
0,1,2,1
1,3,4,1
2,5,6,1
3,1,7,1
4,8,9,1


In [59]:
df.to_csv(path+'dataset.csv')

In [60]:
g = nx.from_pandas_edgelist(df[['Source','Destination']], source='Source', target='Destination',create_using=nx.DiGraph())
nx.info(g)

'DiGraph with 2240 nodes and 12892 edges'

In [61]:
g.number_of_nodes()

2240

In [62]:
pr = nx.pagerank(g)
df['Page_Rank_Src'] = df.Source.apply(lambda row: pr.get(row))
df['Page_Rank_Dst'] = df.Destination.apply(lambda row: pr.get(row))

In [63]:
# Shortest Path
def get_shortest_path(a, b):
    p=-1
    try:
        if g.has_edge(a,b):
            g.remove_edge(a,b)
            p= nx.shortest_path_length(g,source=a,target=b)
            g.add_edge(a,b)
        else:
            p= nx.shortest_path_length(g,source=a,target=b)
        return p
    except:
        return -1

df['Shortest_Path'] = df.apply(lambda row: get_shortest_path(row['Source'], row['Destination']), axis = 1)

In [64]:
def calc_adar_in(a,b):
    sum=0
    try:
        n=list(set(g.successors(a)).intersection(set(g.successors(b))))
        if len(n)!=0:
            for i in n:
                sum=sum+(1/np.log10(len(list(g.predecessors(i)))))
            return sum
        else:
            return 0
    except:
        return 0

df['adar'] = df.apply(lambda row: calc_adar_in(row['Source'], row['Destination']), axis = 1)

In [65]:
katz = nx.katz.katz_centrality(g,alpha=0.005,beta=1)
mean_katz = float(sum(katz.values())) / len(katz)

df['katz_s'] = df.Source.apply(lambda x: katz.get(x,mean_katz))
df['katz_d'] = df.Destination.apply(lambda x: katz.get(x,mean_katz))

In [66]:
def jaccard_for_followees(a,b):
    try:
        if len(set(g.successors(a))) == 0  | len(set(g.successors(b))) == 0:
            return 0
        sim = (len(set(g.successors(a)).intersection(set(g.successors(b)))))/\
                                    (len(set(g.successors(a)).union(set(g.successors(b)))))
    except:
        return 0
    return sim


def jaccard_for_followers(a,b):
    try:
        if len(set(g.predecessors(a))) == 0  | len(set(g.predecessors(b))) == 0:
            return 0
        sim = (len(set(g.predecessors(a)).intersection(set(g.predecessors(b)))))/\
                                 (len(set(g.predecessors(a)).union(set(g.predecessors(b)))))
        return sim
    except:
        return 0

df['jaccard_followers'] = df.apply(lambda row:
                                        jaccard_for_followers(row['Source'],row['Destination']),axis=1)

df['jaccard_followees'] = df.apply(lambda row:
                                        jaccard_for_followees(row['Source'],row['Destination']),axis=1)

In [67]:
def cosine_for_followees(a,b):
    try:
        if len(set(g.successors(a))) == 0  | len(set(g.successors(b))) == 0:
            return 0
        sim = (len(set(g.successors(a)).intersection(set(g.successors(b)))))/\
                                    (math.sqrt(len(set(g.successors(a)))*len((set(g.successors(b))))))
        return sim
    except:
        return 0

def cosine_for_followers(a,b):
    try:
        
        if len(set(g.predecessors(a))) == 0  | len(set(g.predecessors(b))) == 0:
            return 0
        sim = (len(set(g.predecessors(a)).intersection(set(g.predecessors(b)))))/\
                                     (math.sqrt(len(set(g.predecessors(a))))*(len(set(g.predecessors(b)))))
        return sim
    except:
        return 0

df['cosine_followers'] = df.apply(lambda row:
                                        cosine_for_followees(row['Source'],row['Destination']),axis=1)

df['cosine_followees'] = df.apply(lambda row:
                                        cosine_for_followers(row['Source'],row['Destination']),axis=1)

In [68]:
# wcc=list(nx.weakly_connected_components(g))

# def same_wcc(a,b):
#     index = []
#     if g.has_edge(b,a):
#         return 1
#     if g.has_edge(a,b):
#             for i in wcc:
#                 if a in i:
#                     index= i
#                     break
#             if (b in index):
#                 g.remove_edge(a,b)
#                 if get_shortest_path(a,b)==-1:
#                     g.add_edge(a,b)
#                     return 0
#                 else:
#                     g.add_edge(a,b)
#                     return 1
#             else:
#                 return 0
#     else:
#             for i in wcc:
#                 if a in i:
#                     index= i
#                     break
#             if(b in index):
#                 return 1
#             else:
#                 return 0

# df['wcc'] = df.apply(lambda row: same_wcc(row['Source'],row['Destination']),axis=1)

In [69]:
df.head(5)

Unnamed: 0,Source,Destination,Class,Page_Rank_Src,Page_Rank_Dst,Shortest_Path,adar,katz_s,katz_d,jaccard_followers,jaccard_followees,cosine_followers,cosine_followees
0,1,2,1,0.000253,0.000651,2,0.0,0.020863,0.022439,0.0,0.0,0.0,0.0
1,3,4,1,0.000844,0.000373,5,0.0,0.021082,0.02108,0.0,0.0,0.0,0.0
2,5,6,1,8.7e-05,0.000384,3,0.0,0.020552,0.022019,0.0,0.0,0.0,0.0
3,1,7,1,0.000253,0.00061,2,0.0,0.020863,0.022018,0.0,0.0,0.0,0.0
4,8,9,1,0.000416,0.000489,2,0.0,0.021291,0.022958,0.034483,0.0,0.0,0.016433


In [70]:
df.to_csv(path+'final_dataset.csv')