<font color=red><p style="font-size:32px;text-align:center"><b>FB Featurization</b></p></font>

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pylab as plt
import seaborn as sns

import pickle
import networkx as nx
import math
import os

from pandas import HDFStore,DataFrame
from pandas import read_hdf
from scipy.sparse.linalg import svds, eigs
from tqdm import tqdm

<font color=blue><h3>Reading Data</h3></font>

In [2]:
train_graph = nx.read_edgelist('D:/Applied_Ai/Case Studies/Facebook/My Work/after_eda/train_pos_after_eda.csv', delimiter=',', create_using=nx.DiGraph(), nodetype=int)

In [3]:
print(nx.info(train_graph))

Name: 
Type: DiGraph
Number of nodes: 1780722
Number of edges: 7550015
Average in degree:   4.2399
Average out degree:   4.2399


<font color=blue><h3>Similarity Measures</h3></font>

<font color=green><h4>1. Jaccard Distance</h4></font>

\begin{equation}
j = \frac{|X\cap Y|}{|X \cup Y|} 
\end{equation}

In [4]:
#jaccard_distance_for_followees_
def jaccard_for_followees(a,b):
    try:
        if len(set(train_graph.successors(a))) == 0 | len(set(train_graph.successors(b))) == 0:
            return 0
        sim = (len(set(train_graph.successors(a))).intersection(len(set(train_graph.successors(b)))))/\
                        (len(set(train_graph.successors(a))).union(len(set(train_graph.successors(b)))))
    except:
        return 0
    return sim        

In [5]:
#print_test_case_
print(jaccard_for_followees(13567,12464))

0


In [6]:
#jaccard_distance_for_followers_
def jaccard_for_followers(a,b):
    try:
        if len(set(train_graph.predecessors(a))) == 0 | len(set(train_graph.predecessors(b))) == 0:
            return 0
        sim = (len(set(train_graph.predecessors(a))).intersection(len(set(train_graph.predecessors(b)))))/\
                        (len(set(train_graph.predecessors(a))).union(len(set(train_graph.predecessors(b)))))
    except:
        return 0
    return sim        

In [7]:
print(jaccard_for_followers(13567,12464))

0


<font color=green><h4>2. Cosine Distance</h4></font>

\begin{equation}
CosineDistance = \frac{|X\cap Y|}{|X|\cdot|Y|} 
\end{equation}

In [8]:
#cosine_distance_for_followees_
def cosine_for_followees(a,b):
    try:
        if len(set(train_graph.successors(a))) == 0  | len(set(train_graph.successors(b))) == 0:
            return 0
        sim = (len(set(train_graph.successors(a)).intersection(set(train_graph.successors(b)))))/\
                                    (math.sqrt(len(set(train_graph.successors(a)))*len((set(train_graph.successors(b))))))
        return sim
    except:
        return 0        

In [9]:
#print_test_case_
print(cosine_for_followees(13567,12464))

0.0


In [10]:
#cosine_distance_for_followers_
def cosine_for_followers(a,b):
    try:
        
        if len(set(train_graph.predecessors(a))) == 0  | len(set(train_graph.predecessors(b))) == 0:
            return 0
        sim = (len(set(train_graph.predecessors(a)).intersection(set(train_graph.predecessors(b)))))/\
                                     (math.sqrt(len(set(train_graph.predecessors(a))))*(len(set(train_graph.predecessors(b)))))
        return sim
    except:
        return 0

In [11]:
#print_test_case_
print(cosine_for_followers(2,1635354))

0


<font color=blue><h3>Ranking Measures</h3></font>

<font color=green><h4>Page Ranking</h4></font>

In [12]:
if not os.path.isfile('D:/Applied_Ai/Case Studies/Facebook/My Work/fea_sample/page_rank.p'):
    pr = nx.pagerank(train_graph, alpha=0.85)
    pickle.dump(pr, open('D:/Applied_Ai/Case Studies/Facebook/My Work/fea_sample/page_rank.p','wb'))
else:
    pr = pickle.load(open('D:/Applied_Ai/Case Studies/Facebook/My Work/fea_sample/page_rank.p','rb'))

In [13]:
print('Min', pr[min(pr, key=pr.get)])
print('Max', pr[max(pr, key=pr.get)])
print('Mean', float(sum(pr.values()))/len(pr))

Min 1.6556497245737814e-07
Max 2.7098251341935827e-05
Mean 5.615699699389075e-07


In [14]:
#for imputing to nodes which are not there in Train data
mean_pr = float(sum(pr.values()))/len(pr)
print(mean_pr)

5.615699699389075e-07


<font color=blue><h3>Other Graph Features</h3></font>

<font color=green><h4>1. Shortest Path</h4></font>

Getting shortest path between two nodes, if nodes have direct path i.e directly connected then we are removing that edge and calculating path.

In [15]:
#if has direct edge then deleting that edge and calculating shortest path
def compute_shortest_path_length(a,b):
    p=-1
    try:
        if train_graph.has_edge(a,b):
            train_graph.remove_edge(a,b)
            p= nx.shortest_path_length(train_graph,source=a,target=b)
            train_graph.add_edge(a,b)
        else:
            p= nx.shortest_path_length(train_graph,source=a,target=b)
        return p
    except:
        return -1

In [16]:
#print_test_case_
print(compute_shortest_path_length(27458,1635354))

-1


<font color=green><h4>2. Checking Same Community</h4></font>

In [17]:
#getting weakly connected edges from graph 
wcc = list(nx.weakly_connected_components(train_graph))

def belongs_to_same_wcc(a,b):
    index = []
    if train_graph.has_edge(b,a):
        return 1
    if train_graph.has_edge(a,b):
            for i in wcc:
                if a in i:
                    index= i
                    break
            if (b in index):
                train_graph.remove_edge(a,b)
                if compute_shortest_path_length(a,b)==-1:
                    train_graph.add_edge(a,b)
                    return 0
                else:
                    train_graph.add_edge(a,b)
                    return 1
            else:
                return 0
    else:
            for i in wcc:
                if a in i:
                    index= i
                    break
            if(b in index):
                return 1
            else:
                return 0

In [18]:
#print_test_case_
print(belongs_to_same_wcc(858,163554))

1


In [19]:
#print_test_case_
print(belongs_to_same_wcc(669354,1635354))

0


<font color=green><h4>3. Adamic/Adar Index</h4></font>

Adamic/Adar measures is defined as inverted sum of degrees of common neighbours for given two vertices.
$$A(x,y)=\sum_{u \in N(x) \cap N(y)}\frac{1}{log(|N(u)|)}$$

In [20]:
#adar_index
def calc_adar_in(a,b):
    sum = 0
    try:
        n = list(set(train_graph.successors(a)).intersection(set(train_graph.successors(b))))
        if len(n)!= 0:
            for i in n:
                sum = sum+(1/np.log10(len(list(train_graph.predecessors(i)))))
            return sum
        else:
            return 0
    except:
        return 0

In [21]:
#print_test_case_
print(calc_adar_in(6354,1535))

0


<font color=green><h4>4. Is Person Was Following Back</h4></font>

In [22]:
def follows_back(a,b):
    if train_graph.has_edge(b,a):
        return 1
    else:
        return 0

In [23]:
#print_test_case_
print(follows_back(1,189226))

1


In [24]:
#print_test_case_
print(follows_back(14845,1896))

0


<font color=green><h4>5. Katz Centrality</h4></font>

Katz centrality computes the centrality for a node 
    based on the centrality of its neighbors. It is a 
    generalization of the eigenvector centrality. The
    Katz centrality for node `i` is
 
$$x_i = \alpha \sum_{j} A_{ij} x_j + \beta,$$
where `A` is the adjacency matrix of the graph G 
with eigenvalues $$\lambda$$.

The parameter $$\beta$$ controls the initial centrality and 

$$\alpha < \frac{1}{\lambda_{max}}.$$

In [25]:
if not os.path.isfile('D:/Applied_Ai/Case Studies/Facebook/My Work/fea_sample/katz.p'):
    katz = nx.katz.katz_centrality(train_graph, alpha=0.005, beta=1)
    pickle.dump(katz,open('D:/Applied_Ai/Case Studies/Facebook/My Work/fea_sample/katz.p','wb'))
else:
    katz = pickle.load(open('D:/Applied_Ai/Case Studies/Facebook/My Work/fea_sample/katz.p','rb'))

In [26]:
print('Min', katz[min(katz, key=katz.get)])
print('Max', katz[max(katz, key=katz.get)])
print('Mean', float(sum(katz.values()))/len(katz))

Min 0.0007313532484065916
Max 0.003394554981699122
Mean 0.0007483800935562018


In [27]:
mean_katz = float(sum(katz.values()))/len(katz)
print(mean_katz)

0.0007483800935562018


<font color=green><h4>6. Hits Score</h4></font>

The HITS algorithm computes two numbers for a node. Authorities estimates the node value based on the incoming links. Hubs estimates the node value based on outgoing links.

In [28]:
if not os.path.isfile('D:/Applied_Ai/Case Studies/Facebook/My Work/fea_sample/hits.p'):
    hits = nx.hits(train_graph, max_iter=100, tol=1e-08, nstart=None, normalized=True)
    pickle.dump(hits,open('D:/Applied_Ai/Case Studies/Facebook/My Work/fea_sample/hits.p','wb'))
else:
    hits = pickle.load(open('D:/Applied_Ai/Case Studies/Facebook/My Work/fea_sample/hits.p','rb'))

In [29]:
print('Min', hits[0][min(hits[0], key=hits[0].get)])
print('Max', hits[0][max(hits[0], key=hits[0].get)])
print('Mean', float(sum(hits[0].values()))/len(hits[0]))

Min 0.0
Max 0.004868653378780953
Mean 5.615699699344123e-07


<font color=blue><h3>Featurization</h3></font>

<font color=gren><h4>Reading Sample Data from Both Train and Test</h4></font>

In [30]:
import random
if os.path.isfile('D:/Applied_Ai/Case Studies/Facebook/My Work/after_eda/train_after_eda.csv'):
    filename = "train_after_eda.csv"
    #uncomment this line, if you dont know the lentgh of the file name
    #here we have hardcoded the number of lines as 15100030
    #n_train = sum(1 for line in open(filename)) #number of records in file (excludes header)
    
    n_train =  15100028
    s = 100000 #desired sample size
    
    skip_train = sorted(random.sample(range(1, n_train+1), n_train-s))

In [31]:
if os.path.isfile('D:/Applied_Ai/Case Studies/Facebook/My Work/after_eda/train_after_eda.csv'):
    filename = "test_after_eda.csv"
    #uncomment this line, if you dont know the lentgh of the file name
    #here we have hardcoded the number of lines as 3775008
    #n_test = sum(1 for line in open(filename)) #number of records in file (excludes header)
    
    n_test = 3775006
    s = 50000 #desired sample size
    
    skip_test = sorted(random.sample(range(1, n_test+1), n_test-s))

In [32]:
print("NUMBER OF ROWS IN THE TRAIN DATA FILE -", n_train)
print("NUMBER OF ROWS WE ARE GOING TO ELIMIATE IN TRAIN DATA ARE",len(skip_train))
print("NUMBER OF ROWS IN THE TEST DATA FILE -", n_test)
print("NUMBER OF ROWS WE ARE GOING TO ELIMIATE IN TEST DATA ARE",len(skip_test))

NUMBER OF ROWS IN THE TRAIN DATA FILE - 15100028
NUMBER OF ROWS WE ARE GOING TO ELIMIATE IN TRAIN DATA ARE 15000028
NUMBER OF ROWS IN THE TEST DATA FILE - 3775006
NUMBER OF ROWS WE ARE GOING TO ELIMIATE IN TEST DATA ARE 3725006


In [33]:
train_data_final = pd.read_csv('D:/Applied_Ai/Case Studies/Facebook/My Work/after_eda/train_after_eda.csv', skiprows=skip_train, names=['source_node', 'destination_node'])
train_data_final["indicator_link"] = pd.read_csv('D:/Applied_Ai/Case Studies/Facebook/My Work/train_y.csv', skiprows=skip_train, names=['indicator_link'])

print("SIZE OF TRAIN MATRIX -",train_data_final.shape)
train_data_final.head()

SIZE OF TRAIN MATRIX - (100002, 3)


Unnamed: 0,source_node,destination_node,indicator_link
0,273084,1505602,1
1,1284805,1252082,1
2,1764008,713641,1
3,462722,1698321,1
4,996276,883507,1


In [34]:
test_data_final = pd.read_csv('D:/Applied_Ai/Case Studies/Facebook/My Work/after_eda/test_after_eda.csv', skiprows=skip_test, names=['source_node', 'destination_node'])
test_data_final["indicator_link"] = pd.read_csv('D:/Applied_Ai/Case Studies/Facebook/My Work/test_y.csv', skiprows=skip_test, names=['indicator_link'])

print("SIZE OF TEST MATRIX -",test_data_final.shape)
test_data_final.head()

SIZE OF TEST MATRIX - (50002, 3)


Unnamed: 0,source_node,destination_node,indicator_link
0,848424,784690,1
1,1549883,966112,1
2,471540,1086897,1
3,264958,339685,1
4,1288618,816620,1


<font color=gren><h4>Adding a Set of Features</h4></font>

__We will create these each of these features for both train and test data points__
<ol>
<li>jaccard_followers</li>
<li>jaccard_followees</li>
<li>cosine_followers</li>
<li>cosine_followees</li>
<li>num_followers_s</li>
<li>num_followees_s</li>
<li>num_followers_d</li>
<li>num_followees_d</li>
<li>inter_followers</li>
<li>inter_followees</li>
</ol>

In [35]:
def compute_features_stage1(final_data):
    #calculating no of followers & followees for source and destination
    #calculating intersection of followers & followees for source and destination
    num_followers_s = [] #number of followers of source
    num_followees_s = [] #number of followees of source
    num_followers_d = [] #number of followers of destination
    num_followees_d = [] #number of followees of destination
    inter_followers = [] #number of common followers
    inter_followees = [] #number of common followees
    
    for i,row in final_data.iterrows():
        try:
            s1 = set(train_graph.predecessors(row['source_node']))
            s2 = set(train_graph.successors(row['source_node']))
        except:
            s1 = set()
            s2 = set()
        try:
            d1 = set(train_graph.predecessors(row['destination_node']))
            d2 = set(train_graph.successors(row['destination_node']))
        except:
            d1 = set()
            d2 = set()
            
        num_followers_s.append(len(s1))
        num_followees_s.append(len(s2))

        num_followers_d.append(len(d1))
        num_followees_d.append(len(d2))

        inter_followers.append(len(s1.intersection(d1)))
        inter_followees.append(len(s2.intersection(d2)))
    
    return  num_followers_s, num_followees_s, num_followers_d ,num_followees_d, inter_followers, inter_followees

In [36]:
if not os.path.isfile('storage_sample_stage1.h5'):
    #mapping jaccrd followers to train and test data
    train_data_final['jaccard_followers'] = train_data_final.apply(lambda row:
                                            jaccard_for_followers(row['source_node'],row['destination_node']),axis=1)
    test_data_final['jaccard_followers'] = test_data_final.apply(lambda row:
                                            jaccard_for_followers(row['source_node'],row['destination_node']),axis=1)
    
    #mapping jaccrd followees to train and test data
    train_data_final['jaccard_followees'] = train_data_final.apply(lambda row:
                                            jaccard_for_followees(row['source_node'],row['destination_node']),axis=1)
    test_data_final['jaccard_followees'] = test_data_final.apply(lambda row:
                                            jaccard_for_followees(row['source_node'],row['destination_node']),axis=1)
    #----------------------------------------------------------------------------------------------------------------
    #mapping cosine followers to train and test data
    train_data_final['cosine_followers'] = train_data_final.apply(lambda row:
                                            cosine_for_followers(row['source_node'],row['destination_node']),axis=1)
    test_data_final['cosine_followers'] = test_data_final.apply(lambda row:
                                            cosine_for_followers(row['source_node'],row['destination_node']),axis=1)

    #mapping cosine followees to train and test data
    train_data_final['cosine_followees'] = train_data_final.apply(lambda row:
                                            cosine_for_followees(row['source_node'],row['destination_node']),axis=1)
    test_data_final['cosine_followees'] = test_data_final.apply(lambda row:
                                            cosine_for_followees(row['source_node'],row['destination_node']),axis=1)
    #----------------------------------------------------------------------------------------------------------------
    #compute_features_stage1 to train and test data
    train_data_final['num_followers_s'], train_data_final['num_followers_d'], \
    train_data_final['num_followees_s'], train_data_final['num_followees_d'], \
    train_data_final['inter_followers'], train_data_final['inter_followees'] = compute_features_stage1(train_data_final)
    
    test_data_final['num_followers_s'], test_data_final['num_followers_d'], \
    test_data_final['num_followees_s'], test_data_final['num_followees_d'], \
    test_data_final['inter_followers'], test_data_final['inter_followees'] = compute_features_stage1(test_data_final)
    #----------------------------------------------------------------------------------------------------------------
    
    hdf = HDFStore('storage_sample_stage1.h5')
    hdf.put('train_df', train_data_final, format='table', data_columns=True)
    hdf.put('test_df', test_data_final, format='table', data_columns=True)
    hdf.close()
    
else:
    train_data_final = read_hdf('storage_sample_stage1.h5', 'train_df', mode='r')
    test_data_final = read_hdf('storage_sample_stage1.h5', 'test_df', mode='r')

<font color=gren><h4>Adding New Set of Features</h4></font>

__We will create these each of these features for both train and test data points__
<ol>
<li>Adar Index</li>
<li>Is following back</li>
<li>Belongs to same weakly connect components</li>
<li>Shortest path between source and destination</li>
</ol>

In [37]:
if not os.path.isfile('storage_sample_stage2.h5'):
    #mapping adar index on train and test data
    train_data_final['adar_index'] = train_data_final.apply(lambda row: calc_adar_in(row['source_node'], row['destination_node']), axis=1)
    
    test_data_final['adar_index'] = test_data_final.apply(lambda row: calc_adar_in(row['source_node'], row['destination_node']), axis=1)
    #--------------------------------------------------------------------------------------------------------
    #mapping followback or not on train and test data
    train_data_final['follows_back'] = train_data_final.apply(lambda row: follows_back(row['source_node'], row['destination_node']), axis=1)

    test_data_final['follows_back'] = test_data_final.apply(lambda row: follows_back(row['source_node'], row['destination_node']), axis=1)
    #--------------------------------------------------------------------------------------------------------
    #mapping same component of wcc or not on train and test data
    train_data_final['same_comp'] = train_data_final.apply(lambda row: belongs_to_same_wcc(row['source_node'], row['destination_node']), axis=1)

    test_data_final['same_comp'] = test_data_final.apply(lambda row: belongs_to_same_wcc(row['source_node'], row['destination_node']), axis=1)
    #--------------------------------------------------------------------------------------------------------
    #mapping shortest path on train and test data
    train_data_final['shortest_path'] = train_data_final.apply(lambda row: compute_shortest_path_length(row['source_node'], row['destination_node']), axis=1)
    
    test_data_final['shortest_path'] = test_data_final.apply(lambda row: compute_shortest_path_length(row['source_node'], row['destination_node']), axis=1)
    #----------------------------------------------------------------------------------------------------------------
    
    hdf = HDFStore('storage_sample_stage2.h5')
    hdf.put('train_df', train_data_final, format='table', data_columns=True)
    hdf.put('test_df', test_data_final, format='table', data_columns=True)
    hdf.close()
    
else:
    train_data_final = read_hdf('storage_sample_stage2.h5', 'train_df', mode='r')
    test_data_final = read_hdf('storage_sample_stage2.h5', 'test_df', mode='r')

In [38]:
train_data_final.head()

Unnamed: 0,source_node,destination_node,indicator_link,jaccard_followers,jaccard_followees,cosine_followers,cosine_followees,num_followers_s,num_followers_d,num_followees_s,num_followees_d,inter_followers,inter_followees,adar_index,follows_back,same_comp,shortest_path
0,273084,1505602,1,0,0,0.0,0.0,11,15,6,8,0,0,0.0,0,1,4
1,1284805,1252082,1,0,0,0.0,0.0,70,222,3,3,0,0,0.0,1,1,3
2,1764008,713641,1,0,0,0.0,0.0,28,31,9,7,0,0,0.0,0,1,3
3,462722,1698321,1,0,0,0.0,0.0,1,22,15,16,0,0,0.0,0,1,3
4,996276,883507,1,0,0,0.019838,0.0,21,24,11,5,1,0,0.0,1,1,3


<font color=gren><h4>Adding New Set of Features</h4></font>

__We will create these each of these features for both train and test data points__
<ol>
<li>Weight Features
    <ul>
        <li>Weight of incoming edges</li>
        <li>Weight of outgoing edges</li>
        <li>Weight of incoming edges + Weight of outgoing edges</li>
        <li>Weight of incoming edges * Weight of outgoing edges</li>
        <li>2*Weight of incoming edges + Weight of outgoing edges</li>
        <li>Weight of incoming edges + 2*Weight of outgoing edges</li>
    </ul>
</li>
<li>Page Ranking of Source</li>
<li>Page Ranking of Destination</li>
<li>Katz of source</li>
<li>Katz of dest</li>
<li>Hubs of source</li>
<li>Hubs of dest</li>
<li>Authorities_s of Source</li>
<li>Authorities_s of Destination</li>
</ol>

#### Weight Features

\begin{equation}
W = \frac{1}{\sqrt{1+|X|}}
\end{equation}

it is directed graph so calculated Weighted in and Weighted out differently

In [39]:
#weight for source and destination of each link
Weight_in = {}
Weight_out = {}

for i in  tqdm(train_graph.nodes()):
    s1 = set(train_graph.predecessors(i))
    w_in = 1.0/(np.sqrt(1+len(s1)))
    Weight_in[i] = w_in
    
    s2 = set(train_graph.successors(i))
    w_out = 1.0/(np.sqrt(1+len(s2)))
    Weight_out[i] = w_out
    
#for imputing with mean
mean_weight_in = np.mean(list(Weight_in.values()))
mean_weight_out = np.mean(list(Weight_out.values()))

100%|████████████████████████████████████████████████████████████████████| 1780722/1780722 [00:13<00:00, 134520.33it/s]


In [40]:
if not os.path.isfile('storage_sample_stage3.h5'):
    
    train_data_final['weight_in'] = train_data_final.destination_node.apply(lambda x: Weight_in.get(x,mean_weight_in))
    train_data_final['weight_out'] = train_data_final.source_node.apply(lambda x: Weight_out.get(x,mean_weight_out))

    test_data_final['weight_in'] = test_data_final.destination_node.apply(lambda x: Weight_in.get(x,mean_weight_in))
    test_data_final['weight_out'] = test_data_final.source_node.apply(lambda x: Weight_out.get(x,mean_weight_out))
    #----------------------------------------------------------------------------------------------------------------
    train_data_final['weight_f1'] = train_data_final.weight_in + train_data_final.weight_out
    train_data_final['weight_f2'] = train_data_final.weight_in * train_data_final.weight_out
    train_data_final['weight_f3'] = (2*train_data_final.weight_in + 1*train_data_final.weight_out)
    train_data_final['weight_f4'] = (1*train_data_final.weight_in + 2*train_data_final.weight_out)

    test_data_final['weight_f1'] = test_data_final.weight_in + test_data_final.weight_out
    test_data_final['weight_f2'] = test_data_final.weight_in * test_data_final.weight_out
    test_data_final['weight_f3'] = (2*test_data_final.weight_in + 1*test_data_final.weight_out)
    test_data_final['weight_f4'] = (1*test_data_final.weight_in + 2*test_data_final.weight_out)

In [41]:
if not os.path.isfile('storage_sample_stage3.h5'):
    #page rank for source and destination in train and test
    #if anything not there in train graph then adding mean page rank 
    train_data_final['page_rank_s'] = train_data_final.source_node.apply(lambda x:pr.get(x,mean_pr))
    train_data_final['page_rank_d'] = train_data_final.destination_node.apply(lambda x:pr.get(x,mean_pr))

    test_data_final['page_rank_s'] = test_data_final.source_node.apply(lambda x:pr.get(x,mean_pr))
    test_data_final['page_rank_d'] = test_data_final.destination_node.apply(lambda x:pr.get(x,mean_pr))
    #----------------------------------------------------------------------------------------------------------------
    #Katz centrality score for source and destination in train and test
    #if anything not there in train graph then adding mean katz score
    train_data_final['katz_s'] = train_data_final.source_node.apply(lambda x: katz.get(x,mean_katz))
    train_data_final['katz_d'] = train_data_final.destination_node.apply(lambda x: katz.get(x,mean_katz))

    test_data_final['katz_s'] = test_data_final.source_node.apply(lambda x: katz.get(x,mean_katz))
    test_data_final['katz_d'] = test_data_final.destination_node.apply(lambda x: katz.get(x,mean_katz))
    #----------------------------------------------------------------------------------------------------------------
    #Hits algorithm score for source and destination in train and test
    #if anything not there in train graph then adding 0
    train_data_final['hubs_s'] = train_data_final.source_node.apply(lambda x: hits[0].get(x,0))
    train_data_final['hubs_d'] = train_data_final.destination_node.apply(lambda x: hits[0].get(x,0))

    test_data_final['hubs_s'] = test_data_final.source_node.apply(lambda x: hits[0].get(x,0))
    test_data_final['hubs_d'] = test_data_final.destination_node.apply(lambda x: hits[0].get(x,0))
    #----------------------------------------------------------------------------------------------------------------
    #Hits algorithm score for source and destination in Train and Test
    #if anything not there in train graph then adding 0
    train_data_final['authorities_s'] = train_data_final.source_node.apply(lambda x: hits[1].get(x,0))
    train_data_final['authorities_d'] = train_data_final.destination_node.apply(lambda x: hits[1].get(x,0))

    test_data_final['authorities_s'] = test_data_final.source_node.apply(lambda x: hits[1].get(x,0))
    test_data_final['authorities_d'] = test_data_final.destination_node.apply(lambda x: hits[1].get(x,0))
    #----------------------------------------------------------------------------------------------------------------

    hdf = HDFStore('storage_sample_stage3.h5')
    hdf.put('train_df', train_data_final, format='table', data_columns=True)
    hdf.put('test_df', test_data_final, format='table', data_columns=True)
    hdf.close()
else:
    train_data_final = read_hdf('storage_sample_stage3.h5', 'train_df', mode='r')
    test_data_final = read_hdf('storage_sample_stage3.h5', 'test_df', mode='r')

In [42]:
train_data_final.head()

Unnamed: 0,source_node,destination_node,indicator_link,jaccard_followers,jaccard_followees,cosine_followers,cosine_followees,num_followers_s,num_followers_d,num_followees_s,...,weight_f3,weight_f4,page_rank_s,page_rank_d,katz_s,katz_d,hubs_s,hubs_d,authorities_s,authorities_d
0,273084,1505602,1,0,0,0.0,0.0,11,15,6,...,1.005929,0.877964,2.04529e-06,3.459963e-07,0.000773,0.000756,1.943132e-13,1.941103e-13,9.226339e-16,2.231877e-15
1,1284805,1252082,1,0,0,0.0,0.0,70,222,3,...,1.066965,0.63393,5.733452e-06,2.202787e-07,0.001007,0.000744,3.821852e-09,7.418056e-16,9.628271e-14,2.688395e-11
2,1764008,713641,1,0,0,0.0,0.0,28,31,9,...,0.809232,0.669781,1.7851e-06,1.112847e-06,0.000843,0.000766,2.078025e-13,1.648915e-16,8.932667e-16,3.010892e-15
3,462722,1698321,1,0,0,0.0,0.0,1,22,15,...,0.708514,0.667029,1.846584e-07,5.703804e-07,0.000735,0.000791,2.444041e-13,4.082758e-15,2.812397e-15,6.476138e-14
4,996276,883507,1,0,0,0.019838,0.0,21,24,11,...,0.77735,0.688675,1.632419e-06,3.324746e-07,0.000816,0.00078,3.14102e-13,7.733657e-15,5.007674e-14,1.023551e-13


<font color=gren><h4>Adding New Set of Features</h4></font>

__We will create these each of these features for both train and test data points__
<ol>
- **SVD Features** for both source and destination</li>
</ol>

In [43]:
#ref: https://stats.stackexchange.com/questions/455334/singular-value-decomposition-svd-for-feature-selection
#ref: https://analyticsindiamag.com/singular-value-decomposition-svd-application-recommender-system/
def svd(x, S):
    try:
        z = sadj_dict[x]
        return S[z]
    except:
        return [0,0,0,0,0,0]

In [44]:
#for svd features to get feature vector creating a dict node val and index in svd vector
sadj_col = sorted(train_graph.nodes())
sadj_dict = { val:idx for idx,val in enumerate(sadj_col)}

In [45]:
Adj = nx.adjacency_matrix(train_graph, nodelist=sorted(train_graph.nodes())).asfptype()

In [46]:
U, s, V = svds(Adj, k=6)

print('Adjacency Matrix Shape', Adj.shape)
print('U Shape', U.shape)
print('V Shape', V.shape)
print('s Shape', s.shape)

Adjacency Matrix Shape (1780722, 1780722)
U Shape (1780722, 6)
V Shape (6, 1780722)
s Shape (6,)


In [2]:
if not os.path.isfile('storage_sample_stage4.h5'):
    
    train_data_final[['svd_u_s_1', 'svd_u_s_2','svd_u_s_3', 'svd_u_s_4', 'svd_u_s_5', 'svd_u_s_6']] = \
    train_data_final.source_node.apply(lambda x: svd(x, U)).apply(pd.Series)
    
    train_data_final[['svd_u_d_1', 'svd_u_d_2', 'svd_u_d_3', 'svd_u_d_4', 'svd_u_d_5','svd_u_d_6']] = \
    train_data_final.destination_node.apply(lambda x: svd(x, U)).apply(pd.Series)
    #----------------------------------------------------------------------------------------------------------------
    train_data_final[['svd_v_s_1','svd_v_s_2', 'svd_v_s_3', 'svd_v_s_4', 'svd_v_s_5', 'svd_v_s_6',]] = \
    train_data_final.source_node.apply(lambda x: svd(x, V.T)).apply(pd.Series)

    train_data_final[['svd_v_d_1', 'svd_v_d_2', 'svd_v_d_3', 'svd_v_d_4', 'svd_v_d_5','svd_v_d_6']] = \
    train_data_final.destination_node.apply(lambda x: svd(x, V.T)).apply(pd.Series)
    #----------------------------------------------------------------------------------------------------------------
    test_data_final[['svd_u_s_1', 'svd_u_s_2','svd_u_s_3', 'svd_u_s_4', 'svd_u_s_5', 'svd_u_s_6']] = \
    test_data_final.source_node.apply(lambda x: svd(x, U)).apply(pd.Series)
    
    test_data_final[['svd_u_d_1', 'svd_u_d_2', 'svd_u_d_3', 'svd_u_d_4', 'svd_u_d_5','svd_u_d_6']] = \
    test_data_final.destination_node.apply(lambda x: svd(x, U)).apply(pd.Series)
    #----------------------------------------------------------------------------------------------------------------
    test_data_final[['svd_v_s_1','svd_v_s_2', 'svd_v_s_3', 'svd_v_s_4', 'svd_v_s_5', 'svd_v_s_6',]] = \
    test_data_final.source_node.apply(lambda x: svd(x, V.T)).apply(pd.Series)

    test_data_final[['svd_v_d_1', 'svd_v_d_2', 'svd_v_d_3', 'svd_v_d_4', 'svd_v_d_5','svd_v_d_6']] = \
    test_data_final.destination_node.apply(lambda x: svd(x, V.T)).apply(pd.Series)
    #----------------------------------------------------------------------------------------------------------------

    hdf = HDFStore('storage_sample_stage4.h5')
    hdf.put('train_df',train_data_final, format='table', data_columns=True)
    hdf.put('test_df',test_data_final, format='table', data_columns=True)
    hdf.close()
    
else:
    train_data_final = read_hdf('storage_sample_stage4.h5', 'train_df', mode='r')
    test_data_final = read_hdf('storage_sample_stage4.h5', 'test_df', mode='r')

In [3]:
train_data_final.head()

Unnamed: 0,source_node,destination_node,indicator_link,jaccard_followers,jaccard_followees,cosine_followers,cosine_followees,num_followers_s,num_followers_d,num_followees_s,...,svd_v_s_3,svd_v_s_4,svd_v_s_5,svd_v_s_6,svd_v_d_1,svd_v_d_2,svd_v_d_3,svd_v_d_4,svd_v_d_5,svd_v_d_6
0,273084,1505602,1,0,0,0.0,0.0,11,15,6,...,1.983699e-06,1.545081e-13,8.108363e-13,1.719703e-14,-1.355367e-12,4.675311e-13,1.128586e-06,6.616662e-14,9.771073e-13,4.160016e-14
1,1284805,1252082,1,0,0,0.0,0.0,70,222,3,...,3.663967e-05,9.669868e-11,1.141963e-09,1.794619e-12,-2.851073e-12,1.971059e-11,3.528475e-07,6.538768e-11,2.260206e-11,5.010919e-10
2,1764008,713641,1,0,0,0.0,0.0,28,31,9,...,1.402432e-09,3.158536e-12,9.925285e-12,1.664933e-14,-1.68385e-12,8.949512e-12,6.103515e-11,3.31401e-13,1.082855e-12,5.612008e-14
3,462722,1698321,1,0,0,0.0,0.0,1,22,15,...,8.378921e-08,5.215216e-14,2.932388e-13,5.242047e-14,-2.869186e-11,6.766469e-12,4.520495e-05,1.912283e-12,2.725392e-10,1.207092e-12
4,996276,883507,1,0,0,0.019838,0.0,21,24,11,...,7.261455e-06,4.465798e-12,1.410598e-09,9.333835e-13,-1.015656e-09,2.191805e-11,3.120173e-05,5.287823e-12,1.664792e-11,1.907803e-12


- **Preferential Attachement** One well-known concept in social networks is that users with many friends tend to create more connections in the future. This is due to the fact that in some social networks, like in finance, the rich get richer. We estimate how ”rich” our two vertices are by calculating the multiplication between the number of friends (|Γ(x)|) or followers each vertex has. It may be noted that the similarity index does not require any node neighbor information; therefore, this similarity index has the lowest computational complexity.

Ref: http://be.amazd.com/link-prediction/

In [4]:
#preferential attachement for followers
def preferential_followers(df):
    nf_s = np.array(df['num_followers_s'])
    nf_d = np.array(df['num_followers_d'])
    
    preferential_followers = []
    
    for i in range(len(nf_s)):
        preferential_followers.append(nf_d[i]*nf_s[i])
    df['prefer_attach_followers'] = preferential_followers

In [5]:
#preferential attachement for followees
def preferential_followees(df):
    nf_s = np.array(df['num_followees_s'])
    nf_d = np.array(df['num_followees_d'])
    
    preferential_followees=[]
    
    for i in range(len(nf_s)):
        preferential_followees.append(nf_d[i]*nf_s[i])
    df['prefer_attach_followees'] = preferential_followees

In [6]:
#mapping preferential_followers on train and test data
preferential_followers(train_data_final)
preferential_followers(test_data_final)

In [7]:
#mapping preferential_followees on train and test data
preferential_followees(train_data_final)
preferential_followees(test_data_final)

In [9]:
train_data_final.head(2)

Unnamed: 0,source_node,destination_node,indicator_link,jaccard_followers,jaccard_followees,cosine_followers,cosine_followees,num_followers_s,num_followers_d,num_followees_s,...,svd_v_s_5,svd_v_s_6,svd_v_d_1,svd_v_d_2,svd_v_d_3,svd_v_d_4,svd_v_d_5,svd_v_d_6,prefer_attach_followers,prefer_attach_followees
0,273084,1505602,1,0,0,0.0,0.0,11,15,6,...,8.108363e-13,1.719703e-14,-1.355367e-12,4.675311e-13,1.128586e-06,6.616662e-14,9.771073e-13,4.160016e-14,165,48
1,1284805,1252082,1,0,0,0.0,0.0,70,222,3,...,1.141963e-09,1.794619e-12,-2.851073e-12,1.971059e-11,3.528475e-07,6.538768e-11,2.260206e-11,5.010919e-10,15540,9


In [10]:
test_data_final.head(2)

Unnamed: 0,source_node,destination_node,indicator_link,jaccard_followers,jaccard_followees,cosine_followers,cosine_followees,num_followers_s,num_followers_d,num_followees_s,...,svd_v_s_5,svd_v_s_6,svd_v_d_1,svd_v_d_2,svd_v_d_3,svd_v_d_4,svd_v_d_5,svd_v_d_6,prefer_attach_followers,prefer_attach_followees
0,848424,784690,1,0,0,0.029161,0.0,6,6,14,...,4.341594e-13,5.535501e-14,-9.994075e-10,5.79191e-10,3.512361e-07,2.48666e-09,2.771127e-09,1.727693e-12,36,126
1,1549883,966112,1,0,0,0.183702,0.800641,12,12,11,...,-5.035063999999999e-19,3.516028e-19,6.3443199999999995e-19,2.0711550000000002e-18,9.349002e-19,-3.2584429999999996e-19,-4.820282999999999e-19,3.265825e-19,144,143


- **SVD Dot**

SVD Dot is dot product between svd sourse node and svd destination node features.

In [11]:
#svd dot for train datasets

#source_node
u_s1, u_s2, u_s3 = train_data_final['svd_u_s_1'], train_data_final['svd_u_s_2'], train_data_final['svd_u_s_3']
u_s4, u_s5, u_s6 = train_data_final['svd_u_s_4'], train_data_final['svd_u_s_5'], train_data_final['svd_u_s_6']

v_s1, v_s2, v_s3 = train_data_final['svd_v_s_1'], train_data_final['svd_v_s_2'], train_data_final['svd_v_s_3']
v_s4, v_s5, v_s6 = train_data_final['svd_v_s_4'], train_data_final['svd_v_s_5'], train_data_final['svd_v_s_6']
#-------------------------------------------------------------------------------------------------------------
#destination_node
u_d1, u_d2, u_d3 = train_data_final['svd_u_d_1'], train_data_final['svd_u_d_2'], train_data_final['svd_u_d_3']
u_d4, u_d5, u_d6 = train_data_final['svd_u_d_4'], train_data_final['svd_u_d_5'], train_data_final['svd_u_d_6']

v_d1, v_d2, v_d3 = train_data_final['svd_v_d_1'], train_data_final['svd_v_d_2'], train_data_final['svd_v_d_3']
v_d4, v_d5, v_d6 = train_data_final['svd_v_d_4'], train_data_final['svd_v_d_5'], train_data_final['svd_v_d_6']

In [13]:
svd_dot_train = []

for i in range(len(np.array(u_s1))):
    a=[]
    b=[]
    #_u_source_
    a.append(np.array(u_s1[i]))
    a.append(np.array(u_s2[i]))
    a.append(np.array(u_s3[i]))
    a.append(np.array(u_s4[i]))
    a.append(np.array(u_s5[i]))
    a.append(np.array(u_s6[i]))
    #_v_source_
    a.append(np.array(v_s1[i]))
    a.append(np.array(v_s2[i]))
    a.append(np.array(v_s3[i]))
    a.append(np.array(v_s4[i]))
    a.append(np.array(v_s5[i]))
    a.append(np.array(v_s6[i]))
    #_u_destination_
    b.append(np.array(u_d1[i]))
    b.append(np.array(u_d2[i]))
    b.append(np.array(u_d3[i]))
    b.append(np.array(u_d4[i]))
    b.append(np.array(u_d5[i]))
    b.append(np.array(u_d6[i]))
    #_v_destination_
    b.append(np.array(v_d1[i]))
    b.append(np.array(v_d2[i]))
    b.append(np.array(v_d3[i]))
    b.append(np.array(v_d4[i]))
    b.append(np.array(v_d5[i]))
    b.append(np.array(v_d6[i]))
    
    svd_dot_train.append(np.dot(a,b))
    
train_data_final['svd_dot'] = svd_dot_train

In [15]:
train_data_final.head(2)

Unnamed: 0,source_node,destination_node,indicator_link,jaccard_followers,jaccard_followees,cosine_followers,cosine_followees,num_followers_s,num_followers_d,num_followees_s,...,svd_v_s_6,svd_v_d_1,svd_v_d_2,svd_v_d_3,svd_v_d_4,svd_v_d_5,svd_v_d_6,prefer_attach_followers,prefer_attach_followees,svd_dot
0,273084,1505602,1,0,0,0.0,0.0,11,15,6,...,1.719703e-14,-1.355367e-12,4.675311e-13,1.128586e-06,6.616662e-14,9.771073e-13,4.160016e-14,165,48,1.338829e-11
1,1284805,1252082,1,0,0,0.0,0.0,70,222,3,...,1.794619e-12,-2.851073e-12,1.971059e-11,3.528475e-07,6.538768e-11,2.260206e-11,5.010919e-10,15540,9,2.590603e-11


In [16]:
#svd dot for test datasets

#source_node
u_s1, u_s2, u_s3 = test_data_final['svd_u_s_1'], test_data_final['svd_u_s_2'], test_data_final['svd_u_s_3']
u_s4, u_s5, u_s6 = test_data_final['svd_u_s_4'], test_data_final['svd_u_s_5'], test_data_final['svd_u_s_6']

v_s1, v_s2, v_s3 = test_data_final['svd_v_s_1'], test_data_final['svd_v_s_2'], test_data_final['svd_v_s_3']
v_s4, v_s5, v_s6 = test_data_final['svd_v_s_4'], test_data_final['svd_v_s_5'], test_data_final['svd_v_s_6']
#-------------------------------------------------------------------------------------------------------------
#destination_node
u_d1, u_d2, u_d3 = test_data_final['svd_u_d_1'], test_data_final['svd_u_d_2'], test_data_final['svd_u_d_3']
u_d4, u_d5, u_d6 = test_data_final['svd_u_d_4'], test_data_final['svd_u_d_5'], test_data_final['svd_u_d_6']

v_d1, v_d2, v_d3 = test_data_final['svd_v_d_1'], test_data_final['svd_v_d_2'], test_data_final['svd_v_d_3']
v_d4, v_d5, v_d6 = test_data_final['svd_v_d_4'], test_data_final['svd_v_d_5'], test_data_final['svd_v_d_6']

In [17]:
svd_dot_test = []

for i in range(len(np.array(u_s1))):
    a=[]
    b=[]
    #_u_source_
    a.append(np.array(u_s1[i]))
    a.append(np.array(u_s2[i]))
    a.append(np.array(u_s3[i]))
    a.append(np.array(u_s4[i]))
    a.append(np.array(u_s5[i]))
    a.append(np.array(u_s6[i]))
    #_v_source_
    a.append(np.array(v_s1[i]))
    a.append(np.array(v_s2[i]))
    a.append(np.array(v_s3[i]))
    a.append(np.array(v_s4[i]))
    a.append(np.array(v_s5[i]))
    a.append(np.array(v_s6[i]))
    #_u_destination_
    b.append(np.array(u_d1[i]))
    b.append(np.array(u_d2[i]))
    b.append(np.array(u_d3[i]))
    b.append(np.array(u_d4[i]))
    b.append(np.array(u_d5[i]))
    b.append(np.array(u_d6[i]))
    #_v_destination_
    b.append(np.array(v_d1[i]))
    b.append(np.array(v_d2[i]))
    b.append(np.array(v_d3[i]))
    b.append(np.array(v_d4[i]))
    b.append(np.array(v_d5[i]))
    b.append(np.array(v_d6[i]))
    
    svd_dot_test.append(np.dot(a,b))
    
test_data_final['svd_dot'] = svd_dot_test

In [18]:
test_data_final.head(2)

Unnamed: 0,source_node,destination_node,indicator_link,jaccard_followers,jaccard_followees,cosine_followers,cosine_followees,num_followers_s,num_followers_d,num_followees_s,...,svd_v_s_6,svd_v_d_1,svd_v_d_2,svd_v_d_3,svd_v_d_4,svd_v_d_5,svd_v_d_6,prefer_attach_followers,prefer_attach_followees,svd_dot
0,848424,784690,1,0,0,0.029161,0.0,6,6,14,...,5.535501e-14,-9.994075e-10,5.79191e-10,3.512361e-07,2.48666e-09,2.771127e-09,1.727693e-12,36,126,2.083224e-17
1,1549883,966112,1,0,0,0.183702,0.800641,12,12,11,...,3.516028e-19,6.3443199999999995e-19,2.0711550000000002e-18,9.349002e-19,-3.2584429999999996e-19,-4.820282999999999e-19,3.265825e-19,144,143,4.861132e-34


In [19]:
hdf = HDFStore('storage_sample_stage5.h5')
hdf.put('train_df', train_data_final, format='table', data_columns=True)
hdf.put('test_df', test_data_final, format='table', data_columns=True)
hdf.close()