In [2]:
import numpy as np
from  scipy import sparse

#store matrix G in a compressed sparse matrix format.  P = (D)^-1 * G
#Note that sources contain the "to" nodes and the targetumns contain "from" nodes

#Observe len(nodes) + 1 on the return statement that adds one node to the graph
def adding_node():
    source = []
    target = [] 
    source_target = []
    num_of_edges =0
    nodes = set([])
    with open("stanweb.dat",'r') as f:
        for line in f.readlines():
            lst = line.split()
            node1, node2, weights = int(lst[0])-1,int(lst[1])-1,float(lst[2])
            source.append(node1) #storing "to" nodes on sources of G
            target.append(node2) #storing "from" nodes on targetumns of G
            num_of_edges += 1 
            nodes.add(node1)
            nodes.add(node2)
          
           
            
    return sparse.csr_matrix(([1]*num_of_edges,(target,source)),shape=(len(nodes)+1,len(nodes)+1))


def construct_csr_matrix():
    source = []
    target = [] 
    source_target = []
    num_of_edges =0
    nodes = set([])
    weight_dict = {}
    with open("stanweb.dat",'r') as f:
        for line in f.readlines():
            lst = line.split()
            node1, node2, weights = int(lst[0])-1,int(lst[1])-1,float(lst[2])
            source.append(node1) #storing "to" nodes on sources of G
            target.append(node2) #storing "from" nodes on targetumns of G
            num_of_edges += 1 
            
            nodes.add(node1)
            nodes.add(node2)
      
    return sparse.csr_matrix(([1]*num_of_edges,(target, source)),shape = (len(nodes) + 1,len(nodes) + 1)) 


In [9]:
#Construct initial matrix
G = construct_csr_matrix()

#Construct matrix with added node
G_plus = adding_node()

#Dimensions
n = G.shape[0]
n_plus = G_plus.shape[0]

In [10]:
#The algorithm is based on the formula rank = alpha *ranks/out_degree + (1−alpha)/n
def PageRank(G,n, a=0.85, t = 10**-8):
    
    d = G.sum(axis=0).T #calculate number of outlinks in each node-webpage
    
    #Initialization
    fast_conv = []
    ranks = np.ones((n,1))/n #vector
    iterations = 0
    error = 1
    
    #Iterating till convergence
    while error > t:       
        
        #try except is used to ignore division with 0 as some outdegrees are zero
        try:
            new_ranks = G.dot((a*ranks)/ d)  #G.dot(1/d) is the P hat matrix.
        except (ZeroDivisionError): 
            pass  
        
        #Add the second part of the formula used to correct dangling nodes
        new_ranks += (1-a)/n
    
        #Check which nodes converge on iteration 1
        iterations +=1
        if iterations<2:
            for i in range(len(ranks)):
                if abs(ranks[i]-new_ranks[i])< t:
                    fast_conv.append([i])
                    
        #Stop condition
        error = np.linalg.norm(ranks-new_ranks)/np.linalg.norm(ranks)  
        ranks = new_ranks
    return new_ranks, iterations,fast_conv

#### Question a

##### Experimental 

In [6]:
#Calculate Pagerank
ranks,_,_ = PageRank(G,n, a=0.85, t = 10**-8)
ranks_plus,_,_ = PageRank(G_plus,n_plus, a=0.85, t = 10**-8)

#Find indices without the new node
ranks = np.asarray(ranks).ravel()
indices = ranks.argsort()[-n:][::-1]

#Find indices with the new node
ranks_plus = np.asarray(ranks).ravel()
indices_new = ranks_plus.argsort()[-n_plus:][::-1]

print("X rank_score:",ranks_plus[-1])


#Compare ranks before and after the addition of the new node
Flag = True
for i in range(n):
    if indices[i] != indices_new[i]:
        print("The ranks are changed")
        Flag = False
        
if Flag==True:
    print("The ranks before and after the addition of the new node are the same though the ranking_score has slightly change")



X rank_score: 5.32096032692e-07
The ranks before and after the addition of the new node are the same though the ranking_score has slightly change


#### Numerical Calculation

The formula of the ranking :
rj = Σβ*ri/di + (1-b)/n , where i are the nodes that have links to j

The ranking of the new node is:
r281904 = (1-0.85) / 281904 = 5,320960326919802e-7

As the new node has no outlinks, the rank of i of any i node has the same inlinks as before. Also the addition of one node on a collection of 281903 nodes is too small to affect the ranking of the nodes. So the rankings remain unchanged.


#### Question b

Unsatisﬁed with the PageRank of your page X; you create another page Y (with no in-links) that links to X: What are the PageRanks of all the n + 2 pages now? Does the PageRank of X improve?

In [11]:
def adding_node_Y():
    source = []
    target = [] 
    source_target = []
    num_of_edges =0
    nodes = set([])
    weight_dict = {}
    with open("stanweb.dat",'r') as f:
        for line in f.readlines():
            lst = line.split()
            node1, node2, weights = int(lst[0])-1,int(lst[1])-1,float(lst[2])
            source.append(node1) 
            target.append(node2) 
            num_of_edges += 1 
            nodes.add(node1)
            nodes.add(node2)
        
        #Adding edge of Y
        nodes.add(len(nodes))
        nodes.add(len(nodes)+1)
        
        source.append(len(nodes)) #Y =  one node before the end
        target.append(len(nodes)+1) #X=last node
        
        num_of_edges += 1
                
    return sparse.csr_matrix(([1]*num_of_edges,(target,source)),shape=(len(nodes)+2,len(nodes)+2)) 

In [13]:
# Create graph containing X,Y
G_plus_Y = adding_node_Y()

#Dimensions
nY = G_plus_Y.shape[0]

#Calculate Pagerank
ranks_plus_Y , _ ,_ = PageRank(G_plus_Y, nY, a=0.85, t = 10**-8)

#Find indices with node X
ranks_plus = np.asarray(ranks_plus).ravel()
indices_new = ranks_plus.argsort()[-n_plus:][::-1]

#Find indices with node X and Y
ranks_plus_Y = np.asarray(ranks_plus_Y).ravel()
indices_Y = ranks_plus_Y.argsort()[-nY:][::-1]

#Compare all ranks before and after the addition of the new node
Flag = True
count_changes = 0
for i in range(n_plus-1):
    if indices_new[i] != indices_Y[i]:
        count_changes += 1
        Flag = False
        
if Flag==True:
    print("The ranks before and after the addition of the new node are the same")
else:
    print("The number of ranks changes is:", count_changes)
    
#Compare X rank with and without Y
rank_x_prev = ranks_plus[-1]
rank_x_after = ranks_plus_Y[-1]


print("The rank of X before the addition of y is:",rank_x_prev)
print("The rank of X after the addition of y is:", rank_x_after)




The number of ranks changes is: 182967
The rank of X before the addition of y is: 5.32096032692e-07
The rank of X after the addition of y is: 9.84367184923e-07


So as observed the rank_score of X after the insertion of Y is increased. Also the ranks of most of the nodes are changed.

#### Question c: 

Still unsatisﬁed, you create a third page Z: How should you set up the links on your three pages so as to maximize the PageRank of X?


After inserting Z in order to maximize the rank of X we would have to insert only one link on the webpage of Z to X. If we add more links from Z to other nodes the value of every link on Z will be segmented into the number of links in Z and thus X will receive less rank score.
Y also links only X

##### Implementation of the above idea

In [84]:
def Adding_Node_Z():
    source = []
    target = [] 
    source_target = []
    num_of_edges =0
    nodes = set([])
    weight_dict = {}
    with open("stanweb.dat",'r') as f:
        for line in f.readlines():
            lst = line.split()
            node1, node2, weights = int(lst[0])-1,int(lst[1])-1,float(lst[2])
            source.append(node1) 
            target.append(node2) 
            num_of_edges += 1 
            nodes.add(node1)
            nodes.add(node2)
        
        
        
        
        #Adding edge of Y
        nodes.add(281903) #Y = 281903
        nodes.add(281904) #Z = 281904
        nodes.add(281905) #X = 281905
        
        
        source.append(281903)
        target.append(281905)
        
        source.append(281904)
        target.append(281905)
        
        num_of_edges += 2
       
    return sparse.csr_matrix(([1]*num_of_edges,(target,source)),shape=(len(nodes) + 3,len(nodes) + 3))

It holds that the more the links in a website the less the value of each link for the targeted page.
So in order to maximize the value of X we will introduce one link from Y to X and one link from Z to X.

In [16]:
G_plus_ZY = Adding_Node_Z()
n_ZY = G_plus_ZY.shape[0]

#Calculate Pagerank
ranks_plus_Y , _ ,_ = PageRank(G_plus_ZY, n_ZY, a=0.85, t = 10**-8)

#Find indices with node X
ranks_plus_ZY = np.asarray(ranks_plus).ravel()

print("The rank of X before introducing Z is:", float(ranks_plus_Y[-1]))
print("The rank of X after introducing Z is:", ranks_plus_ZY[-1])



The rank of X before introducing Z is: 5.320865953197664e-07
The rank of X after introducing Z is: 5.32096032692e-07


Observing the results we conclude that adding a link from any page to X increases the X rank but if the webpage is not important the increase in X is small.

#### Question d: 

You have one last idea, you add links from your page X to older, popular pages (e.g.: you add a list of ?Useful links? on your page). Does this improve the PageRank of X? Does the answer change if you add links from Y or Z to older, popular pages? 

##### As power method is a popularity measure I add a link of X  to the top 20 top ranked webpages of the previous question(c)

In [91]:
#Function that adds links of top 20 ranked webpages to the webpage of X
def Add_links_to_X(source_lst):
    source = []
    target = [] 
    num_of_edges =0
    nodes = set([])
    weight_dict = {}
    with open("stanweb.dat",'r') as f:
        for line in f.readlines():
            lst = line.split()
            node1, node2, weights = int(lst[0])-1,int(lst[1])-1,float(lst[2])
            source.append(node1) 
            target.append(node2) 
            num_of_edges += 1 
            nodes.add(node1)
            nodes.add(node2)
        
        #Adding edge of Y and Z
        nodes.add(281903)
        nodes.add(281904)
        nodes.add(281905)
        
        #Links from Y , Z to X
        source.append(281903)
        target.append(281905)
        
        source.append(281904)
        target.append(281905)
        
        num_of_edges += 2
        
        #Add links to source pages linking to the target page
        source.extend([len(nodes)]*len(source_lst))
        target.extend(source_lst)
        num_of_edges += len(source_lst)
        
       
        
    return sparse.csr_matrix(([1]*num_of_edges,(target,source)),shape=(len(nodes)+3,len(nodes)+3))

In [92]:
#Find top 20 links using power method after the addition of Y, Z
indices_plusZY = ranks_plus_ZY.argsort()[-n_ZY:][::-1]

#Add link to X from top 20 ranked pages
G_d = Add_links_to_X(indices_plusZY[:20])

ranks_d , _ ,_ = PageRank(G_d, G_d.shape[0], a=0.85, t = 10**-8)
ranks_d = np.asarray(ranks_d).ravel()
print("The rank of X before introducing links is:", ranks_plus_ZY[-1])
print("The rank of X after introducing the links is:", ranks_d[-1])



The rank of X before introducing links is: 5.32096032692e-07
The rank of X after introducing the links is: 5.3208659532e-07


##### Adding links of Y to top ranked pages of previous question 

In [95]:
#Function that adds links of Y to the 20 top ranked websites of question c
def Add_links_of_Y(source_lst):
    source = []
    target = [] 
    num_of_edges =0
    nodes = set([])
    weight_dict = {}
    with open("stanweb.dat",'r') as f:
        for line in f.readlines():
            lst = line.split()
            node1, node2, weights = int(lst[0])-1,int(lst[1])-1,float(lst[2])
            source.append(node1) 
            target.append(node2) 
            num_of_edges += 1 
            nodes.add(node1)
            nodes.add(node2)
        
        #Adding edge of Y and Z
        nodes.add(281903)
        nodes.add(281904)
        nodes.add(281905)
        
        #Links from Y , Z to X
        source.append(281903)
        target.append(281905)
        
        source.append(281904)
        target.append(281905)
        
        num_of_edges += 2
        
        #Add links to source pages linking to the target page
        source.extend([281904]*len(source_lst))
        target.extend(source_lst)
        num_of_edges += len(source_lst)
          
    return sparse.csr_matrix(([1]*num_of_edges,(target,source)),shape=(len(nodes)+3,len(nodes)+3))

In [96]:
#Add link to X from top 20 ranked pages
G_d = Add_links_of_Y(indices_plusZY[:20])

ranks_d , _ ,_ = PageRank(G_d, G_d.shape[0], a=0.85, t = 10**-8)
ranks_d = np.asarray(ranks_d).ravel()
print("The rank of X before introducing links is:", ranks_plus_ZY[-1])
print("The rank of X after introducing the links is:", ranks_d[-1])



The rank of X before introducing links is: 5.32096032692e-07
The rank of X after introducing the links is: 5.3208659532e-07


So it seems that introducing more links to X or to Y does not change the outcome of the power method. The rankings remain the same.

#### Question e 

Describe what steps you might take to raise the PageRank of X further. You do not need to prove anything here, just summarize your thoughts based on the previous parts. For extra credit though, you can prove what the structure for a link farm with m nodes should be to optimize the PageRank of X.


In question b,c we added links from Y to X and from Z to X and observed that this action increased the ranking of X. We also know from bibliography that if the pages that contain links to X are top ranked then this will add more value to the ranking of X. Therefore we have to work towards the creation of links of X into other top ranked pages. It is important to highlight that the addition of links of other pages into our page X as done in question d  did not contribute to the ranking of X.