# **2.Loading a real graph**

(a) Create a transition matrix given the links in the datasets (ignore
the value column for this).

In [21]:
import pandas as pd
import networkx as nx
import numpy as np



In [22]:
from google.colab import files
 
uploaded = files.upload()

#please choose both given files(stack_network_links.csv and stack_network_nodes.csv)after executing this block from your local drive

Saving stack_network_links.csv to stack_network_links (1).csv
Saving stack_network_nodes.csv to stack_network_nodes (1).csv


In [23]:
# Reading the nodes and links data from the Excel files using pandas
df1 = pd.read_csv('stack_network_links.csv')
df2 = pd.read_csv('stack_network_nodes.csv')

In [24]:
# Creating a directed graph using NetworkX
G = nx.from_pandas_edgelist(df1, source = 'source', target = 'target' , create_using=nx.DiGraph())


In [25]:
# Get the nodes list from the graph
nodes = list(G.nodes())

In [26]:
# Creating an empty transition matrix
transition_matrix = np.zeros((len(nodes), len(nodes)))

In [27]:
# Calculate the unweighted transition matrix as we are told to ignore values given in dataset
for edge in G.edges():
    source = edge[0]
    target = edge[1]
    source_idx = nodes.index(source)
    target_idx = nodes.index(target)
    transition_matrix[source_idx][target_idx] = 1


In [32]:
# Normalize the transition matrix 
transition_matrix = transition_matrix / np.sum(transition_matrix, axis=1, keepdims=True)


In [33]:
print(transition_matrix)

[[0.         0.2        0.         ... 0.         0.         0.        ]
 [0.125      0.         0.125      ... 0.         0.         0.        ]
 [0.         0.11111111 0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]]


(b) Calculate the PageRank for all nodes in the graph using the code 
from Task 1.

In [34]:
# Calculate the PageRank using power iteration
damping_factor = 0.85
max_iterations = 100
tolerance = 1e-6

In [35]:
# Initialize the PageRank vector
pagerank = np.ones(len(nodes)) / len(nodes)

In [36]:
# Power iteration algorithm
for i in range(max_iterations):
    new_pagerank = (1 - damping_factor) / len(nodes) + damping_factor * transition_matrix.T.dot(pagerank)
    if np.sum(np.abs(new_pagerank - pagerank)) < tolerance:
        break
    pagerank = new_pagerank

In [37]:
# Print the PageRank for all nodes
for i, node in enumerate(nodes):
    print(f"Node {node}: {pagerank[i]}")

Node azure: 0.008564156109744194
Node .net: 0.0114333028321687
Node sql-server: 0.01303294317532656
Node asp.net: 0.019049487489072005
Node entity-framework: 0.01124394739699259
Node wpf: 0.008725224609106939
Node linq: 0.009980292956852148
Node wcf: 0.01124394739699259
Node c#: 0.022388838131462724
Node tdd: 0.008695652173913044
Node agile: 0.008695652173913044
Node codeigniter: 0.009125614871472188
Node ajax: 0.010257644175468688
Node jquery: 0.022057748941829734
Node mysql: 0.016357835121026315
Node css: 0.019722858417345977
Node php: 0.013932695505108495
Node javascript: 0.01592347202944594
Node json: 0.008070717963165136
Node cloud: 0.0034980751965213103
Node amazon-web-services: 0.010323416952886537
Node devops: 0.00796444979436024
Node docker: 0.010979763664343958
Node ios: 0.012722776589089647
Node android: 0.007755603203603871
Node android-studio: 0.0035017717019047846
Node java: 0.015739738676489784
Node typescript: 0.0076641289775724675
Node angular: 0.004561608422356289
Nod

# **3.Topic-sensitive PageRank:**

(a) Implement a solution that calculates the topic-sensitive PageRank
for a given node in the graph and apply it to the graph you created
in Task 2.

To calculate the topic-sensitive PageRank for a given node in the graph, we need to modify the power iteration algorithm to incorporate the topic-specific taxation and here we create function calculate_tspr so we can check for different values.

In [38]:
def calculate_tspr(transition_matrix, nodes_df, links_df, topic_node, target_node, damping_factor=0.85, teleportation_prob=0.1,
                   max_iterations=100, tolerance=1e-6):
    # Creating a directed graph using NetworkX and finding transition matrix as we did before
    G = nx.from_pandas_edgelist(df1, source='source', target='target', create_using=nx.DiGraph())
    nodes = list(G.nodes())
    transition_matrix = np.zeros((len(nodes), len(nodes)))
    for edge in G.edges():
        source = edge[0]
        target = edge[1]
        source_idx = nodes.index(source)
        target_idx = nodes.index(target)
        transition_matrix[source_idx][target_idx] = 1

    transition_matrix = transition_matrix / np.sum(transition_matrix, axis=1, keepdims=True)

    #Now Calculating the Topic-Sensitive PageRank using power iteration

    # Initialize the PageRank vector
    pagerank = np.ones(len(nodes)) / len(nodes)

    # Power iteration algorithm
    for _ in range(max_iterations):
        new_pagerank = ((1 - damping_factor) / len(nodes)) * np.ones(len(nodes))

        for i, node in enumerate(nodes):
            if node == topic_node:
                neighbors = list(G.successors(node))
                num_neighbors = len(neighbors)
                if num_neighbors > 0:
                    topic_prob = teleportation_prob / num_neighbors
                    for neighbor in neighbors:
                        neighbor_idx = nodes.index(neighbor)
                        new_pagerank[neighbor_idx] += damping_factor * topic_prob * pagerank[i]
            else:
                new_pagerank[i] += damping_factor * transition_matrix[:, i].dot(pagerank)

        if np.sum(np.abs(new_pagerank - pagerank)) < tolerance:
            break

        pagerank = new_pagerank

    # Finding the TSPR value for the target node in the topic node
    target_node_idx = nodes.index(target_node)
    tspr_value = pagerank[target_node_idx]

    return tspr_value


(b) Output the topic-sensitive PageRank (TSPR) for each of the following nodes:

    TSPR('css', 'angularjs'),
    TSPR('angularjs', 'css'),
    TSPR('jquery', 'bootstrap'),
    TSPR('bash', 'linux')


In [40]:
# Specify the topic and target nodes as a list of tuples
topic_target_nodes = [
    ('css', 'angular'),
    ('angularjs', 'css'),
    ('jquery', 'bootstrap'),
    ('bash', 'linux')]

# Calculate TSPR values for each target and topic combination
for topic_node, target_node in topic_target_nodes:
    tspr_value = calculate_tspr(transition_matrix,df1, df2, topic_node, target_node)
    print(f"TSPR value for '{target_node}' in the topic '{topic_node}': {tspr_value}")

TSPR value for 'angular' in the topic 'css': 0.004461917250598985
TSPR value for 'css' in the topic 'angularjs': 0.015425842835263616
TSPR value for 'bootstrap' in the topic 'jquery': 0.0022157341601519573
TSPR value for 'linux' in the topic 'bash': 0.019402415140041356


(c) For each of the topics above (css, angularjs, jquery, bash): output
the top 5 nodes and their respective TS-PR value.


In [44]:
# Calculate TS-PR values for each topic
topics = ['css', 'angularjs', 'jquery', 'bash']
for topic in topics:
    topic_node = topic.lower()

    # Calculate TS-PR values
    tspr_values = calculate_tspr(transition_matrix, df1,df2,target_node,topic_node,nodes.index(topic_node))

   
    
  # Convert tspr_values to a list if it is a single value
    if isinstance(tspr_values, float):
        tspr_values_list = [tspr_values]
    else:
        tspr_values_list = tspr_values.tolist()

    # Get the top 5 nodes and their TS-PR values
    top_nodes = sorted(range(len(tspr_values_list)), key=lambda i: tspr_values_list[i], reverse=True)[:5]

    # Print the results
    print(f"Top 5 nodes and their TSPR values for the topic '{topic}':")
    for node_idx in top_nodes:
        node = nodes[node_idx]
        tspr_value = tspr_values_list[node_idx]
        print(f"Node: {node}, TSPR value: {tspr_value}")
    print()

Top 5 nodes and their TSPR values for the topic 'css':
Node: azure, TSPR value: -3.100200044683475e+113

Top 5 nodes and their TSPR values for the topic 'angularjs':
Node: azure, TSPR value: -3.5038868694191703e+143

Top 5 nodes and their TSPR values for the topic 'jquery':
Node: azure, TSPR value: -2.199839349733303e+107

Top 5 nodes and their TSPR values for the topic 'bash':
Node: azure, TSPR value: -1.2086776596155856e+165

