In [3]:
from pyspark import SparkConf, SparkContext
import findspark
findspark.init('/Users/amogh/spark-2.4.4-bin-hadoop2.7/')

In [7]:
conf = SparkConf().setMaster("local").setAppName("PageRank")
sc = SparkContext(conf=conf)

In [19]:
data = [(1,2),(1,3),(1,4),(3,2),(2,4),(4,3)]
data_rdd = sc.parallelize(data)

In [21]:
adj_list = data_rdd.groupByKey().mapValues(list).collect()
adj_list

[(1, [2, 3, 4]), (3, [2]), (2, [4]), (4, [3])]

In [8]:
from __future__ import print_function

import re
import sys
from operator import add

from pyspark.sql import SparkSession


def computeContribs(urls, rank):
    """Calculates URL contributions to the rank of other URLs."""
    num_urls = len(urls)
    for url in urls:
        yield (url, rank / num_urls)


def parseNeighbors(urls):
    """Parses a urls pair string into urls pair."""
    parts = re.split(r'\s+', urls)
    return parts[0], parts[1]


if __name__ == "__main__":

    # Initialize the spark context.
    spark = SparkSession\
          .builder\
          .appName("PageRank")\
          .getOrCreate()

    # Loads in input file. It should be in format of:
    #     URL         neighbor URL
    #     URL         neighbor URL
    #     URL         neighbor URL
    #     ...
    lines = spark.read.text('url2.txt').rdd.map(lambda r: r[0])

    # Loads all URLs from input file and initialize their neighbors.
    links = lines.map(lambda urls: parseNeighbors(urls)).distinct().groupByKey().cache()

    # Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.
    ranks = links.map(lambda url_neighbors: (url_neighbors[0], 0))

    # Calculates and updates URL ranks continuously using PageRank algorithm.
    for iteration in range(1):
        # Calculates URL contributions to the rank of other URLs.
        contribs = links.join(ranks).flatMap(
            lambda url_urls_rank: computeContribs(url_urls_rank[1][0], url_urls_rank[1][1]))

        # Re-calculates URL ranks based on neighbor contributions.
        new_ranks = contribs.reduceByKey(add).mapValues(lambda rank: rank * 0.85 + 0.15)

        # Collects all URL ranks and dump them to console.
        for (link, rank) in new_ranks.collect():
            print("%s has rank: %s." % (link, rank))
        
        # Calculates URL contributions to the rank of other URLs.
        contribs = links.join(new_ranks).flatMap(
            lambda url_urls_rank: computeContribs(url_urls_rank[1][0], url_urls_rank[1][1]))

        # Re-calculates URL ranks based on neighbor contributions.
        new_ranks2 = contribs.reduceByKey(add).mapValues(lambda rank: rank * 0.85 + 0.15)

        # Collects all URL ranks and dump them to console.
        for (link, rank) in new_ranks2.collect():
            print("%s has rank: %s." % (link, rank))
        

    spark.stop()

C has rank: 0.15.
B has rank: 0.15.
D has rank: 0.15.
E has rank: 0.15.
F has rank: 0.15.
A has rank: 0.15.
C has rank: 0.181875.
D has rank: 0.30937499999999996.
B has rank: 0.59625.
A has rank: 0.21375.
E has rank: 0.181875.
F has rank: 0.181875.
