In [1]:
from __future__ import print_function
from operator import add, itemgetter
import re
import sys
import time
from pyspark.sql import SparkSession


def computeContribs(urls, rank):
    """Split a URL's rank evenly among the URLs it links to.

    Lazily yields one ``(neighbor, share)`` pair per element of ``urls``,
    where ``share`` is ``rank`` divided by ``len(urls)``.  When ``urls`` is
    empty nothing is yielded (and no division is attempted).
    """
    neighbor_count = len(urls)
    for neighbor in urls:
        # The division stays inside the loop so an empty ``urls`` never
        # reaches it.
        share = rank / neighbor_count
        yield neighbor, share


def parseNeighbors(urls):
    """Parse one whitespace-separated "URL neighbor" line into a pair.

    Args:
        urls: A single input line holding a source URL and a neighbor URL
            separated by any run of whitespace.

    Returns:
        A ``(url, neighbor)`` tuple built from the first two fields.  Any
        further fields on the line are ignored.

    Raises:
        IndexError: If the line holds fewer than two fields.
    """
    # Strip first: without it, leading whitespace makes re.split() return an
    # empty first field, so "  a b" would be parsed as ('', 'a').
    parts = re.split(r'\s+', urls.strip())
    return parts[0], parts[1]



# Initialize the Spark session.
# Obtain the session for the "PythonPageRank" application via getOrCreate(),
# with explicit memory-related settings for the driver, executors and
# Python workers.
spark = (
    SparkSession.builder
    .appName("PythonPageRank")
    .config("spark.driver.memory", "3g")
    .config("spark.driver.maxResultSize", "3g")
    .config("spark.executor.memory", "3g")
    .config("spark.python.worker.memory", "2g")
    .getOrCreate()
)

# Run 10 iterations of PageRank over an edge-list text file and print the
# ten highest-ranked URLs, followed by the elapsed wall-clock time in minutes.
#
# The input file is expected to hold one edge per line:
#     URL         neighbor URL
#     URL         neighbor URL
#     URL         neighbor URL
#     ...
start = time.time()
try:
    # Read the file as an RDD of raw line strings (column 0 of each row).
    lines = spark.read.text("InOut/soc-LiveJournal1_2.txt").rdd.map(lambda r: r[0])

    # Parse each line into a (url, neighbor) pair, drop duplicate edges and
    # group the neighbors per source URL.  Cached because it is joined again
    # in every iteration below.
    links = lines.map(lambda urls: parseNeighbors(urls)).distinct().groupByKey().cache()

    # Every URL that has at least one outgoing link starts with rank 1.0.
    ranks = links.map(lambda url_neighbors: (url_neighbors[0], 1.0))

    # Calculates and updates URL ranks continuously using PageRank algorithm.
    for iteration in range(10):
        # Each URL hands an equal share of its current rank to every neighbor.
        contribs = links.join(ranks).flatMap(
            lambda url_urls_rank: computeContribs(url_urls_rank[1][0], url_urls_rank[1][1]))

        # Sum the contributions received per URL and apply the 0.85 / 0.15
        # damping formula.  Note that `ranks` is rebuilt from `contribs` only,
        # so URLs that receive no contribution are absent from the new ranks.
        ranks = contribs.reduceByKey(add).mapValues(lambda rank: rank * 0.85 + 0.15)

    # Order by rank, highest first, and dump the top ten to the console.
    ranks = ranks.sortBy(itemgetter(1), ascending=False)
    for (link, rank) in ranks.take(10):
        print("%s has rank: %s." % (link, rank))

    # Elapsed time is reported in minutes.
    end = time.time()
    print((end - start) / 60)
finally:
    # Always stop the session, even when the job above fails; previously
    # stop() was only reached on success.
    spark.stop()

8737 has rank: 639.8010490902625.
2914 has rank: 558.8945949001937.
18964 has rank: 382.13256008828665.
1220 has rank: 337.657728851824.
2409 has rank: 316.2258751375721.
10029 has rank: 308.2107822787295.
214538 has rank: 289.95154392199106.
7343 has rank: 276.70505934087265.
39295 has rank: 254.98966135869065.
18963 has rank: 253.3228530923532.
39.265833783149716
