In [59]:
import findspark
findspark.init()

In [60]:
from pyspark import SparkConf, SparkContext

In [61]:
spark_conf = SparkConf().setMaster('local').setAppName('Get Distance between 2 superheroes')
spark_context = SparkContext(conf=spark_conf)

In [67]:
start_sup_id = 885
end_sup_id = 85
hit_counter = spark_context.accumulator(0)
hit_counter

Accumulator<id=10, value=0>

In [68]:
def convert_to_BFS_node(line):

    fields = line.split()
    sup_id = int(fields[0])

    connections = []
    for field in fields[1:]:
        connections.append(int(field))


    color = 'WHITE'
    distance = 99999
    if sup_id == start_sup_id:
        color = 'GRAY'
        distance = 0

    return (sup_id, (connections, color, distance))


def create_data_RDD():
    data = spark_context.textFile('../../data/Marvel_Network/Marvel_Graph')
    return data.map(convert_to_BFS_node)


def make_BFS_map(node):
    sup_id = node[0]
    connections = node[1][0]
    color = node[1][1]
    distance = node[1][2]

    results = []
    if color == 'GRAY':
        for connection in connections:
            new_sup_id = connection
            new_distance = distance + 1
            new_color = 'GRAY'

            if connection == end_sup_id:
                hit_counter.add(1)

            new_entry = (new_sup_id, ([], new_color, new_distance))
            results.append(new_entry)

        color = 'BLACK'

    results.append((sup_id, (connections, color, distance)))
    return results


def BFS_reduce(data1, data2):

    # Compare data between 2 results of the same key.
    edge1 = data1[0]
    edge2 = data2[0]
    color1 = data1[1]
    color2 = data2[1]
    distance1 = data1[2]
    distance2 = data2[2]
    
    distance = 99999
    color = color1
    edges = []

    if len(edge1) > 0:
        edges.extend(edge1)
    elif len(edge2) > 0:
        edges.extend(edge2)

    # Preserve smallest distance
    if distance > distance1:
        distance = distance1
    elif distance > distance2:
        distance = distance2

    # Preserve darkest color
    if (color1 == 'WHITE' and (color2 == 'GRAY' or color2 == 'BLACK')):
        color = color2

    if (color1 == 'GRAY' and color2 == 'BLACK'):
        color = color2

    if (color2 == 'WHITE' and (color1 == 'GRAY' or color1 == 'BLACK')):
        color = color1

    if (color2 == 'GRAY' and color1 == 'BLACK'):
        color = color1

    return (edges, color, distance)


In [69]:
data = create_data_RDD()

for i in range(20):
    print(f'Iteration {i + 1}')

    mapped_data = data.flatMap(make_BFS_map)

    # Note that mapped.count() action here forces the RDD to be evaluated, and
    # that's the only reason our accumulator is actually updated.
    print(f'Processing {mapped_data.count()} values.')

    if (hit_counter.value > 0):
        print(f'Hit the target character! From {hit_counter.value} different direction(s).')
        break

    data = mapped_data.reduceByKey(BFS_reduce)


Iteration 1
Processing 6627 values.
Iteration 2
Processing 16823 values.
Iteration 3
Processing 218092 values.
Hit the target character! From 6 different direction(s).


In [70]:
spark_context.stop()