In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.4/spark-3.2.4-bin-hadoop3.2.tgz
!tar xf spark-3.2.4-bin-hadoop3.2.tgz
!pip install -q findspark

In [2]:
import os

# Point the JVM and Spark lookups at the locations installed by the setup cell.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.2.4-bin-hadoop3.2",
    }
)

In [3]:
import findspark
# Runs after SPARK_HOME has been set in the previous cell; presumably this is what makes
# the later `pyspark` import resolvable — confirm against the findspark documentation.
findspark.init()
# Left as the last expression so the cell displays the Spark home path that was located.
findspark.find()

'/content/spark-3.2.4-bin-hadoop3.2'

In [None]:
#importing libraries
from pyspark import SparkContext
from math import radians, sin, cos, sqrt, atan2

In [7]:
# Obtain the SparkContext used by every RDD operation in this notebook.
sc = SparkContext.getOrCreate()
# Path of the capitals file; extract_data below expects tab-separated lines.
# NOTE(review): absolute Colab path — presumably the file is uploaded to the session manually; confirm.
data_path = "/content/Capitals.txt"

# Parse each tab-separated line into a (country, capital, latitude, longitude) tuple
def extract_data(entry):
    """Parse one tab-separated line of the capitals file.

    Fields are read by position: index 1 is the country, index 2 the capital,
    and indices 3 and 4 the latitude and longitude. Coordinates may use a
    decimal comma, which is converted to a decimal point before parsing.

    Args:
        entry: One raw line of text from the input file.

    Returns:
        A ``(country, capital, lat, lon)`` tuple with the names stripped of
        surrounding whitespace and the coordinates as floats.

    Raises:
        IndexError: If the line has fewer than five tab-separated fields.
        ValueError: If a coordinate field is not a valid number.
    """
    components = entry.split("\t")
    # The raw names carry stray padding (it leaked into the printed result as
    # "Asunción  and Taipei ,"), so normalise them here once for all consumers.
    country, capital = components[1].strip(), components[2].strip()
    lat, lon = (float(value.replace(',', '.')) for value in components[3:5])
    return (country, capital, lat, lon)

# Parse every line of the file into a (country, capital, lat, lon) tuple.
# Cached because the cartesian self-join in the next cell consumes this RDD twice;
# without it the file would be re-read and re-parsed for each side of the join.
parsed_data = sc.textFile(data_path).map(extract_data).cache()

In [8]:
#Computing the distances
def compute_distance(data1, data2, radius=6371.0):
    """Return the great-circle (haversine) distance between two records.

    Args:
        data1: Sequence whose items at index 2 and 3 are latitude and
            longitude in decimal degrees.
        data2: Second record, same layout as ``data1``.
        radius: Sphere radius; the result is expressed in the same unit.
            Defaults to the Earth's mean radius in km (6371.0).

    Returns:
        The distance along the sphere's surface as a float.
    """
    lat1, lon1 = radians(data1[2]), radians(data1[3])
    lat2, lon2 = radians(data2[2]), radians(data2[3])

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) raise ValueError; clamp it first.
    a = min(1.0, max(0.0, a))
    distance = radius * 2 * atan2(sqrt(a), sqrt(1 - a))
    return distance

# Keep each unordered pair of distinct capitals exactly once. Comparing the names
# with `<` drops self-pairs, as `!=` did, and also drops the mirrored (B, A) copy of
# every (A, B). The haversine distance is symmetric, so this halves the computations
# and makes the order of the reported pair deterministic.
combinations = parsed_data.cartesian(parsed_data).filter(
    lambda pair: pair[0][1] < pair[1][1]
)

# Map each pair to ((capital_a, capital_b), distance).
distances = combinations.map(
    lambda pair: ((pair[0][1], pair[1][1]), compute_distance(pair[0], pair[1]))
)


In [9]:
# Pick the ((capital_a, capital_b), distance) record with the greatest distance.
max_dist_pair = distances.max(key=lambda record: record[1])

# Build the report line first, then emit it.
report = f"The two farthest capital cities are {max_dist_pair[0][0]} and {max_dist_pair[0][1]}, and the distance between them is {max_dist_pair[1]:.2f} km."
print(report)


The two farthest capital cities are Asunción  and Taipei , and the distance between them is 19927.06 km.
