In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.4/spark-3.2.4-bin-hadoop3.2.tgz
!tar xf spark-3.2.4-bin-hadoop3.2.tgz
!pip install -q findspark

In [28]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.4-bin-hadoop3.2"

In [29]:
import findspark
findspark.init()
findspark.find()

'/content/spark-3.2.4-bin-hadoop3.2'

In [35]:
# İmporting libraries
import math
from pyspark import SparkContext

In [36]:
# Initialize Spark context
spark_context = SparkContext.getOrCreate()
capital_data = spark_context.textFile("/content/Capitals.txt")
print(capital_data.first())

Afghanistan Flag Icon 	Afghanistan 	Kabul 	34,53 	69,17	


In [37]:
# Extracting required information from each line
def get_info_from_line(line):
    split_data = line.split("\t")
    country_name = split_data[1]
    city_name = split_data[2]
    lat_parts = split_data[3].split(",")
    lon_parts = split_data[4].split(",")
    latitude = float(lat_parts[0]) + float(lat_parts[1])
    longitude = float(lon_parts[0]) + float(lon_parts[1])
    return (country_name, city_name, latitude, longitude)

parsed_data = capitals_data.map(get_info_from_line)
print(parsed_data.take(3))

[('Afghanistan ', 'Kabul ', 87.0, 86.0), ('Albania ', 'Tirana ', 74.0, 101.0), ('Algeria ', 'Algiers ', 111.0, 7.0)]


In [38]:
# Generate city pairs
city_combinations = parsed_data.cartesian(parsed_data)
different_city_pairs = city_combinations.filter(lambda pair: pair[0][1] != pair[1][1])
print(different_city_pairs.take(3))
print(different_city_pairs.count())


[(('Afghanistan ', 'Kabul ', 87.0, 86.0), ('Albania ', 'Tirana ', 74.0, 101.0)), (('Afghanistan ', 'Kabul ', 87.0, 86.0), ('Algeria ', 'Algiers ', 111.0, 7.0)), (('Afghanistan ', 'Kabul ', 87.0, 86.0), ('American Samoa ', 'Pago Pago ', 14.0, -100.0))]
57838


In [39]:
# Calculate distance between cities
def calculate_distance(city_pair):
    city1 = city_pair[0]
    city2 = city_pair[1]

    lat1 = city1[2]
    lon1 = city1[3]
    lat2 = city2[2]
    lon2 = city2[3]

    d_lat = lat2 - lat1
    d_lon = lon2 - lon1
    #Haversine formula taken from the internet
    a_val = math.sin(d_lat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(d_lon/2)**2
    c_val = 2 * math.atan2(math.sqrt(a_val), math.sqrt(1-a_val))
    distance_in_km = 6371 * c_val
    return ((city1[1], city2[1]), distance_in_km)

city_distances = different_city_pairs.map(calculate_distance)
closest_pair = city_distances.min(key=lambda x: x[1])

print("The two closest capital cities are {} and {}, and the distance between them is {:.2f} unit.".format(closest_pair[0][0], closest_pair[0][1], closest_pair[1]))


The two closest capital cities are Jerusalem  and East Jerusalem , and the distance between them is 0.00 unit.
