<a href="https://colab.research.google.com/github/EonTechie/Big_Data_Processing_Spark_Projects/blob/main/spark-rdd-tasks/SeismicEventCorrelation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Filiz-Yıldız-Part1-Question2
"""
Dataset: Capitals.txt
Goal: Find the two capital cities that are the furthest apart.

My Approach:
This problem was very similar to Question 1, but instead of finding the minimum distance, I searched for the maximum.
I reused the same logic to compute pairwise distances using the Haversine formula via RDD and applied max() to extract the furthest pair.
Again, all was done purely with RDD operations — no DataFrames used as requested.


"""
# Connect Colab to my Google Drive account so the dataset stored there can be read
from google.colab import drive
drive.mount('/content/drive')

# Print the names of all files in the datasets folder (optional sanity check)
import os
folder_path = "/content/drive/My Drive/datasets"
files = os.listdir(folder_path)
print(files)

Mounted at /content/drive
['2.txt', 'Capitals.txt', 'EartquakeData-07032025.txt', 'DollarDataset.txt', 'couples.txt', 'join-actors.txt', 'points-null-values.txt', 'numbers-test.txt', 'join-series.txt', 'points.txt', 'names.txt', 'Lottery.txt', 'JamesJoyce-Ulyses.txt', 'world.txt', 'points-places.txt', 'Iris.csv', 'ml-latest-small']


In [None]:
# Import the SparkContext class, the entry point for RDD-based Spark applications
from pyspark import SparkContext

# Obtain the SparkContext object (sc) for this notebook session.
# getOrCreate() returns the already-running context when this cell is executed
# again; calling SparkContext() a second time in the same kernel would raise
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

In [None]:
# Parse a line from the Capitals.txt file and extract the capital name and coordinates
def parse_line(line):
    """Parse one tab-separated line of Capitals.txt.

    The capital name is read from the 3rd column (index 2) and the latitude
    and longitude from the 4th and 5th columns (index 3 and 4). Coordinates
    may use a comma as decimal separator (e.g. "34,53"); it is replaced by a
    dot before conversion because float() expects "34.53".

    Args:
        line: A single text line of the dataset.

    Returns:
        A tuple (capital_name, (latitude, longitude)) with the name stripped
        of surrounding whitespace, or None when the line has fewer than five
        columns or its coordinates are not numeric. Returning None lets the
        caller drop bad rows with filter().
    """
    # The dataset uses tab-separated values
    parts = line.split('\t')

    # We need at least the capital name, latitude and longitude columns
    if len(parts) < 5:
        return None

    # Strip padding so names compare and print cleanly ('Kabul ' -> 'Kabul')
    name = parts[2].strip()

    try:
        lat = float(parts[3].replace(',', '.'))
        lon = float(parts[4].replace(',', '.'))
    except ValueError:
        # Header rows or empty/non-numeric coordinates are treated as invalid
        # lines instead of failing the whole Spark job
        return None

    # (name, (lat, lon)) is the shape expected by the distance calculations
    return (name, (lat, lon))


In [None]:
# Build the capitals RDD from Capitals.txt in three lazy steps:
#   textFile -> one RDD element per line of the file
#   map      -> parse_line turns each line into (name, (lat, lon)) or None (transformation)
#   filter   -> discard the None entries produced for invalid lines (transformation)
capitals_rdd = (
    sc.textFile("/content/drive/My Drive/datasets/Capitals.txt")
    .map(parse_line)
    .filter(lambda record: record is not None)
)

# take is an action: it runs the pipeline and returns the first 5 elements
capitals_rdd.take(5)


[('Kabul ', (34.53, 69.17)),
 ('Tirana ', (41.33, 19.82)),
 ('Algiers ', (36.75, 3.04)),
 ('Pago Pago ', (-14.28, -170.7)),
 ('Andorra la Vella ', (42.51, 1.52))]

In [None]:
# Pair every capital with every capital via the cartesian product (transformation).
# Each element has the shape ((CityA, (latA, lonA)), (CityB, (latB, lonB))).
# The filter (transformation) then drops pairs whose two names are equal,
# e.g. (Paris, Paris), so only pairs of *different* cities remain.
pairs = (
    capitals_rdd
    .cartesian(capitals_rdd)
    .filter(lambda pair: pair[0][0] != pair[1][0])
)

# Peek at the first 5 remaining city pairs as a sanity check (action)
pairs.take(5)

[(('Kabul ', (34.53, 69.17)), ('Tirana ', (41.33, 19.82))),
 (('Kabul ', (34.53, 69.17)), ('Algiers ', (36.75, 3.04))),
 (('Kabul ', (34.53, 69.17)), ('Pago Pago ', (-14.28, -170.7))),
 (('Kabul ', (34.53, 69.17)), ('Andorra la Vella ', (42.51, 1.52))),
 (('Kabul ', (34.53, 69.17)), ('Luanda ', (-8.84, 13.23)))]

In [None]:
import math

# Haversine formula to calculate the great-circle distance between two coordinates on Earth
def haversine(coord1, coord2, radius=6371):
    """Great-circle distance between two points using the haversine formula.

    Args:
        coord1: (latitude, longitude) of the first point, in degrees.
        coord2: (latitude, longitude) of the second point, in degrees.
        radius: Sphere radius; the result is in the same unit. Defaults to
            6371, the radius of Earth in kilometers.

    Returns:
        The distance along the sphere's surface between the two points.
    """
    # Unpack latitude and longitude from both coordinates
    lat1, lon1 = coord1
    lat2, lon2 = coord2

    # Convert degrees to radians since math functions use radians
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    d_phi = math.radians(lat2 - lat1)        # difference in latitude
    d_lambda = math.radians(lon2 - lon1)     # difference in longitude

    # Apply the haversine formula
    a = math.sin(d_phi / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(d_lambda / 2)**2

    # For (nearly) antipodal points rounding can push `a` a hair above 1, which
    # would make math.sqrt(1 - a) raise ValueError; clamp it to the valid range.
    a = min(1.0, max(0.0, a))

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    # Scale the central angle by the sphere radius to get the distance
    return radius * c


In [None]:
# Turn every city pair into ((city1_name, city2_name), distance_in_km).
# map is a transformation; the distance comes from calling haversine on the
# two coordinate tuples of the pair.
distances = pairs.map(
    lambda pair: (
        (pair[0][0], pair[1][0]),
        haversine(pair[0][1], pair[1][1]),
    )
)

# Inspect the first 5 results to verify the distances (action)
distances.take(5)


[(('Kabul ', 'Tirana '), 4334.696822428333),
 (('Kabul ', 'Algiers '), 5857.721358797607),
 (('Kabul ', 'Pago Pago '), 13646.126711152365),
 (('Kabul ', 'Andorra la Vella '), 5795.92545456428),
 (('Kabul ', 'Luanda '), 7601.072171051554)]

In [None]:
# Pick the city pair with the MAXIMUM distance, comparing on the distance value
# (element 1 of each tuple).
# max() is an action: it triggers the RDD computation and returns the result
# to the driver as a plain Python object.
farthest = distances.max(key=lambda entry: entry[1])

# Print the two capitals that are furthest apart and the distance between them
print(
    "Farthest cities are", farthest[0][0],
    "and", farthest[0][1],
    "with", farthest[1], "km",
)

Farthest cities are Asunción  and Taipei  with 19927.05728016978 km
