## Task 1 - Query 1

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("COVID-19").getOrCreate()
people_large_df = spark.read.csv("PEOPLE-large.csv", header=True, inferSchema=True)
infected_small_df = spark.read.csv("INFECTED-small.csv", header=True, inferSchema=True)
contact_range = 6.0
people_large_rdd = people_large_df.select("id", "x", "y").rdd.map(lambda row: (row.id, (row.x, row.y)))
infected_small_rdd = infected_small_df.select("id", "x", "y").rdd.map(lambda row: (row.id, (row.x, row.y)))

# Applying cartesian product to find all pairs of infected and non-infected
all_pairs = infected_small_rdd.cartesian(people_large_rdd)
close_contact_pairs = all_pairs.filter(
    lambda pair: (pair[0][1][0] - pair[1][1][0]) ** 2 + (pair[0][1][1] - pair[1][1][1]) ** 2 <= contact_range ** 2
)
result = close_contact_pairs.map(lambda pair: (pair[1][0], pair[0][0]))
print(result.collect())

[(12336, 12336), (13121, 13121), (14149, 14149), (19682, 19682), (29124, 29124), (29547, 29547), (42768, 42768), (52271, 52271), (53509, 53509), (54380, 54380), (56996, 56996), (62345, 52271), (60826, 60826), (62508, 62508), (66100, 66100), (72252, 72252), (33428, 134760), (70070, 161828), (78572, 78572), (88609, 88609), (89728, 89728), (94692, 78572), (91366, 91366), (39501, 207185), (2122, 307894), (6189, 402530), (16507, 375278), (22864, 426956), (39130, 489577), (54375, 316236), (62199, 485865), (67750, 422429), (67362, 433709), (86294, 341202), (521, 761471), (48778, 764377), (46138, 978404), (59410, 721066), (59968, 764377), (78677, 737954), (88687, 724042), (160708, 53509), (102474, 102474), (104416, 104416), (106185, 106185), (114047, 114047), (134760, 134760), (136513, 136513), (138655, 169030), (141874, 78572), (143786, 143786), (145589, 145589), (154176, 154176), (161828, 161828), (164267, 164267), (169030, 169030), (140227, 190158), (154024, 207250), (168615, 179269), (1731

## Task 1 - Query 2

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("COVID-19").getOrCreate()
people_large_df = spark.read.csv("PEOPLE-large.csv", header=True, inferSchema=True)
infected_small_df = spark.read.csv("INFECTED-small.csv", header=True, inferSchema=True)
contact_range = 6.0
people_large_rdd = people_large_df.select("id", "x", "y").rdd.map(lambda row: (row.id, (row.x, row.y)))
infected_small_rdd = infected_small_df.select("id", "x", "y").rdd.map(lambda row: (row.id, (row.x, row.y)))

# Applying cartesian product to find all pairs of infected and non-infected
all_pairs = infected_small_rdd.cartesian(people_large_rdd)
close_contact_pairs = all_pairs.filter(
    lambda pair: (pair[0][1][0] - pair[1][1][0]) ** 2 + (pair[0][1][1] - pair[1][1][1]) ** 2 <= contact_range ** 2
)

# Extracting unique pi.id values
unique_contacts = close_contact_pairs.map(lambda pair: pair[1][0]).distinct()

print(unique_contacts.collect())

[42768, 70070, 91366, 39501, 138655, 154176, 173767, 137445, 207185, 228250, 251218, 254936, 263868, 268928, 294767, 300311, 302588, 312994, 325204, 344223, 349844, 382448, 435908, 410751, 449603, 489577, 554785, 592669, 652817, 761365, 702845, 724042, 785686, 817971, 831798, 874302, 884818, 914584, 29547, 66100, 89728, 67750, 187078, 190158, 213379, 244443, 231177, 312269, 378016, 385166, 472616, 433709, 548868, 570021, 506936, 666887, 649694, 670660, 766195, 819732, 778196, 780242, 859156, 920591, 919997, 924353, 941161, 960334, 987724, 969431, 995787, 997492, 998933, 54375, 106185, 154024, 179269, 99563, 210509, 375278, 426956, 453202, 405726, 500326, 518278, 548264, 514032, 594739, 587017, 627662, 593342, 650960, 673862, 679879, 698018, 769925, 814442, 908624, 877747, 884479, 928655, 943164, 999374, 979255, 998516, 14149, 19682, 39130, 136513, 179237, 103458, 165795, 313459, 360616, 312821, 323579, 352652, 382528, 385212, 408455, 428321, 429619, 460683, 461486, 478140, 419400, 5044

## Task 1 - Query 3

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("COVID-19").getOrCreate()
people_some_infected_large_df = spark.read.csv("PEOPLE-SOME-INFECTED-large.csv", header=True, inferSchema=True)
contact_range = 6.0
people_some_infected_df = people_some_infected_large_df.select("id", "x", "y", "INFECTED")
infected_people_df = people_some_infected_df.filter(people_some_infected_df.INFECTED == "yes")
infected_people_list = infected_people_df.collect()
def within_contact_range(point1, point2):
    return (point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2 <= contact_range ** 2

# Counting close contacts for each infected person
close_contacts_count = people_some_infected_df.rdd.flatMap(lambda p_j: [(infect_i.id, 1) if within_contact_range(infect_i, p_j) else (infect_i.id, 0) for infect_i in infected_people_list])

result = close_contacts_count.reduceByKey(lambda count1, count2: count1 + count2)
print(result.collect())

[(12336, 1), (29124, 3), (42768, 2), (62508, 1), (72252, 2), (134760, 2), (154176, 2), (212112, 3), (217212, 3), (263868, 3), (286032, 1), (293820, 2), (316236, 3), (355992, 2), (385212, 1), (390420, 2), (478140, 3), (585408, 2), (622164, 2), (735312, 2), (780684, 2), (813348, 4), (862956, 1), (873252, 1), (896664, 1), (943164, 1), (989244, 1), (14149, 1), (53509, 4), (88609, 1), (136513, 1), (179269, 3), (313933, 2), (353665, 2), (489577, 2), (515377, 1), (537337, 4), (554785, 1), (587017, 2), (587557, 3), (650005, 2), (764377, 5), (843025, 2), (941161, 1), (19682, 2), (143786, 2), (375278, 2), (385166, 2), (402530, 4), (427214, 3), (455750, 2), (461486, 1), (605162, 1), (673862, 1), (737954, 2), (851942, 1), (29547, 2), (210207, 3), (244443, 2), (281571, 3), (344223, 3), (410751, 1), (460683, 1), (893511, 2), (66100, 1), (89728, 4), (104416, 1), (382528, 1), (398656, 4), (459256, 2), (469456, 4), (643384, 1), (670660, 1), (694996, 5), (730120, 1), (890548, 3), (891448, 2), (914584, 2