In [5]:
import csv
# Set up SparkContext
from pyspark import SparkConf, SparkContext
conf = SparkConf().setAppName("AverageFriendsByAge").setMaster("local[*]")
# Calling SparkContext(conf=conf) a second time in the same kernel (for example
# when this cell is re-run) fails with "Cannot run multiple SparkContexts at
# once". getOrCreate returns the already-active context if there is one and
# only builds a new context from `conf` when none exists, so the cell is safe
# to execute repeatedly.
sc = SparkContext.getOrCreate(conf=conf)

In [2]:
# Path to the input CSV file; change it to match where the file lives in your environment
file_path = "/content/fakefriends.csv"

def parse_line(line):
    """Extract an (age, num_friends) pair from one comma-separated record.

    The record is split on commas; the third field (index 2) is read as the
    age and the fourth field (index 3) as the number of friends, both
    converted with int().

    Raises:
        IndexError: if the record has fewer than four comma-separated fields.
        ValueError: if either field cannot be converted to an integer.
    """
    columns = line.split(",")
    # Tuple elements are evaluated left to right, so the age is converted first.
    return (int(columns[2]), int(columns[3]))

# Read the input file as an RDD with one element per text line
lines = sc.textFile(file_path)
# Turn every line into an (age, num_friends) pair using parse_line
age_friends_rdd = lines.map(parse_line)

In [3]:
# Transform: attach a counter of 1 to each friend count -> (age, (friends, 1))
totals_by_age = age_friends_rdd.mapValues(lambda friends: (friends, 1))

# Reduce: add the pairs element-wise per age -> (age, (sum_of_friends, record_count))
sum_count_by_age = totals_by_age.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)

# Average: divide the sum by the count and round to two decimals -> (age, avg)
avg_friends_by_age = sum_count_by_age.mapValues(
    lambda sum_count: round(sum_count[0] / sum_count[1], 2)
)


In [4]:
# Bring the per-age averages back to the driver as a list of (age, avg) tuples
results = avg_friends_by_age.collect()
# Print in ascending age order; sorted() builds a new list, so `results` itself
# keeps the order in which it was collected
for age, avg_friends in sorted(results):
    print(f"({age}, {avg_friends})")

(18, 343.38)
(19, 213.27)
(20, 165.0)
(21, 350.88)
(22, 206.43)
(23, 246.3)
(24, 233.8)
(25, 197.45)
(26, 242.06)
(27, 228.12)
(28, 209.1)
(29, 215.92)
(30, 235.82)
(31, 267.25)
(32, 207.91)
(33, 325.33)
(34, 245.5)
(35, 211.62)
(36, 246.6)
(37, 249.33)
(38, 193.53)
(39, 169.29)
(40, 250.82)
(41, 268.56)
(42, 303.5)
(43, 230.57)
(44, 282.17)
(45, 309.54)
(46, 223.69)
(47, 233.22)
(48, 281.4)
(49, 184.67)
(50, 254.6)
(51, 302.14)
(52, 340.64)
(53, 222.86)
(54, 278.08)
(55, 295.54)
(56, 306.67)
(57, 258.83)
(58, 116.55)
(59, 220.0)
(60, 202.71)
(61, 256.22)
(62, 220.77)
(63, 384.0)
(64, 281.33)
(65, 298.2)
(66, 276.44)
(67, 214.62)
(68, 269.6)
(69, 235.2)


In [7]:
# Write the averages to a CSV file: one header row, then one row per age in ascending order
output_path = "/content/average_friends_by_age.csv"

# newline="" lets the csv module control line endings itself
with open(output_path, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    # Header and data rows go out in a single writerows call
    writer.writerows([["Age", "AverageFriends"], *sorted(results)])

print(f"CSV file saved to: {output_path}")

CSV file saved to: /content/average_friends_by_age.csv


In [8]:
# Shut down the SparkContext now that all results have been collected and exported
sc.stop()
