In [1]:
from pyspark.sql import SparkSession
# Build the Spark session used by every later cell; getOrCreate() hands back
# an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('Read CSV Example')
    .getOrCreate()
)

In [2]:
# Sample employee records, written to disk so the later cells have a real
# CSV file to read. First line is the column header; ten data rows follow.
data = """id,name,city,age,salary
1,Arjun,Hyderabad,25,45000
2,Meera,Chennai,32,52000
3,Rajesh,Bangalore,29,61000
4,Priya,Delhi,22,38000
5,Sanjay,Mumbai,35,72000
6,Kavya,Hyderabad,28,48000
7,Imran,Delhi,31,53000
8,Divya,Chennai,27,45000
9,Anil,Bangalore,40,85000
10,Ritu,Mumbai,23,39000
"""

# Overwrites employees.csv in the current working directory on every run.
with open("employees.csv", "w") as csv_file:
    csv_file.write(data)

In [3]:
# Load the file as an RDD with one string element per line of text, then
# preview the first five lines (the header is still included at this point).
rdd = spark.sparkContext.textFile(
    "employees.csv"
)
rdd.take(5)

['id,name,city,age,salary',
 '1,Arjun,Hyderabad,25,45000',
 '2,Meera,Chennai,32,52000',
 '3,Rajesh,Bangalore,29,61000',
 '4,Priya,Delhi,22,38000']

In [4]:
# The first line of the file is the column header. Keep only the lines that
# differ from it; note this drops every line identical to the header, not
# just the first one.
header = rdd.first()
data_rdd = rdd.filter(
    lambda line: line != header
)
data_rdd.collect()

['1,Arjun,Hyderabad,25,45000',
 '2,Meera,Chennai,32,52000',
 '3,Rajesh,Bangalore,29,61000',
 '4,Priya,Delhi,22,38000',
 '5,Sanjay,Mumbai,35,72000',
 '6,Kavya,Hyderabad,28,48000',
 '7,Imran,Delhi,31,53000',
 '8,Divya,Chennai,27,45000',
 '9,Anil,Bangalore,40,85000',
 '10,Ritu,Mumbai,23,39000']

In [5]:
# Break each raw line into a list of string fields on the comma delimiter.
# Plain str.split is used, so quoted fields containing commas are not handled.
split_rdd = data_rdd.map(
    lambda line: line.split(",")
)
split_rdd.take(3)

[['1', 'Arjun', 'Hyderabad', '25', '45000'],
 ['2', 'Meera', 'Chennai', '32', '52000'],
 ['3', 'Rajesh', 'Bangalore', '29', '61000']]

In [6]:
# Project each record to a (city, salary) pair: index 2 is the city column
# and index 4 is the salary column, converted from text to float.
city_salary_rdd = split_rdd.map(
    lambda fields: (fields[2], float(fields[4]))
)
city_salary_rdd.collect()

[('Hyderabad', 45000.0),
 ('Chennai', 52000.0),
 ('Bangalore', 61000.0),
 ('Delhi', 38000.0),
 ('Mumbai', 72000.0),
 ('Hyderabad', 48000.0),
 ('Delhi', 53000.0),
 ('Chennai', 45000.0),
 ('Bangalore', 85000.0),
 ('Mumbai', 39000.0)]

In [7]:
# Add up the salaries that share the same city key, yielding one
# (city, total_salary) pair per city.
total_salary_per_city = city_salary_rdd.reduceByKey(
    lambda running_total, salary: running_total + salary
)
total_salary_per_city.collect()

[('Hyderabad', 93000.0),
 ('Delhi', 91000.0),
 ('Mumbai', 111000.0),
 ('Chennai', 97000.0),
 ('Bangalore', 146000.0)]

In [8]:
# Pairwise-reduce the (city, total) pairs down to the one with the largest
# total. The comparison is strict, so when two totals are equal the
# right-hand operand of that comparison is the one kept.
highest_city = total_salary_per_city.reduce(
    lambda best, candidate: best if best[1] > candidate[1] else candidate
)
highest_city

('Bangalore', 146000.0)