In [1]:
from pyspark.sql import SparkSession

# Obtain the shared SparkSession for this notebook (an existing session is
# reused if there is one; otherwise a new one is created).
spark = (
    SparkSession.builder
    .appName("Rdd Example")
    .getOrCreate()
)

In [2]:
# Sample call records as inline CSV text. The first line is the header; each
# following line is one call:
#   call_id, caller, receiver, city, call_type, duration_seconds, cost
data = """call_id,caller,receiver,city,call_type,duration_seconds,cost
C001,Amit,Rahul,Hyderabad,Local,180,2.5
C002,Neha,Arjun,Bangalore,STD,320,6.0
C003,Rahul,Pooja,Delhi,Local,60,1.0
C004,Pooja,Neha,Mumbai,ISD,900,25.0
C005,Arjun,Amit,Chennai,STD,400,7.5
C006,Sneha,Karan,Hyderabad,Local,240,3.0
C007,Karan,Sneha,Delhi,Local,120,2.0
C008,Riya,Vikas,Bangalore,STD,360,6.5
C009,Vikas,Riya,Mumbai,ISD,1100,30.0
C010,Anjali,Sanjay,Chennai,Local,90,1.5
C011,Farhan,Ayesha,Delhi,STD,420,7.0
C012,Ayesha,Farhan,Hyderabad,ISD,950,28.0
C013,Suresh,Divya,Bangalore,Local,150,2.0
C014,Divya,Suresh,Mumbai,STD,380,6.8
C015,Nikhil,Priya,Delhi,Local,200,2.8
C016,Priya,Nikhil,Chennai,STD,410,7.2
C017,Rohit,Kavya,Hyderabad,Local,170,2.3
C018,Kavya,Rohit,Bangalore,Local,140,2.1
C019,Manish,Tina,Mumbai,ISD,1000,27.0
C020,Tina,Manish,Delhi,STD,350,6.2
"""

# Write the text to disk so it can be read back through Spark.
# Mode "w" overwrites any existing file, so re-running this cell is safe.
with open("call_records.csv", "w") as csv_file:
    csv_file.write(data)

#Task

In [3]:
# Read the CSV file using sparkContext.textFile and display the first 5 records.
# textFile produces one string per line of the file, so the header row is
# still part of the RDD (and of this preview) at this stage.
rdd = spark.sparkContext.textFile(name="call_records.csv")
rdd.take(num=5)

['call_id,caller,receiver,city,call_type,duration_seconds,cost',
 'C001,Amit,Rahul,Hyderabad,Local,180,2.5',
 'C002,Neha,Arjun,Bangalore,STD,320,6.0',
 'C003,Rahul,Pooja,Delhi,Local,60,1.0',
 'C004,Pooja,Neha,Mumbai,ISD,900,25.0']

In [6]:
# Remove the header row and create a clean RDD containing only data rows.
# The header is the first line of the file; every line that is not equal to
# it is kept as a data row.
header = rdd.first()
data_rdd = rdd.filter(
    lambda line: line != header
)
data_rdd.collect()

['C001,Amit,Rahul,Hyderabad,Local,180,2.5',
 'C002,Neha,Arjun,Bangalore,STD,320,6.0',
 'C003,Rahul,Pooja,Delhi,Local,60,1.0',
 'C004,Pooja,Neha,Mumbai,ISD,900,25.0',
 'C005,Arjun,Amit,Chennai,STD,400,7.5',
 'C006,Sneha,Karan,Hyderabad,Local,240,3.0',
 'C007,Karan,Sneha,Delhi,Local,120,2.0',
 'C008,Riya,Vikas,Bangalore,STD,360,6.5',
 'C009,Vikas,Riya,Mumbai,ISD,1100,30.0',
 'C010,Anjali,Sanjay,Chennai,Local,90,1.5',
 'C011,Farhan,Ayesha,Delhi,STD,420,7.0',
 'C012,Ayesha,Farhan,Hyderabad,ISD,950,28.0',
 'C013,Suresh,Divya,Bangalore,Local,150,2.0',
 'C014,Divya,Suresh,Mumbai,STD,380,6.8',
 'C015,Nikhil,Priya,Delhi,Local,200,2.8',
 'C016,Priya,Nikhil,Chennai,STD,410,7.2',
 'C017,Rohit,Kavya,Hyderabad,Local,170,2.3',
 'C018,Kavya,Rohit,Bangalore,Local,140,2.1',
 'C019,Manish,Tina,Mumbai,ISD,1000,27.0',
 'C020,Tina,Manish,Delhi,STD,350,6.2']

In [7]:
# Split each row into individual fields using a delimiter.
# Each CSV line becomes a list of 7 strings, in header order:
#   [call_id, caller, receiver, city, call_type, duration_seconds, cost]
# Values stay as strings here; later cells convert them with int()/float().
#
# `fields` is the input to every analysis cell that follows, so it is marked
# for caching. Without this, each later action would re-read the file and
# re-split every line. The computed results are unchanged.
fields = data_rdd.map(lambda line: line.split(",")).cache()
fields.collect()

[['C001', 'Amit', 'Rahul', 'Hyderabad', 'Local', '180', '2.5'],
 ['C002', 'Neha', 'Arjun', 'Bangalore', 'STD', '320', '6.0'],
 ['C003', 'Rahul', 'Pooja', 'Delhi', 'Local', '60', '1.0'],
 ['C004', 'Pooja', 'Neha', 'Mumbai', 'ISD', '900', '25.0'],
 ['C005', 'Arjun', 'Amit', 'Chennai', 'STD', '400', '7.5'],
 ['C006', 'Sneha', 'Karan', 'Hyderabad', 'Local', '240', '3.0'],
 ['C007', 'Karan', 'Sneha', 'Delhi', 'Local', '120', '2.0'],
 ['C008', 'Riya', 'Vikas', 'Bangalore', 'STD', '360', '6.5'],
 ['C009', 'Vikas', 'Riya', 'Mumbai', 'ISD', '1100', '30.0'],
 ['C010', 'Anjali', 'Sanjay', 'Chennai', 'Local', '90', '1.5'],
 ['C011', 'Farhan', 'Ayesha', 'Delhi', 'STD', '420', '7.0'],
 ['C012', 'Ayesha', 'Farhan', 'Hyderabad', 'ISD', '950', '28.0'],
 ['C013', 'Suresh', 'Divya', 'Bangalore', 'Local', '150', '2.0'],
 ['C014', 'Divya', 'Suresh', 'Mumbai', 'STD', '380', '6.8'],
 ['C015', 'Nikhil', 'Priya', 'Delhi', 'Local', '200', '2.8'],
 ['C016', 'Priya', 'Nikhil', 'Chennai', 'STD', '410', '7.2'],
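 ['C017', 'Rohit', 'Kavya', 'Hyderabad', 'Local', '170', '2.3'],
 ['C018', 'Kavya', 'Rohit', 'Bangalore', 'Local', '140', '2.1'],
 ['C019', 'Manish', 'Tina', 'Mumbai', 'ISD', '1000', '27.0'],
 ['C020', 'Tina', 'Manish', 'Delhi', 'STD', '350', '6.2']]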

In [8]:
# Calculate the total call cost per city.
# Emit (city, cost) for every record, then add up the costs under each city.
city_cost = (
    fields
    .map(lambda record: (record[3], float(record[6])))
    .reduceByKey(lambda running_total, cost: running_total + cost)
)

city_cost.collect()

[('Hyderabad', 35.8),
 ('Delhi', 19.0),
 ('Mumbai', 88.8),
 ('Bangalore', 16.6),
 ('Chennai', 16.2)]

In [9]:
# Identify the city with the highest total call cost.
# Each element of city_cost is a (city, total_cost) pair; RDD.max with a key
# function selects the pair with the largest total. This replaces a
# hand-written reduce comparator and matches how the longest call is found
# with fields.max(key=...) later in the notebook.
highest_city = city_cost.max(key=lambda city_total: city_total[1])
highest_city

('Mumbai', 88.8)

In [10]:
# Calculate the total call duration per call type (Local, STD, ISD).
# Emit (call_type, duration_seconds) as an int, then sum per call type.
calltype_duration = (
    fields
    .map(lambda record: (record[4], int(record[5])))
    .reduceByKey(lambda total_seconds, seconds: total_seconds + seconds)
)

calltype_duration.collect()

[('Local', 1350), ('STD', 2640), ('ISD', 3950)]

In [11]:
# Count the number of calls per city.
# Emit (city, 1) per record and sum the ones to get a count per city.
calls_per_city = (
    fields
    .map(lambda record: (record[3], 1))
    .reduceByKey(lambda count_so_far, one: count_so_far + one)
)

calls_per_city.collect()

[('Hyderabad', 4),
 ('Delhi', 5),
 ('Mumbai', 4),
 ('Bangalore', 4),
 ('Chennai', 3)]

In [16]:
# Calculate the average call cost per city using RDD transformations.
# Step 1: carry a (cost_sum, call_count) pair per city and add the pairs
#         element-wise while reducing.
city_sum_count = (
    fields
    .map(lambda record: (record[3], (float(record[6]), 1)))
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
)

# Step 2: divide the summed cost by the call count for each city.
city_avg_cost = city_sum_count.map(
    lambda city_totals: (city_totals[0], city_totals[1][0] / city_totals[1][1])
)
city_avg_cost.collect()

[('Hyderabad', 8.95),
 ('Delhi', 3.8),
 ('Mumbai', 22.2),
 ('Bangalore', 4.15),
 ('Chennai', 5.3999999999999995)]

In [17]:
# Filter and list all high-value calls where call cost is greater than 20.
# The comparison is strict, so a call costing exactly 20 is not included.
high_value_calls = fields.filter(
    lambda record: float(record[6]) > 20
)
high_value_calls.collect()


[['C004', 'Pooja', 'Neha', 'Mumbai', 'ISD', '900', '25.0'],
 ['C009', 'Vikas', 'Riya', 'Mumbai', 'ISD', '1100', '30.0'],
 ['C012', 'Ayesha', 'Farhan', 'Hyderabad', 'ISD', '950', '28.0'],
 ['C019', 'Manish', 'Tina', 'Mumbai', 'ISD', '1000', '27.0']]

In [18]:
# Count the number of ISD calls per city.
# Keep only records whose call_type is "ISD", then count them per city.
# Cities with no ISD calls do not appear in the result.
isd_calls_per_city = (
    fields
    .filter(lambda record: record[4] == "ISD")
    .map(lambda record: (record[3], 1))
    .reduceByKey(lambda count_so_far, one: count_so_far + one)
)

isd_calls_per_city.collect()


[('Mumbai', 3), ('Hyderabad', 1)]

In [19]:
# Identify the longest call based on call duration.
# duration_seconds is stored as a string, so it is converted to int for the
# comparison; the full record of the longest call is returned.
longest_call = fields.max(
    key=lambda record: int(record[5])
)
longest_call

['C009', 'Vikas', 'Riya', 'Mumbai', 'ISD', '1100', '30.0']

In [20]:
# Calculate the total revenue generated by each caller.
# Revenue is the sum of call costs, keyed by the caller field.
caller_revenue = (
    fields
    .map(lambda record: (record[1], float(record[6])))
    .reduceByKey(lambda running_total, cost: running_total + cost)
)
caller_revenue.collect()

[('Amit', 2.5),
 ('Pooja', 25.0),
 ('Karan', 2.0),
 ('Riya', 6.5),
 ('Vikas', 30.0),
 ('Suresh', 2.0),
 ('Divya', 6.8),
 ('Nikhil', 2.8),
 ('Rohit', 2.3),
 ('Manish', 27.0),
 ('Tina', 6.2),
 ('Neha', 6.0),
 ('Rahul', 1.0),
 ('Arjun', 7.5),
 ('Sneha', 3.0),
 ('Anjali', 1.5),
 ('Farhan', 7.0),
 ('Ayesha', 28.0),
 ('Priya', 7.2),
 ('Kavya', 2.1)]

In [22]:
"""Detect suspicious calls based on the following rule:
duration greater than 900 seconds
cost greater than 25"""

# A call is kept only when BOTH thresholds are strictly exceeded:
# duration_seconds > 900 and cost > 25.
sus_calls = fields.filter(
    lambda record: int(record[5]) > 900 and float(record[6]) > 25
)

sus_calls.collect()

[['C009', 'Vikas', 'Riya', 'Mumbai', 'ISD', '1100', '30.0'],
 ['C012', 'Ayesha', 'Farhan', 'Hyderabad', 'ISD', '950', '28.0'],
 ['C019', 'Manish', 'Tina', 'Mumbai', 'ISD', '1000', '27.0']]