In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder\
.appName('Read CSV Example')\
.getOrCreate()

In [2]:
data = """ call_id,caller,receiver,city,call_type,duration_seconds,cost
C001,Amit,Rahul,Hyderabad,Local,180,2.5
C002,Neha,Arjun,Bangalore,STD,320,6.0
C003,Rahul,Pooja,Delhi,Local,60,1.0
C004,Pooja,Neha,Mumbai,ISD,900,25.0
C005,Arjun,Amit,Chennai,STD,400,7.5
C006,Sneha,Karan,Hyderabad,Local,240,3.0
C007,Karan,Sneha,Delhi,Local,120,2.0
C008,Riya,Vikas,Bangalore,STD,360,6.5
C009,Vikas,Riya,Mumbai,ISD,1100,30.0
C010,Anjali,Sanjay,Chennai,Local,90,1.5
C011,Farhan,Ayesha,Delhi,STD,420,7.0
C012,Ayesha,Farhan,Hyderabad,ISD,950,28.0
C013,Suresh,Divya,Bangalore,Local,150,2.0
C014,Divya,Suresh,Mumbai,STD,380,6.8
C015,Nikhil,Priya,Delhi,Local,200,2.8
C016,Priya,Nikhil,Chennai,STD,410,7.2
C017,Rohit,Kavya,Hyderabad,Local,170,2.3
C018,Kavya,Rohit,Bangalore,Local,140,2.1
C019,Manish,Tina,Mumbai,ISD,1000,27.0
C020,Tina,Manish,Delhi,STD,350,6.2
"""

with open("Telecom_Call_Records.csv", "w") as f:
  f.write(data)

Task 1

Read the CSV file using sparkContext.textFile and display the first 5 records.

In [3]:
rdd = spark.sparkContext.textFile("Telecom_Call_Records.csv")
rdd.take(5)

[' call_id,caller,receiver,city,call_type,duration_seconds,cost',
 'C001,Amit,Rahul,Hyderabad,Local,180,2.5',
 'C002,Neha,Arjun,Bangalore,STD,320,6.0',
 'C003,Rahul,Pooja,Delhi,Local,60,1.0',
 'C004,Pooja,Neha,Mumbai,ISD,900,25.0']

Task 2

Remove the header row and create a clean RDD containing only data rows.

In [4]:
header = rdd.first()
data_rdd = rdd.filter(lambda row: row != header)
data_rdd.collect()

['C001,Amit,Rahul,Hyderabad,Local,180,2.5',
 'C002,Neha,Arjun,Bangalore,STD,320,6.0',
 'C003,Rahul,Pooja,Delhi,Local,60,1.0',
 'C004,Pooja,Neha,Mumbai,ISD,900,25.0',
 'C005,Arjun,Amit,Chennai,STD,400,7.5',
 'C006,Sneha,Karan,Hyderabad,Local,240,3.0',
 'C007,Karan,Sneha,Delhi,Local,120,2.0',
 'C008,Riya,Vikas,Bangalore,STD,360,6.5',
 'C009,Vikas,Riya,Mumbai,ISD,1100,30.0',
 'C010,Anjali,Sanjay,Chennai,Local,90,1.5',
 'C011,Farhan,Ayesha,Delhi,STD,420,7.0',
 'C012,Ayesha,Farhan,Hyderabad,ISD,950,28.0',
 'C013,Suresh,Divya,Bangalore,Local,150,2.0',
 'C014,Divya,Suresh,Mumbai,STD,380,6.8',
 'C015,Nikhil,Priya,Delhi,Local,200,2.8',
 'C016,Priya,Nikhil,Chennai,STD,410,7.2',
 'C017,Rohit,Kavya,Hyderabad,Local,170,2.3',
 'C018,Kavya,Rohit,Bangalore,Local,140,2.1',
 'C019,Manish,Tina,Mumbai,ISD,1000,27.0',
 'C020,Tina,Manish,Delhi,STD,350,6.2']

Task 3

Split each row into individual fields using a delimiter.

In [7]:
split_rdd = data_rdd.map(lambda row: row.split(","))
split_rdd.take(3)

[['C001', 'Amit', 'Rahul', 'Hyderabad', 'Local', '180', '2.5'],
 ['C002', 'Neha', 'Arjun', 'Bangalore', 'STD', '320', '6.0'],
 ['C003', 'Rahul', 'Pooja', 'Delhi', 'Local', '60', '1.0']]

Task 4

Calculate the total call cost per city.

In [9]:
city_call_rdd = split_rdd.map(lambda x: (x[3], float(x[6])))
city_call_rdd.collect()

[('Hyderabad', 2.5),
 ('Bangalore', 6.0),
 ('Delhi', 1.0),
 ('Mumbai', 25.0),
 ('Chennai', 7.5),
 ('Hyderabad', 3.0),
 ('Delhi', 2.0),
 ('Bangalore', 6.5),
 ('Mumbai', 30.0),
 ('Chennai', 1.5),
 ('Delhi', 7.0),
 ('Hyderabad', 28.0),
 ('Bangalore', 2.0),
 ('Mumbai', 6.8),
 ('Delhi', 2.8),
 ('Chennai', 7.2),
 ('Hyderabad', 2.3),
 ('Bangalore', 2.1),
 ('Mumbai', 27.0),
 ('Delhi', 6.2)]

In [10]:
total_call_per_city = city_call_rdd.reduceByKey(lambda x, y: x + y)
total_call_per_city.collect()

[('Hyderabad', 35.8),
 ('Delhi', 19.0),
 ('Mumbai', 88.8),
 ('Bangalore', 16.6),
 ('Chennai', 16.2)]

Calculate the total call cost per city.

Task 5

Identify the city with the highest total call cost.

In [11]:
highest_city = total_call_per_city.reduce(
    lambda x, y: x if x[1] > y[1] else y
)
highest_city

('Mumbai', 88.8)

Task 6
Calculate the total call duration per call type (Local, STD, ISD).

In [13]:
calls_type_duration = split_rdd.map(lambda x: (x[4], int(x[5]))).reduceByKey(lambda x, y: x + y)
calls_type_duration.collect()

[('Local', 1350), ('STD', 2640), ('ISD', 3950)]

Task 7

Count the number of calls per city.

In [14]:
calls_per_city = split_rdd.map(lambda x: (x[3], 1)).reduceByKey(lambda x, y: x + y)
calls_per_city.collect()

[('Hyderabad', 4),
 ('Delhi', 5),
 ('Mumbai', 4),
 ('Bangalore', 4),
 ('Chennai', 3)]

Task 7

Count the number of calls per city.

In [17]:
city_cost_pair = split_rdd.map(lambda x: (x[3], (float(x[6]), 1)))
city_cost_sum = city_cost_pair.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
avg_call_cost_city = city_cost_sum.mapValues(lambda x: x[0] / x[1])
avg_call_cost_city.collect()

[('Hyderabad', 8.95),
 ('Delhi', 3.8),
 ('Mumbai', 22.2),
 ('Bangalore', 4.15),
 ('Chennai', 5.3999999999999995)]

Task 9

Filter and list all high-value calls where call cost is greater than 20.

In [19]:
high_value_calls = split_rdd.filter(lambda x: float(x[6]) > 20)
high_value_calls.collect()

[['C004', 'Pooja', 'Neha', 'Mumbai', 'ISD', '900', '25.0'],
 ['C009', 'Vikas', 'Riya', 'Mumbai', 'ISD', '1100', '30.0'],
 ['C012', 'Ayesha', 'Farhan', 'Hyderabad', 'ISD', '950', '28.0'],
 ['C019', 'Manish', 'Tina', 'Mumbai', 'ISD', '1000', '27.0']]

Task 10

Count the number of ISD calls per city.

In [21]:
isd_calls_per_city = (
    split_rdd.filter(lambda x: x[4] == "ISD")
    .map(lambda x: (x[3], 1))
    .reduceByKey(lambda x, y: x + y)
)
isd_calls_per_city.collect()

[('Mumbai', 3), ('Hyderabad', 1)]

Task 11

Identify the longest call based on call duration.

In [23]:
longest_call = split_rdd.max(
    key = lambda x: int(x[5])
)
longest_call

['C009', 'Vikas', 'Riya', 'Mumbai', 'ISD', '1100', '30.0']

Task 12

Calculate the total revenue generated by each caller.

In [24]:
revenue_per_caller = (
    split_rdd.map(lambda x: (x[1], float(x[5]))) \
    .reduceByKey(lambda x, y: x + y)
)
revenue_per_caller.collect()

[('Amit', 180.0),
 ('Pooja', 900.0),
 ('Karan', 120.0),
 ('Riya', 360.0),
 ('Vikas', 1100.0),
 ('Suresh', 150.0),
 ('Divya', 380.0),
 ('Nikhil', 200.0),
 ('Rohit', 170.0),
 ('Manish', 1000.0),
 ('Tina', 350.0),
 ('Neha', 320.0),
 ('Rahul', 60.0),
 ('Arjun', 400.0),
 ('Sneha', 240.0),
 ('Anjali', 90.0),
 ('Farhan', 420.0),
 ('Ayesha', 950.0),
 ('Priya', 410.0),
 ('Kavya', 140.0)]

Task 13

Detect suspicious calls based on the following rule:
duration greater than 900 seconds
cost greater than 25

In [26]:
suspicious_calls = split_rdd.filter(
    lambda x: int(x[5]) > 900 and float(x[6]) > 25)
suspicious_calls.collect()

[['C009', 'Vikas', 'Riya', 'Mumbai', 'ISD', '1100', '30.0'],
 ['C012', 'Ayesha', 'Farhan', 'Hyderabad', 'ISD', '950', '28.0'],
 ['C019', 'Manish', 'Tina', 'Mumbai', 'ISD', '1000', '27.0']]