In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the SparkSession for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("RDD Operations")
    .master("yarn")
    .getOrCreate()
)

25/05/18 08:11:09 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Sample customer records as CSV text: the first line is the header and each
# following line is one customer. splitlines() yields one string per line.
customers_data = """\
customer_id,name,city,state,country,registration_date,is_active
0,Customer_0,Bangalore,Karnataka,India,2023-11-11,True
1,Customer_1,Hyderabad,Delhi,India,2023-08-26,True
2,Customer_2,Ahmedabad,West Bengal,India,2023-06-23,True
3,Customer_3,Bangalore,Tamil Nadu,India,2023-03-24,False
4,Customer_4,Bangalore,Gujarat,India,2023-06-06,False
5,Customer_5,Delhi,Maharashtra,India,2023-04-19,False
""".splitlines()

In [4]:
# Turn the local Python list into an RDD through the session's SparkContext.
data_rdd = spark.sparkContext.parallelize(customers_data)

In [5]:
# collect() is an action: it returns every element of the RDD as a Python list.
data_rdd.collect()

                                                                                

['customer_id,name,city,state,country,registration_date,is_active',
 '0,Customer_0,Bangalore,Karnataka,India,2023-11-11,True',
 '1,Customer_1,Hyderabad,Delhi,India,2023-08-26,True',
 '2,Customer_2,Ahmedabad,West Bengal,India,2023-06-23,True',
 '3,Customer_3,Bangalore,Tamil Nadu,India,2023-03-24,False',
 '4,Customer_4,Bangalore,Gujarat,India,2023-06-06,False',
 '5,Customer_5,Delhi,Maharashtra,India,2023-04-19,False']

In [6]:
# Number of partitions the RDD is split into.
data_rdd.getNumPartitions()

2

In [7]:
# first() is an action: it returns the first element of the RDD,
# which here is the CSV header line.
header = data_rdd.first()
header

                                                                                

'customer_id,name,city,state,country,registration_date,is_active'

filter() -> a transformation that keeps only the elements satisfying a condition

In [8]:
# Drop the header line so that only customer records remain.
data_rdd = data_rdd.filter(lambda line: line != header)

In [9]:
# Displaying the RDD itself shows only its repr, not its contents;
# an action such as collect() is needed to see the data.
data_rdd

PythonRDD[2] at RDD at PythonRDD.scala:53

In [10]:
# Materialise the filtered RDD: the header line is no longer present.
data_rdd.collect()

['0,Customer_0,Bangalore,Karnataka,India,2023-11-11,True',
 '1,Customer_1,Hyderabad,Delhi,India,2023-08-26,True',
 '2,Customer_2,Ahmedabad,West Bengal,India,2023-06-23,True',
 '3,Customer_3,Bangalore,Tamil Nadu,India,2023-03-24,False',
 '4,Customer_4,Bangalore,Gujarat,India,2023-06-06,False',
 '5,Customer_5,Delhi,Maharashtra,India,2023-04-19,False']

### map() - It applies a function to each element in an RDD.

In [11]:
# Raw comma-separated lines that map() will be applied to.
data_rdd.collect()

['0,Customer_0,Bangalore,Karnataka,India,2023-11-11,True',
 '1,Customer_1,Hyderabad,Delhi,India,2023-08-26,True',
 '2,Customer_2,Ahmedabad,West Bengal,India,2023-06-23,True',
 '3,Customer_3,Bangalore,Tamil Nadu,India,2023-03-24,False',
 '4,Customer_4,Bangalore,Gujarat,India,2023-06-06,False',
 '5,Customer_5,Delhi,Maharashtra,India,2023-04-19,False']

In [13]:
# Inspect a single record: the raw line first, then its comma-separated
# fields, then the first two fields on their own.
raw_line = data_rdd.first()
print(raw_line)
first_row = raw_line.split(',')
print(first_row, first_row[0], first_row[1], sep='\n')

0,Customer_0,Bangalore,Karnataka,India,2023-11-11,True
['0', 'Customer_0', 'Bangalore', 'Karnataka', 'India', '2023-11-11', 'True']
0
Customer_0


In [23]:
def parse_row(row):
    """Parse one comma-separated customer record into a typed tuple.

    Args:
        row: A CSV line with exactly seven fields, in the order
            customer_id, name, city, state, country, registration_date,
            is_active.

    Returns:
        A 7-tuple ``(customer_id, name, city, state, country,
        registration_date, is_active)`` in which ``customer_id`` is an
        ``int``, ``is_active`` is a ``bool`` and the other fields are the
        original strings.

    Raises:
        ValueError: If the row does not contain exactly seven fields, or if
            the customer id is not an integer.
    """
    fields = row.split(',')
    # Fail loudly on malformed rows: too few fields would otherwise raise an
    # opaque IndexError, and extra fields (e.g. a comma inside a name) would
    # silently shift or drop columns.
    if len(fields) != 7:
        raise ValueError(
            f"expected 7 comma-separated fields, got {len(fields)}: {row!r}"
        )
    customer_id, name, city, state, country, registration_date, is_active = fields
    return (
        int(customer_id),
        name,
        city,
        state,
        country,
        registration_date,
        # Strip the flag so trailing whitespace or a line terminator on the
        # last field cannot silently turn 'True' into False.
        is_active.strip() == 'True',
    )

In [24]:
# Apply parse_row to every line, producing an RDD of typed tuples.
parsed_rdd = data_rdd.map(parse_row)

In [25]:
# Show the parsed records: ids are ints and the last field is a bool.
parsed_rdd.collect()

                                                                                

[(0, 'Customer_0', 'Bangalore', 'Karnataka', 'India', '2023-11-11', True),
 (1, 'Customer_1', 'Hyderabad', 'Delhi', 'India', '2023-08-26', True),
 (2, 'Customer_2', 'Ahmedabad', 'West Bengal', 'India', '2023-06-23', True),
 (3, 'Customer_3', 'Bangalore', 'Tamil Nadu', 'India', '2023-03-24', False),
 (4, 'Customer_4', 'Bangalore', 'Gujarat', 'India', '2023-06-06', False),
 (5, 'Customer_5', 'Delhi', 'Maharashtra', 'India', '2023-04-19', False)]

Advanced RDD Operations

In [26]:
# Extract fields with map() - customer name and city

In [27]:
# Project each record down to a (name, city) pair and peek at the first one.
name_city_rdd = parsed_rdd.map(lambda customer: (customer[1], customer[2]))
name_city_rdd.first()

('Customer_0', 'Bangalore')

In [28]:
# Keep only the active customers: filter() retains the rows for which the
# predicate is truthy, and row[6] is already a bool produced by parse_row.
active_cutomers = parsed_rdd.filter(lambda row: row[6])

In [29]:
# Show the records whose is_active flag is True.
active_cutomers.collect()

[(0, 'Customer_0', 'Bangalore', 'Karnataka', 'India', '2023-11-11', True),
 (1, 'Customer_1', 'Hyderabad', 'Delhi', 'India', '2023-08-26', True),
 (2, 'Customer_2', 'Ahmedabad', 'West Bengal', 'India', '2023-06-23', True)]

In [30]:
# distinct() is a transformation that removes duplicate elements;
# applied to the city column it yields each city once.
city_column = parsed_rdd.map(lambda customer: customer[2])
cities_rdd = city_column.distinct()

In [31]:
# List the distinct cities.
cities_rdd.collect()

                                                                                

['Hyderabad', 'Delhi', 'Bangalore', 'Ahmedabad']

In [36]:
# take(n) returns the first n elements as a list; here only 1 city
cities_rdd.take(1)

['Hyderabad']

In [37]:
# Same action with n=2: the first two cities.
cities_rdd.take(2)

['Hyderabad', 'Delhi']

In [40]:
# reduceByKey is a transformation: it combines the values of each key using
# an associative reduce function. Transformations can be chained, so the
# (city, 1) pairs are built and summed in a single expression.
customers_per_city = (
    parsed_rdd
    .map(lambda customer: (customer[2], 1))
    .reduceByKey(lambda left, right: left + right)
)

In [41]:
# Show the (city, customer count) pairs.
customers_per_city.collect()

[('Hyderabad', 1), ('Delhi', 1), ('Bangalore', 3), ('Ahmedabad', 1)]

In [45]:
# countByValue() is an action: it returns the number of occurrences of each
# distinct value as a dictionary, rather than producing another RDD.
cust_per_city = parsed_rdd.map(lambda row : row[2]).countByValue()

In [46]:
# The result is a plain Python mapping of city -> count, so no collect() is needed.
cust_per_city

defaultdict(int, {'Bangalore': 3, 'Hyderabad': 1, 'Ahmedabad': 1, 'Delhi': 1})

### Combine more operations - reduceByKey is a transformation while countByValue is an action.

In [48]:
# Cities that have at least one active customer: keep the active rows,
# project the city field, then de-duplicate.
active_rows = parsed_rdd.filter(lambda customer: customer[6])
active_cities = active_rows.map(lambda customer: customer[2]).distinct()

active_cities.collect()

['Hyderabad', 'Bangalore', 'Ahmedabad']

In [50]:
# Count active customers by state: keep the active rows, emit a (state, 1)
# pair for each of them, then sum the ones per state.
active_customers = (
    parsed_rdd
    .filter(lambda customer: customer[6])
    .map(lambda customer: (customer[3], 1))
    .reduceByKey(lambda left, right: left + right)
)

active_customers.collect()

[('Delhi', 1), ('Karnataka', 1), ('West Bengal', 1)]

### map : https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.RDD.map.html
### filter : https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.RDD.filter.html
### reduceByKey : https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.RDD.reduceByKey.html

In [51]:
# Stop the Spark session now that the notebook is finished.
spark.stop()