# Apache Spark

## Import Spark's high-level API (SQL)

In [1]:
from pyspark.sql import SparkSession

## Create Spark Session

In [2]:
# Build the SparkSession named 'NewStart' (used further down for both the DataFrame/SQL API
# and, via spark.sparkContext, the RDD API). getOrCreate() hands back an already-running
# session when there is one, which is what the warning printed below reports.
spark = SparkSession.builder.appName('NewStart').getOrCreate()

25/08/23 16:45:34 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
spark

In [5]:
# Request a session named 'WordCountDemo' with YARN as the master. This is the same
# SparkSession API as the cell above, not a lower-level one; the RDD API is reached
# later in the notebook through spark.sparkContext.
# NOTE(review): if a session is already running, getOrCreate() presumably returns it and the
# appName/master given here may not take effect — confirm.
spark = SparkSession.builder.appName('WordCountDemo').master('yarn').getOrCreate()

In [3]:
spark

## Local Data

In [4]:
# In-memory sample sentences, one string per line of text.
# NOTE(review): 'vegeta' (lowercase) and 'Freiza' differ from the 'Vegeta' / 'Frieza' spellings
# used in the other entries, so a case-sensitive word count would treat them as separate
# words — confirm whether that is intended.
data= ["Goku vegeta Gohan",
       "Goku Frieza Goku",
       "Vegeta Goku Freiza Gohan",
       "Gohan Frieza Goku Goku"]

## Load local data into Spark RDD

In [5]:
rdd = spark.sparkContext.parallelize(data)

In [6]:
rdd.collect()

['Goku vegeta Gohan',
 'Goku Frieza Goku',
 'Vegeta Goku Freiza Gohan',
 'Gohan Frieza Goku Goku']

## Run HDFS Commands in PySpark

In [7]:
hdfs_path = '/tmp/input.txt'

In [8]:
!hadoop fs -ls /tmp/

Found 7 items
drwxr-xr-x   - root            hadoop          0 2025-08-22 17:03 /tmp/active_cities.csv
-rw-r--r--   2 sekar_dhana8644 hadoop   10528211 2025-08-21 18:37 /tmp/customers_10mb.csv
-rw-r--r--   2 sekar_dhana8644 hadoop    1060750 2025-08-21 18:37 /tmp/customers_1mb.csv
-rw-r--r--   2 sekar_dhana8644 hadoop       5488 2025-08-21 18:40 /tmp/first_100_customers.csv
drwxrwxrwt   - hdfs            hadoop          0 2025-08-13 21:45 /tmp/hadoop-yarn
drwx-wx-wx   - hive            hadoop          0 2025-08-13 21:45 /tmp/hive
-rw-r--r--   2 root            hadoop         83 2025-08-19 13:33 /tmp/input.txt


In [9]:
!hadoop fs -cat /tmp/input.txt

Goku Vegeta Gohan
Goku Frieza Goku
Vegeta Goku Frieza Gohan
Gohan Frieza Goku Goku


## Load File from HDFS to Spark RDD

In [10]:
rdd_data = spark.sparkContext.textFile(hdfs_path)

In [11]:
rdd_data.collect()

                                                                                

['Goku Vegeta Gohan',
 'Goku Frieza Goku',
 'Vegeta Goku Frieza Gohan',
 'Gohan Frieza Goku Goku']

## Perform functions

In [12]:
rdd_map = rdd_data.map(lambda line: line.split(' '))

In [13]:
rdd_map.collect()

                                                                                

[['Goku', 'Vegeta', 'Gohan'],
 ['Goku', 'Frieza', 'Goku'],
 ['Vegeta', 'Goku', 'Frieza', 'Gohan'],
 ['Gohan', 'Frieza', 'Goku', 'Goku']]

In [14]:
# Keep only the token lists that contain the exact word 'Gohan'.
rdd_filter = rdd_map.filter(lambda tokens: 'Gohan' in tokens)

In [15]:
rdd_filter.collect()

[['Goku', 'Vegeta', 'Gohan'],
 ['Vegeta', 'Goku', 'Frieza', 'Gohan'],
 ['Gohan', 'Frieza', 'Goku', 'Goku']]

In [16]:
word = rdd_data.flatMap(lambda line:line.split(' '))

In [17]:
word.collect()

['Goku',
 'Vegeta',
 'Gohan',
 'Goku',
 'Frieza',
 'Goku',
 'Vegeta',
 'Goku',
 'Frieza',
 'Gohan',
 'Gohan',
 'Frieza',
 'Goku',
 'Goku']

In [18]:
# Pair every word with a count of 1 so the counts can be summed per word afterwards.
word_map = word.map(lambda token: (token, 1))

In [19]:
word_map.collect()

[('Goku', 1),
 ('Vegeta', 1),
 ('Gohan', 1),
 ('Goku', 1),
 ('Frieza', 1),
 ('Goku', 1),
 ('Vegeta', 1),
 ('Goku', 1),
 ('Frieza', 1),
 ('Gohan', 1),
 ('Gohan', 1),
 ('Frieza', 1),
 ('Goku', 1),
 ('Goku', 1)]

In [20]:
result = word_map.reduceByKey(lambda a, b : a+b)

In [21]:
result.collect()

[('Goku', 6), ('Vegeta', 2), ('Gohan', 3), ('Frieza', 3)]

### Combine many functions into a single executable line

In [22]:
# The whole word count expressed as one chained pipeline.
word_count = (
    rdd_data
    .flatMap(lambda text: text.split(' '))            # one record per word
    .map(lambda token: (token, 1))                    # pair each word with a count of 1
    .reduceByKey(lambda left, right: left + right)    # add up the counts for each word
)

In [23]:
word_count.collect()

[('Goku', 6), ('Vegeta', 2), ('Gohan', 3), ('Frieza', 3)]

In [18]:
# Stop the current session to release its resources ...
spark.stop()
# ... and immediately start a fresh one: the sections that follow keep using `spark`
# (spark.sparkContext, spark.read, spark.sql), which would fail on a stopped session
# when the notebook is run top to bottom.
spark = SparkSession.builder.appName('NewStart').getOrCreate()

## Load HDFS file into Spark

In [24]:
rdd = spark.sparkContext.textFile('/tmp/customers_1mb.csv')

In [25]:
rdd.first()

'customer_id,name,city,state,country,registration_date,is_active'

## Local Data to Spark RDD

In [26]:
customer_data = ['Customer_id, Name, City, State, Country, Reigstration_Date, is_Active',
                '0, Customer_0, Bangalore, Karnataka, India, 2023-11-02, True',
                '1, Customer_1, Hydrabad, Telungana, India, 2023-08-26, True',
                '2, Customer_2, Ahmedabad, Delhi, India, 2023-06-23, True',
                '3, Customer_3, Mumbai, Maharashtra, India, 2023-03-21, False',
                '4, Customer_4, Chennai, TamilNadu, India, 2023-01-31, False',
                '5, Customer_5, Trivandram, Kerala, India, 2023-04-19, False']


In [27]:
rdd = spark.sparkContext.parallelize(customer_data)

In [28]:
header = rdd.first()

In [29]:
header

'Customer_id, Name, City, State, Country, Reigstration_Date, is_Active'

## Remove Header

In [30]:
rdd = rdd.filter(lambda row:row != header)

In [31]:
rdd.collect()

['0, Customer_0, Bangalore, Karnataka, India, 2023-11-02, True',
 '1, Customer_1, Hydrabad, Telungana, India, 2023-08-26, True',
 '2, Customer_2, Ahmedabad, Delhi, India, 2023-06-23, True',
 '3, Customer_3, Mumbai, Maharashtra, India, 2023-03-21, False',
 '4, Customer_4, Chennai, TamilNadu, India, 2023-01-31, False',
 '5, Customer_5, Trivandram, Kerala, India, 2023-04-19, False']

## Split rows and parse column data

In [32]:
def parsed_row(row):
    """Turn one comma-separated customer line into a typed 7-element tuple.

    The line is split on ',' and surrounding whitespace is removed from each
    field. Field 0 is converted to ``int``, fields 1-5 are kept as strings and
    field 6 becomes ``True`` only when its text is exactly ``'True'``.
    Any fields beyond the seventh are ignored.
    """
    fields = [field.strip() for field in row.split(',')]
    customer_id = int(fields[0])
    is_active = fields[6] == 'True'
    return (customer_id, *fields[1:6], is_active)

In [33]:
parsed_data = rdd.map(parsed_row)

In [34]:
parsed_data.collect()

[(0, 'Customer_0', 'Bangalore', 'Karnataka', 'India', '2023-11-02', True),
 (1, 'Customer_1', 'Hydrabad', 'Telungana', 'India', '2023-08-26', True),
 (2, 'Customer_2', 'Ahmedabad', 'Delhi', 'India', '2023-06-23', True),
 (3, 'Customer_3', 'Mumbai', 'Maharashtra', 'India', '2023-03-21', False),
 (4, 'Customer_4', 'Chennai', 'TamilNadu', 'India', '2023-01-31', False),
 (5, 'Customer_5', 'Trivandram', 'Kerala', 'India', '2023-04-19', False)]

## RDD Operations

In [35]:
name_city_rdd = parsed_data.map(lambda row:(row[1], row[2]))

In [36]:
name_city_rdd.first()

('Customer_0', 'Bangalore')

In [37]:
# Keep only the active customers. Index 6 is the is_active flag, which parsed_row already
# produces as a bool, so it is used directly as the predicate instead of `== True`
# (the same form the later active_cities / active_customers_by_state cells use).
active_rdd = parsed_data.filter(lambda row: row[6])

In [38]:
active_rdd.collect()

[(0, 'Customer_0', 'Bangalore', 'Karnataka', 'India', '2023-11-02', True),
 (1, 'Customer_1', 'Hydrabad', 'Telungana', 'India', '2023-08-26', True),
 (2, 'Customer_2', 'Ahmedabad', 'Delhi', 'India', '2023-06-23', True)]

In [39]:
cities_rdd = parsed_data.map(lambda row:row[2]).distinct()

In [40]:
cities_rdd.collect()

['Hydrabad', 'Mumbai', 'Trivandram', 'Bangalore', 'Ahmedabad', 'Chennai']

In [41]:
cities_rdd.take(1)

['Hydrabad']

In [42]:
customers_per_city = parsed_data.map(lambda row: (row[2], 1)).reduceByKey(lambda x, y: x+y)

In [43]:
customers_per_city.collect()

[('Hydrabad', 1),
 ('Mumbai', 1),
 ('Trivandram', 1),
 ('Bangalore', 1),
 ('Ahmedabad', 1),
 ('Chennai', 1)]

In [44]:
customers_per_country = parsed_data.map(lambda row: (row[4], 1)).reduceByKey(lambda x, y: x+y)

In [45]:
customers_per_country.collect()

[('India', 6)]

In [46]:
parsed_data.map(lambda row:row[4]).countByValue()

defaultdict(int, {'India': 6})

# Combine more operations

In [47]:
active_cities = parsed_data.filter(lambda row: row[6]).map(lambda row: row[2]).distinct()

In [48]:
active_cities.collect()

['Hydrabad', 'Bangalore', 'Ahmedabad']

In [49]:
active_customers_by_state = parsed_data.filter(lambda row:row[6]).map(lambda row:(row[3],1)).reduceByKey(lambda x, y:x+y)

In [50]:
active_customers_by_state.collect()

[('Delhi', 1), ('Karnataka', 1), ('Telungana', 1)]

In [51]:
# Write the distinct active cities out as text.
# NOTE(review): the path is relative, so where the output lands presumably depends on the
# cluster's default filesystem / user directory — confirm it is the intended location.
# NOTE(review): re-running this cell presumably fails if the output already exists — confirm.
active_cities.saveAsTextFile('active_cities.csv')

# ReduceByKey vs GroupByKey

In [52]:
!hadoop fs -ls /tmp/

Found 7 items
drwxr-xr-x   - root            hadoop          0 2025-08-22 17:03 /tmp/active_cities.csv
-rw-r--r--   2 sekar_dhana8644 hadoop   10528211 2025-08-21 18:37 /tmp/customers_10mb.csv
-rw-r--r--   2 sekar_dhana8644 hadoop    1060750 2025-08-21 18:37 /tmp/customers_1mb.csv
-rw-r--r--   2 sekar_dhana8644 hadoop       5488 2025-08-21 18:40 /tmp/first_100_customers.csv
drwxrwxrwt   - hdfs            hadoop          0 2025-08-13 21:45 /tmp/hadoop-yarn
drwx-wx-wx   - hive            hadoop          0 2025-08-13 21:45 /tmp/hive
-rw-r--r--   2 root            hadoop         83 2025-08-19 13:33 /tmp/input.txt


In [53]:
hdfs_path = '/tmp/customers_1mb.csv'

In [54]:
!hadoop fs -head /tmp/customers_1mb.csv

customer_id,name,city,state,country,registration_date,is_active
0,Customer_0,Pune,Maharashtra,India,2023-06-29,False
1,Customer_1,Bangalore,Tamil Nadu,India,2023-12-07,True
2,Customer_2,Hyderabad,Gujarat,India,2023-10-27,True
3,Customer_3,Bangalore,Karnataka,India,2023-10-17,False
4,Customer_4,Ahmedabad,Karnataka,India,2023-03-14,False
5,Customer_5,Hyderabad,Karnataka,India,2023-07-28,False
6,Customer_6,Pune,Delhi,India,2023-08-29,False
7,Customer_7,Ahmedabad,West Bengal,India,2023-12-28,True
8,Customer_8,Pune,Karnataka,India,2023-06-22,True
9,Customer_9,Mumbai,Telangana,India,2023-01-05,True
10,Customer_10,Pune,Gujarat,India,2023-08-05,True
11,Customer_11,Delhi,West Bengal,India,2023-08-02,False
12,Customer_12,Chennai,Gujarat,India,2023-11-21,False
13,Customer_13,Chennai,Karnataka,India,2023-11-06,True
14,Customer_14,Hyderabad,Tamil Nadu,India,2023-02-07,False
15,Customer_15,Mumbai,Gujarat,India,2023-03-02,True
16,Customer_16,Chennai,Karnataka,India,2023-04-05,False
17,Customer_17,Hyd

In [55]:
rdd_data = spark.sparkContext.textFile(hdfs_path)

In [56]:
header = rdd_data.first()

In [57]:
# Drop every line equal to the header line held in `header`, then split each remaining
# line into its comma-separated fields.
rdd_no_header = (
    rdd_data
    .filter(lambda line: line != header)
    .map(lambda line: line.split(','))
)

In [58]:
rdd_no_header.take(1)

[['0', 'Customer_0', 'Pune', 'Maharashtra', 'India', '2023-06-29', 'False']]

In [59]:
reduce_rdd = rdd_no_header.map(lambda row : (row[2], 1)).reduceByKey(lambda x, y : x+y)

In [60]:
reduce_rdd.collect()

[('Pune', 2243),
 ('Hyderabad', 2242),
 ('Mumbai', 2142),
 ('Delhi', 2200),
 ('Bangalore', 2211),
 ('Ahmedabad', 2198),
 ('Chennai', 2194),
 ('Kolkata', 2223)]

In [61]:
# Build (city, 1) pairs and group them by city. Unlike the reduceByKey version, which yields
# one summed count per city, each key here is paired with an iterable of all its 1s
# (displayed as ResultIterable objects when collected), so the values still have to be
# counted in a separate step.
group_by_rdd = rdd_no_header.map(lambda row : (row[2], 1)).groupByKey()

In [62]:
group_by_rdd.collect()

[('Pune', <pyspark.resultiterable.ResultIterable at 0x7f858550c350>),
 ('Hyderabad', <pyspark.resultiterable.ResultIterable at 0x7f858551c390>),
 ('Mumbai', <pyspark.resultiterable.ResultIterable at 0x7f858550c890>),
 ('Delhi', <pyspark.resultiterable.ResultIterable at 0x7f85859ccc50>),
 ('Bangalore', <pyspark.resultiterable.ResultIterable at 0x7f858550d390>),
 ('Ahmedabad', <pyspark.resultiterable.ResultIterable at 0x7f85859f3110>),
 ('Chennai', <pyspark.resultiterable.ResultIterable at 0x7f85857042d0>),
 ('Kolkata', <pyspark.resultiterable.ResultIterable at 0x7f8585524490>)]

In [63]:
group_by_result = group_by_rdd.map(lambda row : (row[0], len(row[1])))

In [64]:
group_by_result.collect()

[('Pune', 2243),
 ('Hyderabad', 2242),
 ('Mumbai', 2142),
 ('Delhi', 2200),
 ('Bangalore', 2211),
 ('Ahmedabad', 2198),
 ('Chennai', 2194),
 ('Kolkata', 2223)]

# Partitions, Repartitions, Coalesce

In [6]:
rdd.getNumPartitions()

2

In [7]:
repartition_rdd = rdd.repartition(4)

In [8]:
repartition_rdd.getNumPartitions()

4

In [9]:
repartition_rdd_less = repartition_rdd.repartition(1)

In [10]:
repartition_rdd_less.getNumPartitions()

1

In [12]:
# Shrink the 4-partition RDD down to a single partition using coalesce instead of repartition.
# NOTE(review): per the Spark documentation coalesce avoids a full shuffle by default, whereas
# repartition always shuffles — confirm for the Spark version in use.
coalesce_rdd = repartition_rdd.coalesce(1)

In [13]:
coalesce_rdd.getNumPartitions()

1

## Spark DataFrame

In [4]:
# Configure a CSV reader that treats the first line as column names and infers column
# types from the data, then load the customers file from HDFS with it.
csv_reader = spark.read.format('csv').options(header='true', inferschema='true')
df = csv_reader.load('/tmp/customers_1mb.csv')

                                                                                

In [5]:
df.show(5)

+-----------+----------+---------+-----------+-------+-----------------+---------+
|customer_id|      name|     city|      state|country|registration_date|is_active|
+-----------+----------+---------+-----------+-------+-----------------+---------+
|          0|Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2|Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          3|Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    false|
|          4|Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    false|
+-----------+----------+---------+-----------+-------+-----------------+---------+
only showing top 5 rows



In [6]:
df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: date (nullable = true)
 |-- is_active: boolean (nullable = true)



## Spark table

In [7]:
df.createOrReplaceTempView('customers')

In [8]:
result = spark.sql('Select city, count(*) from customers group by city')

In [9]:
result.show()

+---------+--------+
|     city|count(1)|
+---------+--------+
|    Delhi|    2200|
|  Kolkata|    2223|
|Hyderabad|    2242|
|Bangalore|    2211|
|Ahmedabad|    2198|
|  Chennai|    2194|
|   Mumbai|    2142|
|     Pune|    2243|
+---------+--------+



## DataFrame Basics

In [10]:
data = [(0, 'Customer_0', 'Bangalore', 'Karnataka', 'India', '2023-11-12', True),
       (1, 'Customer_1', 'Chennai', 'TamilNadu', 'India', '2023-08-26', True),
       (2, 'Customer_2', 'Trivandram', 'Kerala', 'India', '2023-03-31', False)]

In [11]:
columns = ['Customer_Id', 'Name', 'City', 'State', 'Country', 'Registration_Date', 'Is_Active']

In [12]:
df = spark.createDataFrame(data, columns)

In [13]:
df.show()

[Stage 6:>                                                          (0 + 1) / 1]

+-----------+----------+----------+---------+-------+-----------------+---------+
|Customer_Id|      Name|      City|    State|Country|Registration_Date|Is_Active|
+-----------+----------+----------+---------+-------+-----------------+---------+
|          0|Customer_0| Bangalore|Karnataka|  India|       2023-11-12|     true|
|          1|Customer_1|   Chennai|TamilNadu|  India|       2023-08-26|     true|
|          2|Customer_2|Trivandram|   Kerala|  India|       2023-03-31|    false|
+-----------+----------+----------+---------+-------+-----------------+---------+



                                                                                

In [16]:
df.select('Name').show()

+----------+
|      Name|
+----------+
|Customer_0|
|Customer_1|
|Customer_2|
+----------+



### Reading Data from CSV on HDFS worker nodes

In [17]:
!hadoop fs -ls /data/

Found 1 items
-rw-r--r--   2 root hadoop       5488 2025-08-23 17:19 /data/customers_100.csv


In [20]:
df_2 = spark.read.format('csv').option('header', 'true').option('inferschema', 'true').load('/data/customers_100.csv')

In [21]:
df_2.show()

+-----------+-----------+---------+-----------+-------+-----------------+---------+
|customer_id|       name|     city|      state|country|registration_date|is_active|
+-----------+-----------+---------+-----------+-------+-----------------+---------+
|          0| Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|
|          1| Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2| Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          3| Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    false|
|          4| Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    false|
|          5| Customer_5|Hyderabad|  Karnataka|  India|       2023-07-28|    false|
|          6| Customer_6|     Pune|      Delhi|  India|       2023-08-29|    false|
|          7| Customer_7|Ahmedabad|West Bengal|  India|       2023-12-28|     true|
|          8| Customer_8|     Pune|  Karnataka|  India|       2023-06-22|   

In [22]:
df_2.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: date (nullable = true)
 |-- is_active: boolean (nullable = true)



In [23]:
active_customers = df_2.filter('is_active=true')

In [24]:
active_customers.show()

+-----------+-----------+---------+-----------+-------+-----------------+---------+
|customer_id|       name|     city|      state|country|registration_date|is_active|
+-----------+-----------+---------+-----------+-------+-----------------+---------+
|          1| Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2| Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          7| Customer_7|Ahmedabad|West Bengal|  India|       2023-12-28|     true|
|          8| Customer_8|     Pune|  Karnataka|  India|       2023-06-22|     true|
|          9| Customer_9|   Mumbai|  Telangana|  India|       2023-01-05|     true|
|         10|Customer_10|     Pune|    Gujarat|  India|       2023-08-05|     true|
|         13|Customer_13|  Chennai|  Karnataka|  India|       2023-11-06|     true|
|         15|Customer_15|   Mumbai|    Gujarat|  India|       2023-03-02|     true|
|         18|Customer_18|     Pune|      Delhi|  India|       2023-10-04|   

In [None]:
selected_c