# Apache Saprk

## Import spark as low level and high level API (SQL)

In [1]:
from pyspark.sql import SparkSession

## Create Spark Session

In [2]:
spark = SparkSession.builder.appName('NewStart').getOrCreate() # Higher level API Spark Session (SQL API)

25/08/23 16:45:34 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
spark

In [5]:
spark = SparkSession.builder.appName('WordCountDemo').master('yarn').getOrCreate() # Lower level API Spark Session (yarn)

In [3]:
spark

## Local Data

In [4]:
data= ["Goku vegeta Gohan",
       "Goku Frieza Goku",
       "Vegeta Goku Freiza Gohan",
       "Gohan Frieza Goku Goku"]   # Defining Data

## Load local data into Spark RDD

In [5]:
rdd = spark.sparkContext.parallelize(data)

In [6]:
rdd.collect()

['Goku vegeta Gohan',
 'Goku Frieza Goku',
 'Vegeta Goku Freiza Gohan',
 'Gohan Frieza Goku Goku']

## Run HDFS Comment in PySpark

In [8]:
!hadoop fs -ls /tmp/

Found 7 items
drwxr-xr-x   - root            hadoop          0 2025-08-22 17:03 /tmp/active_cities.csv
-rw-r--r--   2 sekar_dhana8644 hadoop   10528211 2025-08-21 18:37 /tmp/customers_10mb.csv
-rw-r--r--   2 sekar_dhana8644 hadoop    1060750 2025-08-21 18:37 /tmp/customers_1mb.csv
-rw-r--r--   2 sekar_dhana8644 hadoop       5488 2025-08-21 18:40 /tmp/first_100_customers.csv
drwxrwxrwt   - hdfs            hadoop          0 2025-08-13 21:45 /tmp/hadoop-yarn
drwx-wx-wx   - hive            hadoop          0 2025-08-13 21:45 /tmp/hive
-rw-r--r--   2 root            hadoop         83 2025-08-19 13:33 /tmp/input.txt


In [9]:
!hadoop fs -cat /tmp/input.txt

Goku Vegeta Gohan
Goku Frieza Goku
Vegeta Goku Frieza Gohan
Gohan Frieza Goku Goku


## Load File from HDFS to Spark RDD

In [7]:
hdfs_path = '/tmp/input.txt'

In [10]:
rdd_data = spark.sparkContext.textFile(hdfs_path)

In [11]:
rdd_data.collect()

                                                                                

['Goku Vegeta Gohan',
 'Goku Frieza Goku',
 'Vegeta Goku Frieza Gohan',
 'Gohan Frieza Goku Goku']

### Perform functions

In [12]:
rdd_map = rdd_data.map(lambda line: line.split(' '))

In [13]:
rdd_map.collect()

                                                                                

[['Goku', 'Vegeta', 'Gohan'],
 ['Goku', 'Frieza', 'Goku'],
 ['Vegeta', 'Goku', 'Frieza', 'Gohan'],
 ['Gohan', 'Frieza', 'Goku', 'Goku']]

In [14]:
rdd_filter = rdd_map.filter(lambda words:'Gohan'in words)

In [15]:
rdd_filter.collect()

[['Goku', 'Vegeta', 'Gohan'],
 ['Vegeta', 'Goku', 'Frieza', 'Gohan'],
 ['Gohan', 'Frieza', 'Goku', 'Goku']]

In [16]:
word = rdd_data.flatMap(lambda line:line.split(' '))

In [17]:
word.collect()

['Goku',
 'Vegeta',
 'Gohan',
 'Goku',
 'Frieza',
 'Goku',
 'Vegeta',
 'Goku',
 'Frieza',
 'Gohan',
 'Gohan',
 'Frieza',
 'Goku',
 'Goku']

In [18]:
word_map = word.map(lambda word:(word,1))

In [19]:
word_map.collect()

[('Goku', 1),
 ('Vegeta', 1),
 ('Gohan', 1),
 ('Goku', 1),
 ('Frieza', 1),
 ('Goku', 1),
 ('Vegeta', 1),
 ('Goku', 1),
 ('Frieza', 1),
 ('Gohan', 1),
 ('Gohan', 1),
 ('Frieza', 1),
 ('Goku', 1),
 ('Goku', 1)]

In [20]:
result = word_map.reduceByKey(lambda a, b : a+b)

In [21]:
result.collect()

[('Goku', 6), ('Vegeta', 2), ('Gohan', 3), ('Frieza', 3)]

### Combine many function into single exe line

In [22]:
word_count = rdd_data.flatMap(lambda line:line.split(' ')).map(lambda word: (word,1)).reduceByKey(lambda a,b : a+b)

In [23]:
word_count.collect()

[('Goku', 6), ('Vegeta', 2), ('Gohan', 3), ('Frieza', 3)]

In [18]:
spark.stop()

## Load file from HDFS into Spark

In [24]:
rdd = spark.sparkContext.textFile('/tmp/customers_1mb.csv')

In [25]:
rdd.first()

'customer_id,name,city,state,country,registration_date,is_active'

## Local Data to Spark RDD

In [26]:
customer_data = ['Customer_id, Name, City, State, Country, Reigstration_Date, is_Active',
                '0, Customer_0, Bangalore, Karnataka, India, 2023-11-02, True',
                '1, Customer_1, Hydrabad, Telungana, India, 2023-08-26, True',
                '2, Customer_2, Ahmedabad, Delhi, India, 2023-06-23, True',
                '3, Customer_3, Mumbai, Maharashtra, India, 2023-03-21, False',
                '4, Customer_4, Chennai, TamilNadu, India, 2023-01-31, False',
                '5, Customer_5, Trivandram, Kerala, India, 2023-04-19, False']


In [27]:
rdd = spark.sparkContext.parallelize(customer_data)

In [28]:
header = rdd.first()

In [29]:
header

'Customer_id, Name, City, State, Country, Reigstration_Date, is_Active'

### Remove Header

In [30]:
rdd = rdd.filter(lambda row:row != header)

In [31]:
rdd.collect()

['0, Customer_0, Bangalore, Karnataka, India, 2023-11-02, True',
 '1, Customer_1, Hydrabad, Telungana, India, 2023-08-26, True',
 '2, Customer_2, Ahmedabad, Delhi, India, 2023-06-23, True',
 '3, Customer_3, Mumbai, Maharashtra, India, 2023-03-21, False',
 '4, Customer_4, Chennai, TamilNadu, India, 2023-01-31, False',
 '5, Customer_5, Trivandram, Kerala, India, 2023-04-19, False']

### Spilt rows and parse column data

In [32]:
def parsed_row(row):
    data = row.split(',')
    return (int(data[0]),data[1].strip(),data[2].strip(),data[3].strip(),data[4].strip(),data[5].strip(),data[6].strip()=='True')

In [33]:
parsed_data = rdd.map(parsed_row)

In [34]:
parsed_data.collect()

[(0, 'Customer_0', 'Bangalore', 'Karnataka', 'India', '2023-11-02', True),
 (1, 'Customer_1', 'Hydrabad', 'Telungana', 'India', '2023-08-26', True),
 (2, 'Customer_2', 'Ahmedabad', 'Delhi', 'India', '2023-06-23', True),
 (3, 'Customer_3', 'Mumbai', 'Maharashtra', 'India', '2023-03-21', False),
 (4, 'Customer_4', 'Chennai', 'TamilNadu', 'India', '2023-01-31', False),
 (5, 'Customer_5', 'Trivandram', 'Kerala', 'India', '2023-04-19', False)]

## RDD Operations

In [35]:
name_city_rdd = parsed_data.map(lambda row:(row[1], row[2]))

In [36]:
name_city_rdd.first()

('Customer_0', 'Bangalore')

In [37]:
active_rdd = parsed_data.filter(lambda row:row[6]==True)

In [38]:
active_rdd.collect()

[(0, 'Customer_0', 'Bangalore', 'Karnataka', 'India', '2023-11-02', True),
 (1, 'Customer_1', 'Hydrabad', 'Telungana', 'India', '2023-08-26', True),
 (2, 'Customer_2', 'Ahmedabad', 'Delhi', 'India', '2023-06-23', True)]

In [39]:
cities_rdd = parsed_data.map(lambda row:row[2]).distinct()

In [40]:
cities_rdd.collect()

['Hydrabad', 'Mumbai', 'Trivandram', 'Bangalore', 'Ahmedabad', 'Chennai']

In [41]:
cities_rdd.take(1)

['Hydrabad']

In [42]:
customers_per_city = parsed_data.map(lambda row: (row[2], 1)).reduceByKey(lambda x, y: x+y)

In [43]:
customers_per_city.collect()

[('Hydrabad', 1),
 ('Mumbai', 1),
 ('Trivandram', 1),
 ('Bangalore', 1),
 ('Ahmedabad', 1),
 ('Chennai', 1)]

In [44]:
customers_per_country = parsed_data.map(lambda row: (row[4], 1)).reduceByKey(lambda x, y: x+y)

In [45]:
customers_per_country.collect()

[('India', 6)]

In [46]:
parsed_data.map(lambda row:row[4]).countByValue()

defaultdict(int, {'India': 6})

## Combine more operations

In [47]:
active_cities = parsed_data.filter(lambda row: row[6]).map(lambda row: row[2]).distinct()

In [48]:
active_cities.collect()

['Hydrabad', 'Bangalore', 'Ahmedabad']

In [49]:
active_customers_by_state = parsed_data.filter(lambda row:row[6]).map(lambda row:(row[3],1)).reduceByKey(lambda x, y:x+y)

In [50]:
active_customers_by_state.collect()

[('Delhi', 1), ('Karnataka', 1), ('Telungana', 1)]

In [51]:
active_cities.saveAsTextFile('active_cities.csv')

## ReduceByKey vs GroupByKey

In [52]:
!hadoop fs -ls /tmp/

Found 7 items
drwxr-xr-x   - root            hadoop          0 2025-08-22 17:03 /tmp/active_cities.csv
-rw-r--r--   2 sekar_dhana8644 hadoop   10528211 2025-08-21 18:37 /tmp/customers_10mb.csv
-rw-r--r--   2 sekar_dhana8644 hadoop    1060750 2025-08-21 18:37 /tmp/customers_1mb.csv
-rw-r--r--   2 sekar_dhana8644 hadoop       5488 2025-08-21 18:40 /tmp/first_100_customers.csv
drwxrwxrwt   - hdfs            hadoop          0 2025-08-13 21:45 /tmp/hadoop-yarn
drwx-wx-wx   - hive            hadoop          0 2025-08-13 21:45 /tmp/hive
-rw-r--r--   2 root            hadoop         83 2025-08-19 13:33 /tmp/input.txt


In [53]:
hdfs_path = '/tmp/customers_1mb.csv'

In [54]:
!hadoop fs -head /tmp/customers_1mb.csv

customer_id,name,city,state,country,registration_date,is_active
0,Customer_0,Pune,Maharashtra,India,2023-06-29,False
1,Customer_1,Bangalore,Tamil Nadu,India,2023-12-07,True
2,Customer_2,Hyderabad,Gujarat,India,2023-10-27,True
3,Customer_3,Bangalore,Karnataka,India,2023-10-17,False
4,Customer_4,Ahmedabad,Karnataka,India,2023-03-14,False
5,Customer_5,Hyderabad,Karnataka,India,2023-07-28,False
6,Customer_6,Pune,Delhi,India,2023-08-29,False
7,Customer_7,Ahmedabad,West Bengal,India,2023-12-28,True
8,Customer_8,Pune,Karnataka,India,2023-06-22,True
9,Customer_9,Mumbai,Telangana,India,2023-01-05,True
10,Customer_10,Pune,Gujarat,India,2023-08-05,True
11,Customer_11,Delhi,West Bengal,India,2023-08-02,False
12,Customer_12,Chennai,Gujarat,India,2023-11-21,False
13,Customer_13,Chennai,Karnataka,India,2023-11-06,True
14,Customer_14,Hyderabad,Tamil Nadu,India,2023-02-07,False
15,Customer_15,Mumbai,Gujarat,India,2023-03-02,True
16,Customer_16,Chennai,Karnataka,India,2023-04-05,False
17,Customer_17,Hyd

In [55]:
rdd_data = spark.sparkContext.textFile(hdfs_path)

In [56]:
header = rdd_data.first()

In [57]:
rdd_no_header = rdd_data.filter(lambda row: row != header).map(lambda row : row.split(','))

In [58]:
rdd_no_header.take(1)

[['0', 'Customer_0', 'Pune', 'Maharashtra', 'India', '2023-06-29', 'False']]

In [59]:
reduce_rdd = rdd_no_header.map(lambda row : (row[2], 1)).reduceByKey(lambda x, y : x+y)

In [60]:
reduce_rdd.collect()

[('Pune', 2243),
 ('Hyderabad', 2242),
 ('Mumbai', 2142),
 ('Delhi', 2200),
 ('Bangalore', 2211),
 ('Ahmedabad', 2198),
 ('Chennai', 2194),
 ('Kolkata', 2223)]

In [61]:
group_by_rdd = rdd_no_header.map(lambda row : (row[2], 1)).groupByKey()

In [62]:
group_by_rdd.collect()

[('Pune', <pyspark.resultiterable.ResultIterable at 0x7f858550c350>),
 ('Hyderabad', <pyspark.resultiterable.ResultIterable at 0x7f858551c390>),
 ('Mumbai', <pyspark.resultiterable.ResultIterable at 0x7f858550c890>),
 ('Delhi', <pyspark.resultiterable.ResultIterable at 0x7f85859ccc50>),
 ('Bangalore', <pyspark.resultiterable.ResultIterable at 0x7f858550d390>),
 ('Ahmedabad', <pyspark.resultiterable.ResultIterable at 0x7f85859f3110>),
 ('Chennai', <pyspark.resultiterable.ResultIterable at 0x7f85857042d0>),
 ('Kolkata', <pyspark.resultiterable.ResultIterable at 0x7f8585524490>)]

In [63]:
group_by_result = group_by_rdd.map(lambda row : (row[0], len(row[1])))

In [64]:
group_by_result.collect()

[('Pune', 2243),
 ('Hyderabad', 2242),
 ('Mumbai', 2142),
 ('Delhi', 2200),
 ('Bangalore', 2211),
 ('Ahmedabad', 2198),
 ('Chennai', 2194),
 ('Kolkata', 2223)]

## Partitions, Repartitions, Coalesce

In [6]:
rdd.getNumPartitions()

2

In [7]:
repartition_rdd = rdd.repartition(4)

In [8]:
repartition_rdd.getNumPartitions()

4

In [9]:
repartition_rdd_less = repartition_rdd.repartition(1)

In [10]:
repartition_rdd_less.getNumPartitions()

1

In [12]:
coalesce_rdd = repartition_rdd.coalesce(1)

In [13]:
coalesce_rdd.getNumPartitions()

1

## Spark DataFrame

In [4]:
df = spark.read.format('csv').option('header', 'true').option('inferschema', 'true').load('/tmp/customers_1mb.csv')

                                                                                

In [5]:
df.show(5)

+-----------+----------+---------+-----------+-------+-----------------+---------+
|customer_id|      name|     city|      state|country|registration_date|is_active|
+-----------+----------+---------+-----------+-------+-----------------+---------+
|          0|Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2|Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          3|Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    false|
|          4|Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    false|
+-----------+----------+---------+-----------+-------+-----------------+---------+
only showing top 5 rows



In [6]:
df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: date (nullable = true)
 |-- is_active: boolean (nullable = true)



## Spark table

In [7]:
df.createOrReplaceTempView('customers')

In [8]:
result = spark.sql('Select city, count(*) from customers group by city')

In [9]:
result.show()

+---------+--------+
|     city|count(1)|
+---------+--------+
|    Delhi|    2200|
|  Kolkata|    2223|
|Hyderabad|    2242|
|Bangalore|    2211|
|Ahmedabad|    2198|
|  Chennai|    2194|
|   Mumbai|    2142|
|     Pune|    2243|
+---------+--------+



## DataFrame Basics

In [10]:
data = [(0, 'Customer_0', 'Bangalore', 'Karnataka', 'India', '2023-11-12', True),
       (1, 'Customer_1', 'Chennai', 'TamilNadu', 'India', '2023-08-26', True),
       (2, 'Customer_2', 'Trivandram', 'Kerala', 'India', '2023-03-31', False)]

In [11]:
columns = ['Customer_Id', 'Name', 'City', 'State', 'Country', 'Registration_Date', 'Is_Active']

In [12]:
df = spark.createDataFrame(data, columns)

In [13]:
df.show()

[Stage 6:>                                                          (0 + 1) / 1]

+-----------+----------+----------+---------+-------+-----------------+---------+
|Customer_Id|      Name|      City|    State|Country|Registration_Date|Is_Active|
+-----------+----------+----------+---------+-------+-----------------+---------+
|          0|Customer_0| Bangalore|Karnataka|  India|       2023-11-12|     true|
|          1|Customer_1|   Chennai|TamilNadu|  India|       2023-08-26|     true|
|          2|Customer_2|Trivandram|   Kerala|  India|       2023-03-31|    false|
+-----------+----------+----------+---------+-------+-----------------+---------+



                                                                                

In [16]:
df.select('Name').show()

+----------+
|      Name|
+----------+
|Customer_0|
|Customer_1|
|Customer_2|
+----------+



### Load file from HDFS into Spark

In [17]:
!hadoop fs -ls /data/

Found 1 items
-rw-r--r--   2 root hadoop       5488 2025-08-23 17:19 /data/customers_100.csv


In [20]:
df_2 = spark.read.format('csv').option('header', 'true').option('inferschema', 'true').load('/data/customers_100.csv')

In [21]:
df_2.show()

+-----------+-----------+---------+-----------+-------+-----------------+---------+
|customer_id|       name|     city|      state|country|registration_date|is_active|
+-----------+-----------+---------+-----------+-------+-----------------+---------+
|          0| Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|
|          1| Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2| Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          3| Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    false|
|          4| Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    false|
|          5| Customer_5|Hyderabad|  Karnataka|  India|       2023-07-28|    false|
|          6| Customer_6|     Pune|      Delhi|  India|       2023-08-29|    false|
|          7| Customer_7|Ahmedabad|West Bengal|  India|       2023-12-28|     true|
|          8| Customer_8|     Pune|  Karnataka|  India|       2023-06-22|   

In [22]:
df_2.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: date (nullable = true)
 |-- is_active: boolean (nullable = true)



In [23]:
active_customers = df_2.filter('is_active=true')

In [24]:
active_customers.show()

+-----------+-----------+---------+-----------+-------+-----------------+---------+
|customer_id|       name|     city|      state|country|registration_date|is_active|
+-----------+-----------+---------+-----------+-------+-----------------+---------+
|          1| Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2| Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          7| Customer_7|Ahmedabad|West Bengal|  India|       2023-12-28|     true|
|          8| Customer_8|     Pune|  Karnataka|  India|       2023-06-22|     true|
|          9| Customer_9|   Mumbai|  Telangana|  India|       2023-01-05|     true|
|         10|Customer_10|     Pune|    Gujarat|  India|       2023-08-05|     true|
|         13|Customer_13|  Chennai|  Karnataka|  India|       2023-11-06|     true|
|         15|Customer_15|   Mumbai|    Gujarat|  India|       2023-03-02|     true|
|         18|Customer_18|     Pune|      Delhi|  India|       2023-10-04|   

In [26]:
selected_columns = df_2.select('name', 'city', 'registration_date')

In [31]:
selected_columns.show()

+-----------+---------+-----------------+
|       name|     city|registration_date|
+-----------+---------+-----------------+
| Customer_0|     Pune|       2023-06-29|
| Customer_1|Bangalore|       2023-12-07|
| Customer_2|Hyderabad|       2023-10-27|
| Customer_3|Bangalore|       2023-10-17|
| Customer_4|Ahmedabad|       2023-03-14|
| Customer_5|Hyderabad|       2023-07-28|
| Customer_6|     Pune|       2023-08-29|
| Customer_7|Ahmedabad|       2023-12-28|
| Customer_8|     Pune|       2023-06-22|
| Customer_9|   Mumbai|       2023-01-05|
|Customer_10|     Pune|       2023-08-05|
|Customer_11|    Delhi|       2023-08-02|
|Customer_12|  Chennai|       2023-11-21|
|Customer_13|  Chennai|       2023-11-06|
|Customer_14|Hyderabad|       2023-02-07|
|Customer_15|   Mumbai|       2023-03-02|
|Customer_16|  Chennai|       2023-04-05|
|Customer_17|Hyderabad|       2023-08-21|
|Customer_18|     Pune|       2023-10-04|
|Customer_19|  Kolkata|       2023-02-05|
+-----------+---------+-----------

In [32]:
active_customers

DataFrame[customer_id: int, name: string, city: string, state: string, country: string, registration_date: date, is_active: boolean]

### Schema Enforcement

In [29]:
!hadoop fs -ls /data/

Found 1 items
-rw-r--r--   2 root hadoop       5488 2025-08-23 17:19 /data/customers_100.csv


In [46]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType, FloatType, DateType

In [47]:
schema = StructType([
         StructField('customer_id', IntegerType(), False),
         StructField('name', StringType(), False),
         StructField('city', StringType(), False),
         StructField('state', StringType(), False),
         StructField('country', StringType(), False),
         StructField('registration_date', DateType(), False),
         StructField('is_active', BooleanType(), False)
         ])

In [48]:
df = spark.read.format('csv').option('header','true').schema(schema).load('/data/customers_100.csv')

In [49]:
df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: date (nullable = true)
 |-- is_active: boolean (nullable = true)



In [50]:
df.show()

+-----------+-----------+---------+-----------+-------+-----------------+---------+
|customer_id|       name|     city|      state|country|registration_date|is_active|
+-----------+-----------+---------+-----------+-------+-----------------+---------+
|          0| Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|
|          1| Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2| Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          3| Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    false|
|          4| Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    false|
|          5| Customer_5|Hyderabad|  Karnataka|  India|       2023-07-28|    false|
|          6| Customer_6|     Pune|      Delhi|  India|       2023-08-29|    false|
|          7| Customer_7|Ahmedabad|West Bengal|  India|       2023-12-28|     true|
|          8| Customer_8|     Pune|  Karnataka|  India|       2023-06-22|   

### DDL Schema

In [51]:
ddl_schema = 'customer_id INT, name STRING, city STRING, state STRING, country STRING, registration_date DATE, is_active BOOLEAN'

In [52]:
df_ddl = spark.read.format('csv').option('header', 'true').schema(ddl_schema).load('/data/customers_100.csv')

In [53]:
df_ddl.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: date (nullable = true)
 |-- is_active: boolean (nullable = true)



In [54]:
df_ddl.show()

+-----------+-----------+---------+-----------+-------+-----------------+---------+
|customer_id|       name|     city|      state|country|registration_date|is_active|
+-----------+-----------+---------+-----------+-------+-----------------+---------+
|          0| Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|
|          1| Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2| Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          3| Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    false|
|          4| Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    false|
|          5| Customer_5|Hyderabad|  Karnataka|  India|       2023-07-28|    false|
|          6| Customer_6|     Pune|      Delhi|  India|       2023-08-29|    false|
|          7| Customer_7|Ahmedabad|West Bengal|  India|       2023-12-28|     true|
|          8| Customer_8|     Pune|  Karnataka|  India|       2023-06-22|   

## Read Modes in Spark

### FAILFAST mode

In [63]:
df_failfast = spark.read.format('csv').option('header','true').schema(schema).option('mode', 'FAILFAST').load('/data/customers_100.csv')

In [61]:
df_failfast.show(3)

+-----------+----------+---------+-----------+-------+-----------------+---------+
|customer_id|      name|     city|      state|country|registration_date|is_active|
+-----------+----------+---------+-----------+-------+-----------------+---------+
|          0|Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2|Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
+-----------+----------+---------+-----------+-------+-----------------+---------+
only showing top 3 rows



### PERMISSIVE mode

In [64]:
df_permissive = spark.read.format('csv').option('header','true').schema(schema).option('mode', 'PERMISSIVE').load('/data/customers_100.csv')

In [65]:
df_permissive.show(3)

+-----------+----------+---------+-----------+-------+-----------------+---------+
|customer_id|      name|     city|      state|country|registration_date|is_active|
+-----------+----------+---------+-----------+-------+-----------------+---------+
|          0|Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2|Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
+-----------+----------+---------+-----------+-------+-----------------+---------+
only showing top 3 rows



### DROP MALFORMED mode

In [66]:
df_drop_malformed = spark.read.format('csv').option('header','true').schema(schema).option('mode', 'DROPMALFORMED').load('/data/customers_100.csv')

In [67]:
df_drop_malformed.show(3)

+-----------+----------+---------+-----------+-------+-----------------+---------+
|customer_id|      name|     city|      state|country|registration_date|is_active|
+-----------+----------+---------+-----------+-------+-----------------+---------+
|          0|Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2|Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
+-----------+----------+---------+-----------+-------+-----------------+---------+
only showing top 3 rows



## Write Operations in Spark

In [68]:
data = [
    (1, 'Alice', 'California', '2023-01-23', True),
    (2, 'Bob', 'Texas', '2021-03-10', False),
    (3, 'Charlie', 'New Yark', '2024-12-29', True)
]

columns = ['customer_id', 'name', 'city', 'registration_date', 'is_active']

In [70]:
df = spark.createDataFrame(data, columns)

In [71]:
df.write.format('csv').option('header', 'true').save('/data/write_output.csv')

In [74]:
!hadoop fs -ls /data/write_output.csv

Found 3 items
-rw-r--r--   2 root hadoop          0 2025-08-23 19:30 /data/write_output.csv/_SUCCESS
-rw-r--r--   2 root hadoop         85 2025-08-23 19:30 /data/write_output.csv/part-00000-cb512f69-8a20-45ba-a973-f3500b25a8bd-c000.csv
-rw-r--r--   2 root hadoop        114 2025-08-23 19:30 /data/write_output.csv/part-00001-cb512f69-8a20-45ba-a973-f3500b25a8bd-c000.csv


In [75]:
df.rdd.getNumPartitions()

2

In [77]:
!hadoop fs -cat /data/write_output.csv/*

customer_id,name,city,registration_date,is_active
1,Alice,California,2023-01-23,true
customer_id,name,city,registration_date,is_active
2,Bob,Texas,2021-03-10,false
3,Charlie,New Yark,2024-12-29,true


### Repartition to 1 and write the data

In [79]:
df.repartition(1).write.format('csv').option('header', 'true').save('/data/write_output_repartition.csv')

In [80]:
!hadoop fs -ls /data/

Found 3 items
-rw-r--r--   2 root hadoop       5488 2025-08-23 17:19 /data/customers_100.csv
drwxr-xr-x   - root hadoop          0 2025-08-23 19:30 /data/write_output.csv
drwxr-xr-x   - root hadoop          0 2025-08-23 19:42 /data/write_output_repartition.csv


In [83]:
!hadoop fs -cat /data/write_output_repartition.csv/*

customer_id,name,city,registration_date,is_active
1,Alice,California,2023-01-23,true
2,Bob,Texas,2021-03-10,false
3,Charlie,New Yark,2024-12-29,true


In [84]:
df.repartition(1).write.format('csv').option('header', 'true').option('delimiter', '|').save('/data/write_output_delimiter.csv')

In [85]:
!hadoop fs -cat /data/write_output_delimiter.csv/*

customer_id|name|city|registration_date|is_active
1|Alice|California|2023-01-23|true
2|Bob|Texas|2021-03-10|false
3|Charlie|New Yark|2024-12-29|true


## Spark Operations

In [111]:
data = [
        (1, 'Alice', 25),
        (2, 'Bob', 26),
        (3, 'Charlie', 24)
]

columns = ['id', 'name', 'age']

schema = StructType([
    StructField('id', IntegerType(), False),
    StructField('name', StringType(), False),
    StructField('age', StringType(), False)
])

df = spark.createDataFrame(data, columns, schema)

df.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 26|
|  3|Charlie| 24|
+---+-------+---+



In [112]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [113]:
df.describe().show()

+-------+---+-------+----+
|summary| id|   name| age|
+-------+---+-------+----+
|  count|  3|      3|   3|
|   mean|2.0|   NULL|25.0|
| stddev|1.0|   NULL| 1.0|
|    min|  1|  Alice|  24|
|    max|  3|Charlie|  26|
+-------+---+-------+----+



In [114]:
df.select('name', 'age')

DataFrame[name: string, age: bigint]

In [115]:
df.select('name', 'age').show()

+-------+---+
|   name|age|
+-------+---+
|  Alice| 25|
|    Bob| 26|
|Charlie| 24|
+-------+---+



In [116]:
df.filter(df.age>24).show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 25|
|  2|  Bob| 26|
+---+-----+---+



In [117]:
df.where(df.name == 'Alice').show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 25|
+---+-----+---+



In [118]:
df.distinct().show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 26|
|  3|Charlie| 24|
+---+-------+---+



In [119]:
df.orderBy('age').show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  3|Charlie| 24|
|  1|  Alice| 25|
|  2|    Bob| 26|
+---+-------+---+



In [120]:
df.orderBy(df.age.desc()).show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  2|    Bob| 26|
|  1|  Alice| 25|
|  3|Charlie| 24|
+---+-------+---+



In [121]:
df.withColumn('new_age', df.age + 1).show()

+---+-------+---+-------+
| id|   name|age|new_age|
+---+-------+---+-------+
|  1|  Alice| 25|     26|
|  2|    Bob| 26|     27|
|  3|Charlie| 24|     25|
+---+-------+---+-------+



In [123]:
df.drop('age').show()

+---+-------+
| id|   name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
+---+-------+



In [125]:
df = spark.createDataFrame(data, columns, schema)

df.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 26|
|  3|Charlie| 24|
+---+-------+---+



In [126]:
df.groupBy('name').count().show()

+-------+-----+
|   name|count|
+-------+-----+
|  Alice|    1|
|Charlie|    1|
|    Bob|    1|
+-------+-----+



In [127]:
df.agg({'age':'avg'}).show()

+--------+
|avg(age)|
+--------+
|    25.0|
+--------+



In [132]:
data2 = [
    (1, 'usa'),
    (2, 'uk'),
    (3, 'india')
]

column2 = ['id', 'country']

schema2 = StructType([
    StructField('id', IntegerType(), False),
    StructField('country', StringType(), False)
])

df2 = spark.createDataFrame(data2, column2, schema2)

df2.show()

+---+-------+
| id|country|
+---+-------+
|  1|    usa|
|  2|     uk|
|  3|  india|
+---+-------+



In [135]:
joined_df = df.join(df2, 'id')

In [136]:
joined_df.show()

+---+-------+---+-------+
| id|   name|age|country|
+---+-------+---+-------+
|  1|  Alice| 25|    usa|
|  2|    Bob| 26|     uk|
|  3|Charlie| 24|  india|
+---+-------+---+-------+



In [137]:
spark.stop()

## Handling Data Types

In [3]:
data = [
    (1, 'John Doe', 'Bangalore', '2023-01-15', '123.34', 'True'),
    (2, 'Jane Smith', 'Delhi', '2023-05-20', '89.50', 'False'),
    (3, 'Robert Brown', 'Mumbai', 'InvalidDate', '200.00', 'True'),
    (4, 'Linda White', 'Kolkata', '2023-02-19', None, 'yes'),
    (5, 'Mike Green', 'Chennai', '2023-08-10', 'NaN', '1'),
    (6, 'Sarah Blue', 'Hydrabad', 'InvalidDate', '300.40', 'No')    
]

columns = ['id', 'name', 'city', 'date', 'amount', 'is_active']

df = spark.createDataFrame(data, columns)

df.show()

                                                                                

+---+------------+---------+-----------+------+---------+
| id|        name|     city|       date|amount|is_active|
+---+------------+---------+-----------+------+---------+
|  1|    John Doe|Bangalore| 2023-01-15|123.34|     True|
|  2|  Jane Smith|    Delhi| 2023-05-20| 89.50|    False|
|  3|Robert Brown|   Mumbai|InvalidDate|200.00|     True|
|  4| Linda White|  Kolkata| 2023-02-19|  NULL|      yes|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|
|  6|  Sarah Blue| Hydrabad|InvalidDate|300.40|       No|
+---+------------+---------+-----------+------+---------+



In [4]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- is_active: string (nullable = true)



### Handling Integer Column

In [6]:
df.id

Column<'id'>

In [7]:
df['id']

Column<'id'>

In [8]:
df.filter(df.id>3).show()

+---+-----------+--------+-----------+------+---------+
| id|       name|    city|       date|amount|is_active|
+---+-----------+--------+-----------+------+---------+
|  4|Linda White| Kolkata| 2023-02-19|  NULL|      yes|
|  5| Mike Green| Chennai| 2023-08-10|   NaN|        1|
|  6| Sarah Blue|Hydrabad|InvalidDate|300.40|       No|
+---+-----------+--------+-----------+------+---------+



In [9]:
df.withColumn('id_double', df.id*2).show()

+---+------------+---------+-----------+------+---------+---------+
| id|        name|     city|       date|amount|is_active|id_double|
+---+------------+---------+-----------+------+---------+---------+
|  1|    John Doe|Bangalore| 2023-01-15|123.34|     True|        2|
|  2|  Jane Smith|    Delhi| 2023-05-20| 89.50|    False|        4|
|  3|Robert Brown|   Mumbai|InvalidDate|200.00|     True|        6|
|  4| Linda White|  Kolkata| 2023-02-19|  NULL|      yes|        8|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|       10|
|  6|  Sarah Blue| Hydrabad|InvalidDate|300.40|       No|       12|
+---+------------+---------+-----------+------+---------+---------+



In [12]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col

In [13]:
df = df.withColumn('id', col('id').cast(IntegerType()))

In [16]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- is_active: string (nullable = true)



### Handling String Column

In [18]:
from pyspark.sql.functions import *

In [17]:
df = df.withColumn('name_upper', upper(df.name))

In [19]:
df.show(2)

+---+----------+---------+----------+------+---------+----------+
| id|      name|     city|      date|amount|is_active|name_upper|
+---+----------+---------+----------+------+---------+----------+
|  1|  John Doe|Bangalore|2023-01-15|123.34|     True|  JOHN DOE|
|  2|Jane Smith|    Delhi|2023-05-20| 89.50|    False|JANE SMITH|
+---+----------+---------+----------+------+---------+----------+
only showing top 2 rows



In [26]:
df = df.withColumn('name_lower', lower(df.name))

In [27]:
df.show(2)

+---+----------+---------+----------+------+---------+----------+----------+
| id|      name|     city|      date|amount|is_active|name_upper|name_lower|
+---+----------+---------+----------+------+---------+----------+----------+
|  1|  John Doe|Bangalore|2023-01-15|123.34|     True|  JOHN DOE|  john doe|
|  2|Jane Smith|    Delhi|2023-05-20| 89.50|    False|JANE SMITH|jane smith|
+---+----------+---------+----------+------+---------+----------+----------+
only showing top 2 rows



In [25]:
df.filter(df.city.startswith('B')).show()

+---+--------+---------+----------+------+---------+----------+
| id|    name|     city|      date|amount|is_active|name_upper|
+---+--------+---------+----------+------+---------+----------+
|  1|John Doe|Bangalore|2023-01-15|123.34|     True|  JOHN DOE|
+---+--------+---------+----------+------+---------+----------+



### Handling Float Column

In [28]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- is_active: string (nullable = true)
 |-- name_upper: string (nullable = true)
 |-- name_lower: string (nullable = true)



In [30]:
df = df.withColumn('amount', col('amount').cast('float'))

In [31]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: float (nullable = true)
 |-- is_active: string (nullable = true)
 |-- name_upper: string (nullable = true)
 |-- name_lower: string (nullable = true)



In [32]:
df.show()

+---+------------+---------+-----------+------+---------+------------+------------+
| id|        name|     city|       date|amount|is_active|  name_upper|  name_lower|
+---+------------+---------+-----------+------+---------+------------+------------+
|  1|    John Doe|Bangalore| 2023-01-15|123.34|     True|    JOHN DOE|    john doe|
|  2|  Jane Smith|    Delhi| 2023-05-20|  89.5|    False|  JANE SMITH|  jane smith|
|  3|Robert Brown|   Mumbai|InvalidDate| 200.0|     True|ROBERT BROWN|robert brown|
|  4| Linda White|  Kolkata| 2023-02-19|  NULL|      yes| LINDA WHITE| linda white|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|  MIKE GREEN|  mike green|
|  6|  Sarah Blue| Hydrabad|InvalidDate| 300.4|       No|  SARAH BLUE|  sarah blue|
+---+------------+---------+-----------+------+---------+------------+------------+



### Filling NAs and Nulls

In [34]:
df_filled = df.fillna({'amount': 0})

In [35]:
df_filled.show()

+---+------------+---------+-----------+------+---------+------------+------------+
| id|        name|     city|       date|amount|is_active|  name_upper|  name_lower|
+---+------------+---------+-----------+------+---------+------------+------------+
|  1|    John Doe|Bangalore| 2023-01-15|123.34|     True|    JOHN DOE|    john doe|
|  2|  Jane Smith|    Delhi| 2023-05-20|  89.5|    False|  JANE SMITH|  jane smith|
|  3|Robert Brown|   Mumbai|InvalidDate| 200.0|     True|ROBERT BROWN|robert brown|
|  4| Linda White|  Kolkata| 2023-02-19|   0.0|      yes| LINDA WHITE| linda white|
|  5|  Mike Green|  Chennai| 2023-08-10|   0.0|        1|  MIKE GREEN|  mike green|
|  6|  Sarah Blue| Hydrabad|InvalidDate| 300.4|       No|  SARAH BLUE|  sarah blue|
+---+------------+---------+-----------+------+---------+------------+------------+



### Handling Date Column

In [51]:
csv_data = """id,date_iso,date_dmy,date_mdy,timestamp_1,timestamp_2
1,2023-01-15,15/01/2023,01/15/2023,2023-01-15 10:30:00,2023-01-15 10:30:00
2,2023-05-20,20/05/2023,05/20/2023,2023-05-20 15:45:00,2023-05-20 15:45:00
3,InvalidDate,31/02/2023,02/31/2023,InvalidTimestamp,InvalidTimestamp
4,,,,,"""

with open('dates_data.csv', 'w') as f:
    f.write(csv_data)

In [52]:
ls

[0m[01;36mbin[0m@       [01;34mdataforhdfs[0m/    [01;34metc[0m/     [01;36mlib[0m@         [01;34mmedia[0m/  [01;34mproc[0m/  [01;36msbin[0m@  [30;42mtmp[0m/
[01;34mboot[0m/      dates_data.csv  [01;34mhadoop[0m/  [01;36mlib64[0m@       [01;34mmnt[0m/    [01;34mroot[0m/  [01;34msrv[0m/   [01;34musr[0m/
copyright  [01;34mdev[0m/            [01;34mhome[0m/    [01;34mlost+found[0m/  [01;34mopt[0m/    [01;34mrun[0m/   [01;34msys[0m/   [01;34mvar[0m/


In [53]:
!ls *dates_data.csv*

dates_data.csv


In [56]:
!hadoop fs -put dates_data.csv /data/

In [57]:
!hadoop fs -ls /data/

Found 5 items
-rw-r--r--   2 root hadoop       5488 2025-08-23 17:19 /data/customers_100.csv
-rw-r--r--   2 root hadoop        280 2025-08-24 17:21 /data/dates_data.csv
drwxr-xr-x   - root hadoop          0 2025-08-23 19:30 /data/write_output.csv
drwxr-xr-x   - root hadoop          0 2025-08-23 19:46 /data/write_output_delimiter.csv
drwxr-xr-x   - root hadoop          0 2025-08-23 19:42 /data/write_output_repartition.csv


In [58]:
df = spark.read.format('csv').option('header','true').load('/data/dates_data.csv')

In [59]:
df.show()

+---+-----------+----------+----------+-------------------+-------------------+
| id|   date_iso|  date_dmy|  date_mdy|        timestamp_1|        timestamp_2|
+---+-----------+----------+----------+-------------------+-------------------+
|  1| 2023-01-15|15/01/2023|01/15/2023|2023-01-15 10:30:00|2023-01-15 10:30:00|
|  2| 2023-05-20|20/05/2023|05/20/2023|2023-05-20 15:45:00|2023-05-20 15:45:00|
|  3|InvalidDate|31/02/2023|02/31/2023|   InvalidTimestamp|   InvalidTimestamp|
|  4|       NULL|      NULL|      NULL|               NULL|               NULL|
+---+-----------+----------+----------+-------------------+-------------------+



In [60]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- date_iso: string (nullable = true)
 |-- date_dmy: string (nullable = true)
 |-- date_mdy: string (nullable = true)
 |-- timestamp_1: string (nullable = true)
 |-- timestamp_2: string (nullable = true)



In [63]:
ddl_schema = """id INT,
                date_iso DATE,
                date_dmy DATE,
                date_mdy DATE,
                timestamp_1 DATE,
                timestamp_2 TIMESTAMP"""

In [92]:
df = spark.read.format('csv').option('header','true').schema(ddl_schema).load('/data/dates_data.csv')

In [93]:
df.show()

+---+----------+--------+--------+-----------+-------------------+
| id|  date_iso|date_dmy|date_mdy|timestamp_1|        timestamp_2|
+---+----------+--------+--------+-----------+-------------------+
|  1|2023-01-15|    NULL|    NULL| 2023-01-15|2023-01-15 10:30:00|
|  2|2023-05-20|    NULL|    NULL| 2023-05-20|2023-05-20 15:45:00|
|  3|      NULL|    NULL|    NULL|       NULL|               NULL|
|  4|      NULL|    NULL|    NULL|       NULL|               NULL|
+---+----------+--------+--------+-----------+-------------------+



In [94]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- date_iso: date (nullable = true)
 |-- date_dmy: date (nullable = true)
 |-- date_mdy: date (nullable = true)
 |-- timestamp_1: date (nullable = true)
 |-- timestamp_2: timestamp (nullable = true)



### Formatting Date Columns

In [95]:
from pyspark.sql.functions import to_date

In [96]:
df = spark.read.format('csv').option('header','true').load('/data/dates_data.csv')
df = df.withColumn('parsed_date_iso', to_date(df.date_iso, 'yyyy-MM-dd'))\
        .withColumn('parsed_date_dmy', to_date(df.date_dmy, 'dd/MM/yyyy'))\
        .withColumn('parsed_date_mdy', to_date(df.date_mdy, 'MM/dd/yyyy'))

In [97]:
df.show()

+---+-----------+----------+----------+-------------------+-------------------+---------------+---------------+---------------+
| id|   date_iso|  date_dmy|  date_mdy|        timestamp_1|        timestamp_2|parsed_date_iso|parsed_date_dmy|parsed_date_mdy|
+---+-----------+----------+----------+-------------------+-------------------+---------------+---------------+---------------+
|  1| 2023-01-15|15/01/2023|01/15/2023|2023-01-15 10:30:00|2023-01-15 10:30:00|     2023-01-15|     2023-01-15|     2023-01-15|
|  2| 2023-05-20|20/05/2023|05/20/2023|2023-05-20 15:45:00|2023-05-20 15:45:00|     2023-05-20|     2023-05-20|     2023-05-20|
|  3|InvalidDate|31/02/2023|02/31/2023|   InvalidTimestamp|   InvalidTimestamp|           NULL|           NULL|           NULL|
|  4|       NULL|      NULL|      NULL|               NULL|               NULL|           NULL|           NULL|           NULL|
+---+-----------+----------+----------+-------------------+-------------------+---------------+---------

In [98]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- date_iso: string (nullable = true)
 |-- date_dmy: string (nullable = true)
 |-- date_mdy: string (nullable = true)
 |-- timestamp_1: string (nullable = true)
 |-- timestamp_2: string (nullable = true)
 |-- parsed_date_iso: date (nullable = true)
 |-- parsed_date_dmy: date (nullable = true)
 |-- parsed_date_mdy: date (nullable = true)



### TimeStamp

In [99]:
from pyspark.sql.functions import to_timestamp, year, month, dayofmonth, hour, minute, second

In [100]:
df = df.withColumn('parsed_timestamp_1', to_timestamp(df.timestamp_1))\
       .withColumn('parsed_timestamp_2', to_timestamp(df.timestamp_2, 'yyyy-MM-dd HH:mm:ss'))

df.show()
df.printSchema()

+---+-----------+----------+----------+-------------------+-------------------+---------------+---------------+---------------+-------------------+-------------------+
| id|   date_iso|  date_dmy|  date_mdy|        timestamp_1|        timestamp_2|parsed_date_iso|parsed_date_dmy|parsed_date_mdy| parsed_timestamp_1| parsed_timestamp_2|
+---+-----------+----------+----------+-------------------+-------------------+---------------+---------------+---------------+-------------------+-------------------+
|  1| 2023-01-15|15/01/2023|01/15/2023|2023-01-15 10:30:00|2023-01-15 10:30:00|     2023-01-15|     2023-01-15|     2023-01-15|2023-01-15 10:30:00|2023-01-15 10:30:00|
|  2| 2023-05-20|20/05/2023|05/20/2023|2023-05-20 15:45:00|2023-05-20 15:45:00|     2023-05-20|     2023-05-20|     2023-05-20|2023-05-20 15:45:00|2023-05-20 15:45:00|
|  3|InvalidDate|31/02/2023|02/31/2023|   InvalidTimestamp|   InvalidTimestamp|           NULL|           NULL|           NULL|               NULL|             

In [90]:
df = df.withColumn('year', year(df.parsed_timestamp_1))\
       .withColumn('month', month(df.parsed_timestamp_1))\
       .withColumn('day', dayofmonth(df.parsed_timestamp_1))\
       .withColumn('hour', hour(df.parsed_timestamp_1))\
       .withColumn('minute', minute(df.parsed_timestamp_1))\
       .withColumn('second', second(df.parsed_timestamp_1))

df.select('parsed_timestamp_1', 'year', 'month', 'day', 'hour', 'minute', 'second').show()
df.select('parsed_timestamp_1', 'year', 'month', 'day', 'hour', 'minute', 'second').printSchema()

+-------------------+----+-----+----+----+------+------+
| parsed_timestamp_1|year|month| day|hour|minute|second|
+-------------------+----+-----+----+----+------+------+
|2023-01-15 10:30:00|2023|    1|  15|  10|    30|     0|
|2023-05-20 15:45:00|2023|    5|  20|  15|    45|     0|
|               NULL|NULL| NULL|NULL|NULL|  NULL|  NULL|
|               NULL|NULL| NULL|NULL|NULL|  NULL|  NULL|
+-------------------+----+-----+----+----+------+------+

root
 |-- parsed_timestamp_1: timestamp (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)
 |-- second: integer (nullable = true)



In [101]:
from pyspark.sql.functions import datediff

In [102]:
df = df.withColumn('day_diff', datediff(df.parsed_date_dmy, df.parsed_date_mdy))
df.select(df.parsed_date_dmy, df.parsed_date_mdy, 'day_diff').show()

+---------------+---------------+--------+
|parsed_date_dmy|parsed_date_mdy|day_diff|
+---------------+---------------+--------+
|     2023-01-15|     2023-01-15|       0|
|     2023-05-20|     2023-05-20|       0|
|           NULL|           NULL|    NULL|
|           NULL|           NULL|    NULL|
+---------------+---------------+--------+



## Spark SQL - Table

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('Spark SQL Table').enableHiveSupport().getOrCreate()

25/08/25 18:19:28 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
spark

In [106]:
!hadoop fs -ls /tmp/

Found 7 items
drwxr-xr-x   - root            hadoop          0 2025-08-22 17:03 /tmp/active_cities.csv
-rw-r--r--   2 sekar_dhana8644 hadoop   10528211 2025-08-21 18:37 /tmp/customers_10mb.csv
-rw-r--r--   2 sekar_dhana8644 hadoop    1060750 2025-08-21 18:37 /tmp/customers_1mb.csv
-rw-r--r--   2 sekar_dhana8644 hadoop       5488 2025-08-21 18:40 /tmp/first_100_customers.csv
drwxrwxrwt   - hdfs            hadoop          0 2025-08-13 21:45 /tmp/hadoop-yarn
drwx-wx-wx   - hive            hadoop          0 2025-08-13 21:45 /tmp/hive
-rw-r--r--   2 root            hadoop         83 2025-08-19 13:33 /tmp/input.txt


In [107]:
!hadoop fs -ls /data/

Found 5 items
-rw-r--r--   2 root hadoop       5488 2025-08-23 17:19 /data/customers_100.csv
-rw-r--r--   2 root hadoop        280 2025-08-24 17:21 /data/dates_data.csv
drwxr-xr-x   - root hadoop          0 2025-08-23 19:30 /data/write_output.csv
drwxr-xr-x   - root hadoop          0 2025-08-23 19:46 /data/write_output_delimiter.csv
drwxr-xr-x   - root hadoop          0 2025-08-23 19:42 /data/write_output_repartition.csv


In [7]:
df = spark.read.format('csv').option('header', 'true').option('inferSchema', 'true').load('/data/customers_100.csv')

                                                                                

In [110]:
df.show(5)

+-----------+----------+---------+-----------+-------+-----------------+---------+
|customer_id|      name|     city|      state|country|registration_date|is_active|
+-----------+----------+---------+-----------+-------+-----------------+---------+
|          0|Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2|Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          3|Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    false|
|          4|Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    false|
+-----------+----------+---------+-----------+-------+-----------------+---------+
only showing top 5 rows



In [111]:
spark.sql('show tables').show()

ivysettings.xml file not found in HIVE_HOME or HIVE_CONF_DIR,/etc/hive/conf.dist/ivysettings.xml will be used


+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



### Temporary Table

In [112]:
df.createOrReplaceTempView('temp_customers')

In [113]:
spark.sql('show tables').show()

+---------+--------------+-----------+
|namespace|     tableName|isTemporary|
+---------+--------------+-----------+
|         |temp_customers|       true|
+---------+--------------+-----------+



In [114]:
spark.sql('SELECT * FROM temp_customers LIMIT 5').show()

+-----------+----------+---------+-----------+-------+-----------------+---------+
|customer_id|      name|     city|      state|country|registration_date|is_active|
+-----------+----------+---------+-----------+-------+-----------------+---------+
|          0|Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2|Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          3|Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    false|
|          4|Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    false|
+-----------+----------+---------+-----------+-------+-----------------+---------+



In [11]:
spark_new = spark.newSession()

In [116]:
spark_new.sql('show tables').show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



### Gobal Temporary Table

In [120]:
spark.sql('show tables').show()

+---------+--------------+-----------+
|namespace|     tableName|isTemporary|
+---------+--------------+-----------+
|         |temp_customers|       true|
+---------+--------------+-----------+



In [119]:
df.createOrReplaceGlobalTempView('global_customers')

In [121]:
spark.sql('show tables in global_temp').show()

+-----------+----------------+-----------+
|  namespace|       tableName|isTemporary|
+-----------+----------------+-----------+
|global_temp|global_customers|       true|
|global_temp|  temp_customers|       true|
|           |  temp_customers|       true|
+-----------+----------------+-----------+



In [123]:
spark_new.sql('show tables in global_temp').show()

+-----------+----------------+-----------+
|  namespace|       tableName|isTemporary|
+-----------+----------------+-----------+
|global_temp|global_customers|       true|
|global_temp|  temp_customers|       true|
+-----------+----------------+-----------+



In [125]:
spark.sql('SELECT * FROM global_temp.global_customers LIMIT 5').show()

+-----------+----------+---------+-----------+-------+-----------------+---------+
|customer_id|      name|     city|      state|country|registration_date|is_active|
+-----------+----------+---------+-----------+-------+-----------------+---------+
|          0|Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2|Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          3|Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    false|
|          4|Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    false|
+-----------+----------+---------+-----------+-------+-----------------+---------+



### Persistent Table

In [25]:
df.write.mode('overwrite').saveAsTable('customers_persistent')

In [9]:
spark.sql('show tables').show()

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|  default|customers_persistent|      false|
+---------+--------------------+-----------+



In [12]:
spark_new.sql('show tables').show()

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|  default|customers_persistent|      false|
+---------+--------------------+-----------+



In [13]:
spark.sql('SELECT * FROM customers_persistent LIMIT 5').show()

+-----------+----------+---------+-----------+-------+-----------------+---------+
|customer_id|      name|     city|      state|country|registration_date|is_active|
+-----------+----------+---------+-----------+-------+-----------------+---------+
|          0|Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2|Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          3|Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    false|
|          4|Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    false|
+-----------+----------+---------+-----------+-------+-----------------+---------+



In [18]:
!hadoop fs -ls /user/hive/warehouse/

Found 1 items
drwxr-xr-x   - root hadoop          0 2025-08-25 13:14 /user/hive/warehouse/customers_persistent


In [19]:
!hadoop fs -ls /user/hive/warehouse/customers_persistent/

Found 2 items
-rw-r--r--   2 root hadoop          0 2025-08-25 13:14 /user/hive/warehouse/customers_persistent/_SUCCESS
-rw-r--r--   2 root hadoop       3625 2025-08-25 13:14 /user/hive/warehouse/customers_persistent/part-00000-2101a591-641f-45e0-abd4-c803b43de467-c000.snappy.parquet


In [20]:
spark.sql('DESCRIBE EXTENDED customers_persistent').show(truncate=False)

+----------------------------+----------------------------------------------------------------------+-------+
|col_name                    |data_type                                                             |comment|
+----------------------------+----------------------------------------------------------------------+-------+
|customer_id                 |int                                                                   |NULL   |
|name                        |string                                                                |NULL   |
|city                        |string                                                                |NULL   |
|state                       |string                                                                |NULL   |
|country                     |string                                                                |NULL   |
|registration_date           |date                                                                  |NULL   |
|is_active

## Spark SQL

In [4]:
data = [
    (1, 'John Doe', 'Bangalore', '2023-01-15', '123.34', 'True'),
    (2, 'Jane Smith', 'Delhi', '2023-05-20', '89.50', 'False'),
    (3, 'Robert Brown', 'Mumbai', 'InvalidDate', '200.00', 'True'),
    (4, 'Linda White', 'Kolkata', '2023-02-19', None, 'yes'),
    (5, 'Mike Green', 'Chennai', '2023-08-10', 'NaN', '1'),
    (6, 'Sarah Blue', 'Hydrabad', 'InvalidDate', '300.40', 'No')    
]

column = ['customer_id', 'name', 'city', 'date', 'amount', 'is_active']

df = spark.createDataFrame(data, column)

In [5]:
df.show()

                                                                                

+-----------+------------+---------+-----------+------+---------+
|customer_id|        name|     city|       date|amount|is_active|
+-----------+------------+---------+-----------+------+---------+
|          1|    John Doe|Bangalore| 2023-01-15|123.34|     True|
|          2|  Jane Smith|    Delhi| 2023-05-20| 89.50|    False|
|          3|Robert Brown|   Mumbai|InvalidDate|200.00|     True|
|          4| Linda White|  Kolkata| 2023-02-19|  NULL|      yes|
|          5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|
|          6|  Sarah Blue| Hydrabad|InvalidDate|300.40|       No|
+-----------+------------+---------+-----------+------+---------+



In [7]:
spark.sql('show databases').show()

ivysettings.xml file not found in HIVE_HOME or HIVE_CONF_DIR,/etc/hive/conf.dist/ivysettings.xml will be used


+---------+
|namespace|
+---------+
|  default|
+---------+



In [8]:
spark.sql('use default')

DataFrame[]

In [10]:
# spark.sql('CREATE DATABASE IF NOT EXISTS ecommerce')

In [12]:
spark.sql('show databases').show()

+---------+
|namespace|
+---------+
|  default|
|ecommerce|
+---------+



In [13]:
spark.sql('show tables').show()

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|  default|customers_persistent|      false|
+---------+--------------------+-----------+



In [15]:
spark.sql('SELECT * FROM customers_persistent LIMIT 5').show()

[Stage 2:>                                                          (0 + 1) / 1]

+-----------+----------+---------+-----------+-------+-----------------+---------+
|customer_id|      name|     city|      state|country|registration_date|is_active|
+-----------+----------+---------+-----------+-------+-----------------+---------+
|          0|Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2|Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          3|Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    false|
|          4|Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    false|
+-----------+----------+---------+-----------+-------+-----------------+---------+



                                                                                

In [16]:
spark.sql('''
CREATE TABLE IF NOT EXISTS customers (
    customer_id INT, 
    name STRING,
    city STRING,
    state STRING,
    country STRING,
    registration_date DATE,
    is_active BOOLEAN
    ) USING CSV
    ''')

25/08/25 16:12:07 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider CSV. Persisting data source table `spark_catalog`.`default`.`customers` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
25/08/25 16:12:07 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.


DataFrame[]

In [26]:
spark.sql('show tables').show()

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|  default|           customers|      false|
|  default|customers_persistent|      false|
+---------+--------------------+-----------+



In [18]:
spark.sql('DESCRIBE EXTENDED customers').show(truncate=False)

+----------------------------+-----------------------------------------------------------+-------+
|col_name                    |data_type                                                  |comment|
+----------------------------+-----------------------------------------------------------+-------+
|customer_id                 |int                                                        |NULL   |
|name                        |string                                                     |NULL   |
|city                        |string                                                     |NULL   |
|state                       |string                                                     |NULL   |
|country                     |string                                                     |NULL   |
|registration_date           |date                                                       |NULL   |
|is_active                   |boolean                                                    |NULL   |
|         

In [19]:
!hadoop fs -ls /user/hive/warehouse/customers

In [20]:
df.write.mode('overwrite').saveAsTable('default.customers')

In [21]:
!hadoop fs -ls /user/hive/warehouse/customers

Found 3 items
-rw-r--r--   2 root hadoop          0 2025-08-25 16:15 /user/hive/warehouse/customers/_SUCCESS
-rw-r--r--   2 root hadoop       1891 2025-08-25 16:15 /user/hive/warehouse/customers/part-00000-6d0120bd-89e6-4b33-a6b4-c4f33a18f350-c000.snappy.parquet
-rw-r--r--   2 root hadoop       1832 2025-08-25 16:15 /user/hive/warehouse/customers/part-00001-6d0120bd-89e6-4b33-a6b4-c4f33a18f350-c000.snappy.parquet


In [22]:
spark.sql('SELECT * FROM customers LIMIT 5').show()

+-----------+------------+---------+-----------+------+---------+
|customer_id|        name|     city|       date|amount|is_active|
+-----------+------------+---------+-----------+------+---------+
|          1|    John Doe|Bangalore| 2023-01-15|123.34|     True|
|          2|  Jane Smith|    Delhi| 2023-05-20| 89.50|    False|
|          3|Robert Brown|   Mumbai|InvalidDate|200.00|     True|
|          4| Linda White|  Kolkata| 2023-02-19|  NULL|      yes|
|          5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|
+-----------+------------+---------+-----------+------+---------+



In [28]:
!hadoop fs -ls /user/hive/warehouse/

Found 3 items
drwxr-xr-x   - root hadoop          0 2025-08-25 16:15 /user/hive/warehouse/customers
drwxr-xr-x   - root hadoop          0 2025-08-25 16:19 /user/hive/warehouse/customers_persistent
drwxr-xr-x   - root hadoop          0 2025-08-25 16:07 /user/hive/warehouse/ecommerce.db


In [29]:
spark.sql('''
DROP TABLE IF EXISTS customers_persistent
''').show()

++
||
++
++



In [30]:
!hadoop fs -ls /user/hive/warehouse/

Found 2 items
drwxr-xr-x   - root hadoop          0 2025-08-25 16:15 /user/hive/warehouse/customers
drwxr-xr-x   - root hadoop          0 2025-08-25 16:07 /user/hive/warehouse/ecommerce.db


### Spark SQL Managed vs External Tables

#### Managed Table

In [31]:
df = spark.read.format('csv').option('header', 'true').option('inferSchema','true').load('/data/customers_100.csv')

                                                                                

In [32]:
df.show(5)

+-----------+----------+---------+-----------+-------+-----------------+---------+
|customer_id|      name|     city|      state|country|registration_date|is_active|
+-----------+----------+---------+-----------+-------+-----------------+---------+
|          0|Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2|Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          3|Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    false|
|          4|Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    false|
+-----------+----------+---------+-----------+-------+-----------------+---------+
only showing top 5 rows



In [33]:
df.createOrReplaceTempView('temp_customers')

In [34]:
spark.sql('SELECT * FROM temp_customers LIMIT 1').show()

+-----------+----------+----+-----------+-------+-----------------+---------+
|customer_id|      name|city|      state|country|registration_date|is_active|
+-----------+----------+----+-----------+-------+-----------------+---------+
|          0|Customer_0|Pune|Maharashtra|  India|       2023-06-29|    false|
+-----------+----------+----+-----------+-------+-----------------+---------+



In [35]:
spark.sql('SHOW TABLES').show()

+---------+--------------+-----------+
|namespace|     tableName|isTemporary|
+---------+--------------+-----------+
|  default|     customers|      false|
|         |temp_customers|       true|
+---------+--------------+-----------+



In [36]:
spark.sql('''
    CREATE TABLE managed_customers AS
    SELECT * FROM temp_customers
    ''')

25/08/25 16:33:19 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.
                                                                                

DataFrame[]

In [37]:
spark.sql('DESCRIBE EXTENDED managed_customers').show(truncate=False)

+----------------------------+----------------------------------+-------+
|col_name                    |data_type                         |comment|
+----------------------------+----------------------------------+-------+
|customer_id                 |int                               |NULL   |
|name                        |string                            |NULL   |
|city                        |string                            |NULL   |
|state                       |string                            |NULL   |
|country                     |string                            |NULL   |
|registration_date           |date                              |NULL   |
|is_active                   |boolean                           |NULL   |
|                            |                                  |       |
|# Detailed Table Information|                                  |       |
|Catalog                     |spark_catalog                     |       |
|Database                    |default 

In [38]:
!hadoop fs -ls /user/hive/warehouse/

Found 3 items
drwxr-xr-x   - root hadoop          0 2025-08-25 16:15 /user/hive/warehouse/customers
drwxr-xr-x   - root hadoop          0 2025-08-25 16:07 /user/hive/warehouse/ecommerce.db
drwxr-xr-x   - root hadoop          0 2025-08-25 16:33 /user/hive/warehouse/managed_customers


In [72]:
spark.sql('DROP TABLE managed_customers')

DataFrame[]

In [73]:
!hadoop fs -ls /user/hive/warehouse/

Found 2 items
drwxr-xr-x   - root hadoop          0 2025-08-25 16:15 /user/hive/warehouse/customers
drwxr-xr-x   - root hadoop          0 2025-08-25 16:07 /user/hive/warehouse/ecommerce.db


#### External Table

In [74]:
!hadoop fs -ls /data/

Found 6 items
-rw-r--r--   2 root hadoop       5488 2025-08-23 17:19 /data/customers_100.csv
-rw-r--r--   2 root hadoop        280 2025-08-24 17:21 /data/dates_data.csv
drwxr-xr-x   - root hadoop          0 2025-08-25 16:48 /data/external_data
drwxr-xr-x   - root hadoop          0 2025-08-23 19:30 /data/write_output.csv
drwxr-xr-x   - root hadoop          0 2025-08-23 19:46 /data/write_output_delimiter.csv
drwxr-xr-x   - root hadoop          0 2025-08-23 19:42 /data/write_output_repartition.csv


In [75]:
!hadoop fs -mkdir /data/external_data

mkdir: `/data/external_data': File exists


In [51]:
!hadoop fs -cp /data/customers_100.csv /data/external_data/

In [76]:
!hadoop fs -ls /data/external_data/

Found 1 items
-rw-r--r--   2 root hadoop       5488 2025-08-25 16:48 /data/external_data/customers_100.csv


In [77]:
spark.sql('''
    CREATE EXTERNAL TABLE IF NOT EXISTS external_customers (
    customer_id INT, 
    name STRING,
    city STRING,
    state STRING,
    country STRING,
    registration_date DATE,
    is_active BOOLEAN
    ) USING CSV
    LOCATION '/data/external_data/'
    ''')

25/08/25 17:19:15 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider CSV. Persisting data source table `spark_catalog`.`default`.`external_customers` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.


DataFrame[]

In [78]:
spark.sql('DESCRIBE EXTENDED external_customers').show(truncate=False)

+----------------------------+--------------------------------------------------+-------+
|col_name                    |data_type                                         |comment|
+----------------------------+--------------------------------------------------+-------+
|customer_id                 |int                                               |NULL   |
|name                        |string                                            |NULL   |
|city                        |string                                            |NULL   |
|state                       |string                                            |NULL   |
|country                     |string                                            |NULL   |
|registration_date           |date                                              |NULL   |
|is_active                   |boolean                                           |NULL   |
|                            |                                                  |       |
|# Detaile

In [79]:
!hadoop fs -ls /data/external_data/

Found 1 items
-rw-r--r--   2 root hadoop       5488 2025-08-25 16:48 /data/external_data/customers_100.csv


In [80]:
spark.sql('SHOW TABLES').show()

+---------+------------------+-----------+
|namespace|         tableName|isTemporary|
+---------+------------------+-----------+
|  default|         customers|      false|
|  default|external_customers|      false|
|         |    temp_customers|       true|
+---------+------------------+-----------+



In [81]:
spark.sql('SELECT * FROM external_customers LIMIT 3').show()

+-----------+----------+---------+-----------+-------+-----------------+---------+
|customer_id|      name|     city|      state|country|registration_date|is_active|
+-----------+----------+---------+-----------+-------+-----------------+---------+
|       NULL|      name|     city|      state|country|             NULL|     NULL|
|          0|Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
+-----------+----------+---------+-----------+-------+-----------------+---------+



In [82]:
!hadoop fs -ls /data/external_data/

Found 1 items
-rw-r--r--   2 root hadoop       5488 2025-08-25 16:48 /data/external_data/customers_100.csv


In [83]:
spark.sql('DROP TABLE external_customers')

DataFrame[]

In [84]:
!hadoop fs -ls /data/external_data/

Found 1 items
-rw-r--r--   2 root hadoop       5488 2025-08-25 16:48 /data/external_data/customers_100.csv


In [85]:
spark.sql('SHOW TABLES').show()

+---------+--------------+-----------+
|namespace|     tableName|isTemporary|
+---------+--------------+-----------+
|  default|     customers|      false|
|         |temp_customers|       true|
+---------+--------------+-----------+



## Overview

### Spark Read Different Methods

#### Spark Read

In [5]:
df_csv = spark.read.format('csv').option('header', 'true').load('/data/customers_100.csv')

                                                                                

In [6]:
df_csv.show(3)

+-----------+----------+---------+-----------+-------+-----------------+---------+
|customer_id|      name|     city|      state|country|registration_date|is_active|
+-----------+----------+---------+-----------+-------+-----------------+---------+
|          0|Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    False|
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     True|
|          2|Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     True|
+-----------+----------+---------+-----------+-------+-----------------+---------+
only showing top 3 rows



In [7]:
spark.sql('SHOW TABLES').show()

ivysettings.xml file not found in HIVE_HOME or HIVE_CONF_DIR,/etc/hive/conf.dist/ivysettings.xml will be used


+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|  default|customers|      false|
+---------+---------+-----------+



#### Spark SQL

In [9]:
df_sql = spark.sql('SELECT * FROM customers WHERE is_active = True')
df_sql.show(3)

+-----------+------------+---------+-----------+------+---------+
|customer_id|        name|     city|       date|amount|is_active|
+-----------+------------+---------+-----------+------+---------+
|          1|    John Doe|Bangalore| 2023-01-15|123.34|     True|
|          3|Robert Brown|   Mumbai|InvalidDate|200.00|     True|
|          4| Linda White|  Kolkata| 2023-02-19|  NULL|      yes|
+-----------+------------+---------+-----------+------+---------+
only showing top 3 rows



                                                                                

#### Spark Table

In [10]:
df_table = spark.table('customers')
df_table.show(3)

+-----------+------------+---------+-----------+------+---------+
|customer_id|        name|     city|       date|amount|is_active|
+-----------+------------+---------+-----------+------+---------+
|          1|    John Doe|Bangalore| 2023-01-15|123.34|     True|
|          2|  Jane Smith|    Delhi| 2023-05-20| 89.50|    False|
|          3|Robert Brown|   Mumbai|InvalidDate|200.00|     True|
+-----------+------------+---------+-----------+------+---------+
only showing top 3 rows



#### Spark DataFrame

In [11]:
data = [
    (1, 'Alice', 'Bangalore', '23-01-15', True),
    (2, 'Bob', 'Delhi', '2023-03-25', False),
    (3, 'Charlie', 'Chennai', '2023-05-10', True)
]

column = ['customer_id', 'name', 'city', 'date', 'is_active']

df_DataFrame = spark.createDataFrame(data, column)
df_DataFrame.show(3)

+-----------+-------+---------+----------+---------+
|customer_id|   name|     city|      date|is_active|
+-----------+-------+---------+----------+---------+
|          1|  Alice|Bangalore|  23-01-15|     true|
|          2|    Bob|    Delhi|2023-03-25|    false|
|          3|Charlie|  Chennai|2023-05-10|     true|
+-----------+-------+---------+----------+---------+



                                                                                

#### RDD - DataFrame

In [13]:
rdd = spark.sparkContext.parallelize([(1, 'Alice'), (2, 'Bob')])
rdd.take(2)                             

[(1, 'Alice'), (2, 'Bob')]

In [15]:
rdd_df = rdd.toDF(['customer_id', 'name'])
rdd_df.show()

+-----------+-----+
|customer_id| name|
+-----------+-----+
|          1|Alice|
|          2|  Bob|
+-----------+-----+



## Persist and Caching

### Cache in RDD

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('Spark_Cache_RDD').getOrCreate()

25/08/27 14:17:56 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
spark

In [5]:
!hadoop fs -ls /tmp/

Found 7 items
drwxr-xr-x   - root            hadoop          0 2025-08-22 17:03 /tmp/active_cities.csv
-rw-r--r--   2 sekar_dhana8644 hadoop   10528211 2025-08-21 18:37 /tmp/customers_10mb.csv
-rw-r--r--   2 sekar_dhana8644 hadoop    1060750 2025-08-21 18:37 /tmp/customers_1mb.csv
-rw-r--r--   2 sekar_dhana8644 hadoop       5488 2025-08-21 18:40 /tmp/first_100_customers.csv
drwxrwxrwt   - hdfs            hadoop          0 2025-08-13 21:45 /tmp/hadoop-yarn
drwx-wx-wx   - hive            hadoop          0 2025-08-13 21:45 /tmp/hive
-rw-r--r--   2 root            hadoop         83 2025-08-19 13:33 /tmp/input.txt


In [8]:
customer_rdd = spark.sparkContext.textFile('/tmp/customers_1mb.csv')
customer_rdd.take(1)

                                                                                

['customer_id,name,city,state,country,registration_date,is_active']

In [10]:
customer_filtered = customer_rdd.filter(lambda row: 'Mumbai' in row)
customer_mapped = customer_filtered.map(lambda row: (row.split(',')[0],1))
customer_reduced = customer_mapped.reduceByKey(lambda a, b: a+b)

In [11]:
customer_reduced.count()

                                                                                

2142

In [12]:
customer_reduced.count()

2142

In [13]:
customer_reduced.cache()

PythonRDD[9] at RDD at PythonRDD.scala:53

In [14]:
customer_reduced.count()

2142

In [15]:
customer_reduced.unpersist()

PythonRDD[9] at RDD at PythonRDD.scala:53

### Cache in DF

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Spark_Cache_DF').getOrCreate()
spark

25/08/27 14:55:13 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
customers_df = spark.read.option('header', 'true').csv('/tmp/customers_1mb.csv')
customers_df.show(2)

                                                                                

+-----------+----------+---------+-----------+-------+-----------------+---------+
|customer_id|      name|     city|      state|country|registration_date|is_active|
+-----------+----------+---------+-----------+-------+-----------------+---------+
|          0|Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    False|
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     True|
+-----------+----------+---------+-----------+-------+-----------------+---------+
only showing top 2 rows



In [3]:
customers_df.rdd.getNumPartitions()

1

In [4]:
customers_df.cache()

DataFrame[customer_id: string, name: string, city: string, state: string, country: string, registration_date: string, is_active: string]

In [5]:
customers_df.show(2)

+-----------+----------+---------+-----------+-------+-----------------+---------+
|customer_id|      name|     city|      state|country|registration_date|is_active|
+-----------+----------+---------+-----------+-------+-----------------+---------+
|          0|Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    False|
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     True|
+-----------+----------+---------+-----------+-------+-----------------+---------+
only showing top 2 rows



In [6]:
tail_df = customers_df.orderBy('customer_id', ascending=False)

In [8]:
tail_df.show(2)

+-----------+-------------+---------+---------+-------+-----------------+---------+
|customer_id|         name|     city|    state|country|registration_date|is_active|
+-----------+-------------+---------+---------+-------+-----------------+---------+
|       9999|Customer_9999|Hyderabad|Karnataka|  India|       2023-06-02|    False|
|       9998|Customer_9998|     Pune|Telangana|  India|       2023-01-27|    False|
+-----------+-------------+---------+---------+-------+-----------------+---------+
only showing top 2 rows



In [9]:
customers_df.count()

17653

In [10]:
customers_df.unpersist()

DataFrame[customer_id: string, name: string, city: string, state: string, country: string, registration_date: string, is_active: string]

In [13]:
spark.stop()