# Customer Data Analysis

In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName('CustomerDataProcessing')
    .getOrCreate()
)

# Echo the session object as the cell result.
spark

25/08/29 18:47:25 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
# List the HDFS /tmp/ directory with human-readable sizes (-h); the CSV inputs loaded below live here.
!hadoop fs -ls -h /tmp/

Found 9 items
drwxr-xr-x   - root            hadoop          0 2025-08-22 17:03 /tmp/active_cities.csv
-rw-r--r--   2 sekar_dhana8644 hadoop     10.0 M 2025-08-21 18:37 /tmp/customers_10mb.csv
-rw-r--r--   2 sekar_dhana8644 hadoop      1.0 M 2025-08-21 18:37 /tmp/customers_1mb.csv
-rw-r--r--   2 sekar_dhana8644 hadoop      5.4 K 2025-08-21 18:40 /tmp/first_100_customers.csv
drwxrwxrwt   - hdfs            hadoop          0 2025-08-13 21:45 /tmp/hadoop-yarn
drwx-wx-wx   - hive            hadoop          0 2025-08-13 21:45 /tmp/hive
-rw-r--r--   2 root            hadoop         83 2025-08-19 13:33 /tmp/input.txt
-rw-r--r--   2 sekar_dhana8644 hadoop    843.1 K 2025-08-28 20:55 /tmp/orders.csv
drwxr-xr-x   - root            hadoop          0 2025-08-29 18:46 /tmp/tables


In [3]:
# Read the customer extract. The first line is used as the header; no schema is
# inferred, so every column is loaded as a string and cast explicitly later.
customer_df = (
    spark.read
    .option('header', 'true')
    .csv('/tmp/customers_1mb.csv')
)
customer_df.show(5)

                                                                                

+-----------+----------+---------+-----------+-------+-----------------+---------+
|customer_id|      name|     city|      state|country|registration_date|is_active|
+-----------+----------+---------+-----------+-------+-----------------+---------+
|          0|Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    False|
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     True|
|          2|Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     True|
|          3|Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    False|
|          4|Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    False|
+-----------+----------+---------+-----------+-------+-----------------+---------+
only showing top 5 rows



In [4]:
# Show the column types exactly as loaded from the CSV (before any casting).
customer_df.printSchema() 

root
 |-- customer_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: string (nullable = true)
 |-- is_active: string (nullable = true)



In [5]:
# NOTE: these wildcard imports bring Spark's column functions into the notebook
# namespace under names that shadow Python builtins (e.g. min, max, sum, round).
# Later cells rely on that: calls such as min('registration_date') or
# round('sum(total_amount)', 2) are the Spark versions, not the builtins.
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [6]:
# Give the string columns their real types:
#   customer_id       -> integer
#   registration_date -> date (parsed as yyyy-MM-dd)
#   is_active         -> boolean
customer_df = (
    customer_df
    .withColumn('customer_id', col('customer_id').cast(IntegerType()))
    .withColumn('registration_date', to_date(col('registration_date'), 'yyyy-MM-dd'))
    .withColumn('is_active', col('is_active').cast('boolean'))
)

In [7]:
# Confirm the casts took effect (customer_id, registration_date and is_active should no longer be strings).
customer_df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: date (nullable = true)
 |-- is_active: boolean (nullable = true)



In [8]:
# Replace missing name/city/state/country values with the placeholder 'Unknown'.
customer_df = customer_df.na.fill(
    {column: 'Unknown' for column in ('name', 'city', 'state', 'country')}
)

In [9]:
# Derive the calendar year and month of registration as separate columns.
customer_df = (
    customer_df
    .withColumn('registration_year', year(col('registration_date')))
    .withColumn('registration_month', month(col('registration_date')))
)

In [10]:
# Preview two rows to check the new registration_year / registration_month columns.
customer_df.show(2)

[Stage 2:>                                                          (0 + 1) / 1]

+-----------+----------+---------+-----------+-------+-----------------+---------+-----------------+------------------+
|customer_id|      name|     city|      state|country|registration_date|is_active|registration_year|registration_month|
+-----------+----------+---------+-----------+-------+-----------------+---------+-----------------+------------------+
|          0|Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|             2023|                 6|
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|             2023|                12|
+-----------+----------+---------+-----------+-------+-----------------+---------+-----------------+------------------+
only showing top 2 rows



                                                                                

In [11]:
# Distinct city values present in the customer data.
unique_cities = customer_df.select(col('city')).distinct()
unique_cities.show()

+---------+
|     city|
+---------+
|    Delhi|
|  Kolkata|
|Hyderabad|
|Bangalore|
|Ahmedabad|
|  Chennai|
|   Mumbai|
|     Pune|
+---------+



In [12]:
# Distinct state values present in the customer data.
unique_states = customer_df.select(col('state')).distinct()
unique_states.show()

+-----------+
|      state|
+-----------+
|    Gujarat|
|      Delhi|
|  Karnataka|
|  Telangana|
|Maharashtra|
| Tamil Nadu|
|West Bengal|
+-----------+



In [13]:
# Distinct country values present in the customer data.
unique_countries = customer_df.select(col('country')).distinct()
unique_countries.show()

+-------+
|country|
+-------+
|  India|
+-------+



In [14]:
# Number of customers in each city, largest first.
(
    customer_df
    .groupBy('city')
    .count()
    .orderBy('count', ascending=False)
    .show()
)

+---------+-----+
|     city|count|
+---------+-----+
|     Pune| 2243|
|Hyderabad| 2242|
|  Kolkata| 2223|
|Bangalore| 2211|
|    Delhi| 2200|
|Ahmedabad| 2198|
|  Chennai| 2194|
|   Mumbai| 2142|
+---------+-----+



In [15]:
# Number of customers for each (state, country) pair, largest first.
(
    customer_df
    .groupBy('state', 'country')
    .count()
    .orderBy('count', ascending=False)
    .show()
)

+-----------+-------+-----+
|      state|country|count|
+-----------+-------+-----+
|      Delhi|  India| 2578|
|    Gujarat|  India| 2543|
| Tamil Nadu|  India| 2536|
|  Telangana|  India| 2520|
|West Bengal|  India| 2503|
|Maharashtra|  India| 2490|
|  Karnataka|  India| 2483|
+-----------+-------+-----+



In [16]:
# Count of active and inactive customers per state.
# Pivoting on is_active yields one count column per distinct flag value.

(
    customer_df
    .groupBy('state')
    .pivot('is_active')
    .count()
    .show()
)

+-----------+-----+----+
|      state|false|true|
+-----------+-----+----+
|    Gujarat| 1211|1332|
|      Delhi| 1356|1222|
|  Karnataka| 1207|1276|
|  Telangana| 1294|1226|
|Maharashtra| 1260|1230|
| Tamil Nadu| 1284|1252|
|West Bengal| 1306|1197|
+-----------+-----+----+



In [17]:
from pyspark.sql.window import Window

In [18]:
# Per-state window, newest registration first; used for the ranking columns below.
window_spec = (
    Window
    .partitionBy('state')
    .orderBy(col('registration_date').desc())
)

In [19]:
# Rank customers within each state from newest to oldest registration.
#
# rank() and dense_rank() give tied rows the same value, so they are
# deterministic with window_spec (ordered by registration_date only).
# row_number() is not: many customers share a registration_date, so the
# numbering among tied rows could change from run to run, and these columns
# are later written to Parquet and carried into the customer/order join.
# Break ties on customer_id so row_number is reproducible.
# NOTE(review): assumes customer_id is unique per customer; confirm.
row_number_window = Window.partitionBy('state').orderBy(
    col('registration_date').desc(),
    col('customer_id').asc(),
)

customer_df = (
    customer_df
    .withColumn('rank', rank().over(window_spec))
    .withColumn('dense_rank', dense_rank().over(window_spec))
    .withColumn('row_number', row_number().over(row_number_window))
)

In [20]:
# Preview the rank, dense_rank and row_number columns added in the previous cell.
customer_df.show(15)

+-----------+--------------+---------+-----+-------+-----------------+---------+-----------------+------------------+----+----------+----------+
|customer_id|          name|     city|state|country|registration_date|is_active|registration_year|registration_month|rank|dense_rank|row_number|
+-----------+--------------+---------+-----+-------+-----------------+---------+-----------------+------------------+----+----------+----------+
|         61|   Customer_61|Hyderabad|Delhi|  India|       2023-12-31|    false|             2023|                12|   1|         1|         1|
|        501|  Customer_501|   Mumbai|Delhi|  India|       2023-12-31|    false|             2023|                12|   1|         1|         2|
|       2763| Customer_2763|     Pune|Delhi|  India|       2023-12-31|     true|             2023|                12|   1|         1|         3|
|      12858|Customer_12858|Ahmedabad|Delhi|  India|       2023-12-31|     true|             2023|                12|   1|        

In [21]:
# Customers who registered on or after 2023-07-01, and how many there are.
recent_customers = customer_df.where(col('registration_date') >= ('2023-07-01'))
recent_customers.count()

9025

In [22]:
# Total number of customer rows, for comparison with the recent-customer count.
customer_df.count()

17653

In [23]:
# Earliest ('oldest') and latest ('newest') registration date in each city.
# min/max here are the Spark aggregate functions from the wildcard import,
# not the Python builtins.

(
    customer_df
    .groupBy('city')
    .agg(
        min('registration_date').alias('oldest'),
        max('registration_date').alias('newest'),
    )
    .show()
)

+---------+----------+----------+
|     city|    oldest|    newest|
+---------+----------+----------+
|    Delhi|2023-01-01|2023-12-31|
|  Kolkata|2023-01-01|2023-12-31|
|Hyderabad|2023-01-01|2023-12-31|
|Bangalore|2023-01-01|2023-12-31|
|Ahmedabad|2023-01-01|2023-12-31|
|  Chennai|2023-01-01|2023-12-31|
|   Mumbai|2023-01-01|2023-12-31|
|     Pune|2023-01-01|2023-12-31|
+---------+----------+----------+



In [24]:
# Persist the processed customer table as Parquet, replacing any existing output at this path.
output_file_path = '/data/tables/processed_customers'
customer_df.write.parquet(output_file_path, mode='overwrite')

                                                                                

In [47]:
# List the output directory to confirm the Parquet write.
# NOTE(review): this cell's execution count (47) is out of sequence, and its saved
# output already lists final_customer_orders, which is only written near the end
# of the notebook. Re-run the notebook top to bottom so the output matches this position.
!hadoop fs -ls -h /data/tables

Found 2 items
drwxr-xr-x   - root hadoop          0 2025-08-29 18:49 /data/tables/final_customer_orders
drwxr-xr-x   - root hadoop          0 2025-08-29 18:47 /data/tables/processed_customers


# Order Data Analysis

In [26]:
# List the HDFS /tmp/ directory again (sizes in bytes this time) to locate orders.csv.
!hadoop fs -ls /tmp/

Found 9 items
drwxr-xr-x   - root            hadoop          0 2025-08-22 17:03 /tmp/active_cities.csv
-rw-r--r--   2 sekar_dhana8644 hadoop   10528211 2025-08-21 18:37 /tmp/customers_10mb.csv
-rw-r--r--   2 sekar_dhana8644 hadoop    1060750 2025-08-21 18:37 /tmp/customers_1mb.csv
-rw-r--r--   2 sekar_dhana8644 hadoop       5488 2025-08-21 18:40 /tmp/first_100_customers.csv
drwxrwxrwt   - hdfs            hadoop          0 2025-08-13 21:45 /tmp/hadoop-yarn
drwx-wx-wx   - hive            hadoop          0 2025-08-13 21:45 /tmp/hive
-rw-r--r--   2 root            hadoop         83 2025-08-19 13:33 /tmp/input.txt
-rw-r--r--   2 sekar_dhana8644 hadoop     863301 2025-08-28 20:55 /tmp/orders.csv
drwxr-xr-x   - root            hadoop          0 2025-08-29 18:46 /tmp/tables


In [27]:
# Read the orders extract with a header row, letting Spark infer column types.
order_df = spark.read.csv('/tmp/orders.csv', header=True, inferSchema=True)
order_df.show(5)

                                                                                

+--------+-----------+----------+-----------------+---------+
|order_id|customer_id|order_date|     total_amount|   status|
+--------+-----------+----------+-----------------+---------+
|       0|       3692|2024-09-03|547.7160076008001|  Shipped|
|       1|      11055|2024-08-10|577.8942599188381|  Pending|
|       2|       6963|2024-08-22|484.2085562764487|  Pending|
|       3|      13268|2024-09-01|366.3286882431848|Cancelled|
|       4|       1131|2024-08-09|896.9588380686909|  Pending|
+--------+-----------+----------+-----------------+---------+
only showing top 5 rows



In [28]:
# Check the column types that schema inference produced for the orders data.
order_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- status: string (nullable = true)



In [29]:
# Distinct order status values.
unique_status = order_df.select(col('status')).distinct()
unique_status.show()

+---------+
|   status|
+---------+
|Cancelled|
|Delivered|
|  Shipped|
|  Pending|
+---------+



In [30]:
# Number of orders in each status.
(
    order_df
    .groupBy('status')
    .count()
    .show()
)

+---------+-----+
|   status|count|
+---------+-----+
|Cancelled| 4469|
|Delivered| 4341|
|  Shipped| 4386|
|  Pending| 4457|
+---------+-----+



In [32]:
# Derive the calendar year and month of each order as separate columns.
order_df = (
    order_df
    .withColumn('order_year', year(col('order_date')))
    .withColumn('order_month', month(col('order_date')))
)
order_df.show(5)

+--------+-----------+----------+-----------------+---------+----------+-----------+
|order_id|customer_id|order_date|     total_amount|   status|order_year|order_month|
+--------+-----------+----------+-----------------+---------+----------+-----------+
|       0|       3692|2024-09-03|547.7160076008001|  Shipped|      2024|          9|
|       1|      11055|2024-08-10|577.8942599188381|  Pending|      2024|          8|
|       2|       6963|2024-08-22|484.2085562764487|  Pending|      2024|          8|
|       3|      13268|2024-09-01|366.3286882431848|Cancelled|      2024|          9|
|       4|       1131|2024-08-09|896.9588380686909|  Pending|      2024|          8|
+--------+-----------+----------+-----------------+---------+----------+-----------+
only showing top 5 rows



In [33]:
# Number of orders per calendar month, in month order.
# NOTE(review): grouping is by month number only; if the data spans more than
# one year, the same month of different years is combined. Confirm the range.
(
    order_df
    .groupBy('order_month')
    .count()
    .orderBy('order_month')
    .show()
)

+-----------+-----+
|order_month|count|
+-----------+-----+
|          1| 1499|
|          2| 1368|
|          3| 1539|
|          4| 1457|
|          5| 1518|
|          6| 1455|
|          7| 1517|
|          8| 1472|
|          9| 1446|
|         10| 1513|
|         11| 1426|
|         12| 1443|
+-----------+-----+



In [34]:
# Total order amount per month, rounded to 2 decimals, in month order.
# sum/round are the Spark functions from the wildcard import; the alias keeps
# the default 'sum(total_amount)' column name.
(
    order_df
    .groupBy('order_month')
    .agg(round(sum('total_amount'), 2).alias('sum(total_amount)'))
    .orderBy('order_month')
    .show()
)

+-----------+-----------------+
|order_month|sum(total_amount)|
+-----------+-----------------+
|          1|        751287.98|
|          2|        670368.38|
|          3|        774089.94|
|          4|        721995.67|
|          5|        759871.48|
|          6|        750097.65|
|          7|        765183.34|
|          8|        744140.31|
|          9|         728613.1|
|         10|        777432.36|
|         11|        718729.46|
|         12|         722282.5|
+-----------+-----------------+



In [35]:
# Average order amount per month, rounded to 2 decimals, in month order.
# The alias keeps the default 'avg(total_amount)' column name.
(
    order_df
    .groupBy('order_month')
    .agg(round(avg('total_amount'), 2).alias('avg(total_amount)'))
    .orderBy('order_month')
    .show()
)

+-----------+-----------------+
|order_month|avg(total_amount)|
+-----------+-----------------+
|          1|           501.19|
|          2|           490.04|
|          3|           502.98|
|          4|           495.54|
|          5|           500.57|
|          6|           515.53|
|          7|           504.41|
|          8|           505.53|
|          9|           503.88|
|         10|           513.84|
|         11|           504.02|
|         12|           500.54|
+-----------+-----------------+



In [36]:
# Total order amount per status, rounded to 2 decimals.
# The alias keeps the default 'sum(total_amount)' column name.
(
    order_df
    .groupBy('status')
    .agg(round(sum('total_amount'), 2).alias('sum(total_amount)'))
    .show()
)

+---------+-----------------+
|   status|sum(total_amount)|
+---------+-----------------+
|Cancelled|       2237958.01|
|Delivered|       2210383.53|
|  Shipped|        2188889.0|
|  Pending|       2246861.63|
+---------+-----------------+



In [37]:
# Number of orders for every (month, status) combination, ordered by month.
status_count_per_month = (
    order_df
    .groupBy('order_month', 'status')
    .agg(count('*').alias('status_count'))
    .orderBy('order_month')
)
status_count_per_month.show()

+-----------+---------+------------+
|order_month|   status|status_count|
+-----------+---------+------------+
|          1|Cancelled|         369|
|          1|  Pending|         383|
|          1|  Shipped|         368|
|          1|Delivered|         379|
|          2|Cancelled|         345|
|          2|  Pending|         328|
|          2|  Shipped|         356|
|          2|Delivered|         339|
|          3|  Shipped|         384|
|          3|Cancelled|         395|
|          3|  Pending|         399|
|          3|Delivered|         361|
|          4|  Pending|         368|
|          4|Cancelled|         369|
|          4|Delivered|         378|
|          4|  Shipped|         342|
|          5|Cancelled|         375|
|          5|Delivered|         405|
|          5|  Shipped|         384|
|          5|  Pending|         354|
+-----------+---------+------------+
only showing top 20 rows



In [38]:
# Same counts as a wide table: one row per month, one column per status.
(
    order_df
    .groupBy('order_month')
    .pivot('status')
    .count()
    .orderBy('order_month')
    .show()
)

+-----------+---------+---------+-------+-------+
|order_month|Cancelled|Delivered|Pending|Shipped|
+-----------+---------+---------+-------+-------+
|          1|      369|      379|    383|    368|
|          2|      345|      339|    328|    356|
|          3|      395|      361|    399|    384|
|          4|      369|      378|    368|    342|
|          5|      375|      405|    354|    384|
|          6|      374|      350|    385|    346|
|          7|      399|      382|    383|    353|
|          8|      386|      356|    362|    368|
|          9|      351|      333|    370|    392|
|         10|      391|      365|    381|    376|
|         11|      376|      332|    361|    357|
|         12|      339|      361|    383|    360|
+-----------+---------+---------+-------+-------+



# Joining and Analysing Customers and Orders Data

In [39]:
# Inner-join customers to their orders on customer_id; customers without orders
# and orders without a matching customer are dropped.
customer_order_df = customer_df.join(order_df, on='customer_id', how='inner')
customer_order_df.limit(5).toPandas()

Unnamed: 0,customer_id,name,city,state,country,registration_date,is_active,registration_year,registration_month,rank,dense_rank,row_number,order_id,order_date,total_amount,status,order_year,order_month
0,61,Customer_61,Hyderabad,Delhi,India,2023-12-31,False,2023,12,1,1,1,17283,2024-01-21,171.916947,Pending,2024,1
1,61,Customer_61,Hyderabad,Delhi,India,2023-12-31,False,2023,12,1,1,1,17058,2024-08-25,605.956729,Pending,2024,8
2,61,Customer_61,Hyderabad,Delhi,India,2023-12-31,False,2023,12,1,1,1,13508,2024-05-29,492.687016,Pending,2024,5
3,61,Customer_61,Hyderabad,Delhi,India,2023-12-31,False,2023,12,1,1,1,2610,2024-07-09,535.024579,Delivered,2024,7
4,501,Customer_501,Mumbai,Delhi,India,2023-12-31,False,2023,12,1,1,2,16373,2024-10-29,944.536265,Pending,2024,10


In [40]:
# Number of orders per customer (joined rows per customer_id), most orders first.

(
    customer_order_df
    .groupBy('customer_id')
    .count()
    .orderBy('count', ascending=False)
    .show(10)
)

+-----------+-----+
|customer_id|count|
+-----------+-----+
|      11776|    7|
|       4294|    6|
|       3884|    6|
|       7566|    6|
|       3336|    6|
|       5160|    6|
|      13034|    6|
|      14838|    6|
|       3243|    6|
|      11537|    5|
+-----------+-----+
only showing top 10 rows



In [41]:
# Order count and total amount spent per customer, highest total spend first.
# Showing both columns side by side lets the reader spot customers who spend a
# lot across few orders; no filtering on order frequency is applied here.

customer_total_order_spent = (
    customer_order_df
    .groupBy('customer_id')
    .agg(
        count('*').alias('count'),
        round(sum('total_amount'), 2).alias('total_amount'),
    )
    .orderBy(col('total_amount').desc())
)
customer_total_order_spent.show(10)

+-----------+-----+------------+
|customer_id|count|total_amount|
+-----------+-----+------------+
|       3336|    6|     4362.55|
|       3884|    6|      4188.0|
|      16020|    5|     3967.27|
|      14372|    5|     3961.79|
|      14933|    5|     3828.58|
|       7566|    6|     3647.12|
|      10559|    5|     3548.84|
|      11776|    7|     3438.37|
|      11449|    4|     3396.06|
|       5425|    5|     3389.16|
+-----------+-----+------------+
only showing top 10 rows



In [42]:
# Order count and average order amount per customer,
# sorted by most orders first, then by highest average amount.

(
    customer_order_df
    .groupBy('customer_id')
    .agg(
        count('*').alias('count'),
        round(avg('total_amount'), 2).alias('avg_amount'),
    )
    .orderBy(['count', 'avg_amount'], ascending=[False, False])
    .show(10)
)

+-----------+-----+----------+
|customer_id|count|avg_amount|
+-----------+-----+----------+
|      11776|    7|     491.2|
|       3336|    6|    727.09|
|       3884|    6|     698.0|
|       7566|    6|    607.85|
|      13034|    6|     532.5|
|      14838|    6|    482.39|
|       3243|    6|     476.7|
|       4294|    6|     303.6|
|       5160|    6|    276.12|
|      16020|    5|    793.45|
+-----------+-----+----------+
only showing top 10 rows



In [43]:
# Dense-rank customers by total spend across the whole dataset.
#
# The window gets its own name instead of rebinding `window_spec`: that name
# holds the per-state registration window used by the customer ranking cell,
# and overwriting it here would break that cell if it were re-run afterwards
# (customer_df has no total_amount column).
#
# A global ranking has no partition key, so Spark moves all rows to a single
# partition and logs a WindowExec warning.
total_spend_window = Window.orderBy(col('total_amount').desc())

ranked_customer = customer_total_order_spent.withColumn(
    'dense_rank', dense_rank().over(total_spend_window)
)
ranked_customer.show(10)

25/08/29 18:49:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 18:49:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 18:49:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 18:49:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 18:49:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-----------+-----+------------+----------+
|customer_id|count|total_amount|dense_rank|
+-----------+-----+------------+----------+
|       3336|    6|     4362.55|         1|
|       3884|    6|      4188.0|         2|
|      16020|    5|     3967.27|         3|
|      14372|    5|     3961.79|         4|
|      14933|    5|     3828.58|         5|
|       7566|    6|     3647.12|         6|
|      10559|    5|     3548.84|         7|
|      11776|    7|     3438.37|         8|
|      11449|    4|     3396.06|         9|
|       5425|    5|     3389.16|        10|
+-----------+-----+------------+----------+
only showing top 10 rows



25/08/29 18:49:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 18:49:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 18:49:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [44]:
# List the HDFS /data/ directory, which contains the tables/ output location.
!hadoop fs -ls /data/

Found 7 items
-rw-r--r--   2 root hadoop       5488 2025-08-23 17:19 /data/customers_100.csv
-rw-r--r--   2 root hadoop        280 2025-08-24 17:21 /data/dates_data.csv
drwxr-xr-x   - root hadoop          0 2025-08-25 16:48 /data/external_data
drwxr-xr-x   - root hadoop          0 2025-08-29 18:47 /data/tables
drwxr-xr-x   - root hadoop          0 2025-08-23 19:30 /data/write_output.csv
drwxr-xr-x   - root hadoop          0 2025-08-23 19:46 /data/write_output_delimiter.csv
drwxr-xr-x   - root hadoop          0 2025-08-23 19:42 /data/write_output_repartition.csv


In [45]:
# Persist the joined customer/order table as Parquet, replacing any existing output at this path.
output_file_path = '/data/tables/final_customer_orders'
customer_order_df.write.parquet(output_file_path, mode='overwrite')

In [46]:
# Confirm both output tables now exist under /data/tables/.
!hadoop fs -ls /data/tables/

Found 2 items
drwxr-xr-x   - root hadoop          0 2025-08-29 18:49 /data/tables/final_customer_orders
drwxr-xr-x   - root hadoop          0 2025-08-29 18:47 /data/tables/processed_customers
