# Customer Data Processing

In [3]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise create it under this app name.
session_builder = SparkSession.builder.appName('CustomerDataProcessing')
spark = session_builder.getOrCreate()

# Bare last expression so the notebook displays the session object.
spark

In [5]:
# List the contents of the HDFS /tmp/ directory with human-readable file sizes (-h).
!hadoop fs -ls -h /tmp/

Found 8 items
drwxr-xr-x   - root            hadoop          0 2025-08-22 17:03 /tmp/active_cities.csv
-rw-r--r--   2 sekar_dhana8644 hadoop     10.0 M 2025-08-21 18:37 /tmp/customers_10mb.csv
-rw-r--r--   2 sekar_dhana8644 hadoop      1.0 M 2025-08-21 18:37 /tmp/customers_1mb.csv
-rw-r--r--   2 sekar_dhana8644 hadoop      5.4 K 2025-08-21 18:40 /tmp/first_100_customers.csv
drwxrwxrwt   - hdfs            hadoop          0 2025-08-13 21:45 /tmp/hadoop-yarn
drwx-wx-wx   - hive            hadoop          0 2025-08-13 21:45 /tmp/hive
-rw-r--r--   2 root            hadoop         83 2025-08-19 13:33 /tmp/input.txt
-rw-r--r--   2 sekar_dhana8644 hadoop    843.1 K 2025-08-28 20:55 /tmp/orders.csv


In [7]:
# Load the 1 MB customer extract, using the first CSV line as the column names.
# No schema is supplied or inferred, so every column arrives as a string.
customers_csv_path = '/tmp/customers_1mb.csv'
customer_df = spark.read.csv(customers_csv_path, header=True)
customer_df.show(n=5)

+-----------+----------+---------+-----------+-------+-----------------+---------+
|customer_id|      name|     city|      state|country|registration_date|is_active|
+-----------+----------+---------+-----------+-------+-----------------+---------+
|          0|Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    False|
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     True|
|          2|Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     True|
|          3|Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    False|
|          4|Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    False|
+-----------+----------+---------+-----------+-------+-----------------+---------+
only showing top 5 rows



In [9]:
# Inspect the column types as loaded from the CSV.
customer_df.printSchema() 

root
 |-- customer_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: string (nullable = true)
 |-- is_active: string (nullable = true)



In [18]:
# NOTE(review): these wildcard imports bring every public pyspark.sql.functions and
# pyspark.sql.types name into the notebook namespace. A later cell calls min(...) and
# max(...) on a column name, which relies on the Spark aggregates replacing the Python
# builtins of the same name.
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [19]:
# Replace the string-typed CSV columns with proper types: date, boolean and integer.
parsed_registration_date = to_date(col('registration_date'), 'yyyy-MM-dd')
active_flag = col('is_active').cast(BooleanType())
numeric_customer_id = col('customer_id').cast('int')

customer_df = (
    customer_df
    .withColumn('registration_date', parsed_registration_date)
    .withColumn('is_active', active_flag)
    .withColumn('customer_id', numeric_customer_id)
)

In [20]:
# Re-check the schema after the casts to customer_id, registration_date and is_active.
customer_df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: date (nullable = true)
 |-- is_active: boolean (nullable = true)



In [21]:
# Substitute the placeholder 'Unknown' for missing values in the descriptive text columns.
text_columns = ['name', 'city', 'state', 'country']
customer_df = customer_df.na.fill('Unknown', subset=text_columns)

In [23]:
# Derive the calendar year and month of registration as separate columns.
registered_on = col('registration_date')
customer_df = (
    customer_df
    .withColumn('registration_year', year(registered_on))
    .withColumn('registration_month', month(registered_on))
)

In [24]:
# Preview two rows to check the new registration_year / registration_month columns.
customer_df.show(2)

+-----------+----------+---------+-----------+-------+-----------------+---------+-----------------+------------------+
|customer_id|      name|     city|      state|country|registration_date|is_active|registration_year|registration_month|
+-----------+----------+---------+-----------+-------+-----------------+---------+-----------------+------------------+
|          0|Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|             2023|                 6|
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|             2023|                12|
+-----------+----------+---------+-----------+-------+-----------------+---------+-----------------+------------------+
only showing top 2 rows



In [26]:
# Count the distinct city values; the count is the first field of the first collected row.
distinct_city_count = countDistinct(col('city'))
unique_cities = customer_df.agg(distinct_city_count).collect()
unique_cities[0][0]

8

In [27]:
# Count the distinct state values; the count is the first field of the first collected row.
distinct_state_count = countDistinct(col('state'))
unique_states = customer_df.agg(distinct_state_count).collect()
unique_states[0][0]

7

In [28]:
# Count the distinct country values; the count is the first field of the first collected row.
distinct_country_count = countDistinct(col('country'))
unique_countries = customer_df.agg(distinct_country_count).collect()
unique_countries[0][0]

1

In [30]:
# Customers per city, largest group first.
customers_per_city = customer_df.groupBy('city').count()
customers_per_city.orderBy('count', ascending=False).show()

+---------+-----+
|     city|count|
+---------+-----+
|     Pune| 2243|
|Hyderabad| 2242|
|  Kolkata| 2223|
|Bangalore| 2211|
|    Delhi| 2200|
|Ahmedabad| 2198|
|  Chennai| 2194|
|   Mumbai| 2142|
+---------+-----+



In [33]:
# Customers per (state, country) pair, largest group first.
customers_per_state = customer_df.groupBy('state', 'country').count()
customers_per_state.orderBy('count', ascending=False).show()

+-----------+-------+-----+
|      state|country|count|
+-----------+-------+-----+
|      Delhi|  India| 2578|
|    Gujarat|  India| 2543|
| Tamil Nadu|  India| 2536|
|  Telangana|  India| 2520|
|West Bengal|  India| 2503|
|Maharashtra|  India| 2490|
|  Karnataka|  India| 2483|
+-----------+-------+-----+



In [35]:
# Count of active and inactive customers per state.
#
# The pivot values are listed explicitly: without them Spark first runs an extra job
# over the data just to discover the distinct is_active values. The resulting columns
# are still named 'false' and 'true'.
# NOTE(review): rows whose is_active is null would not get a column of their own here;
# the output captured below shows only false/true, so none are expected — confirm if
# the input data changes.
customer_df.groupBy('state').pivot('is_active', [False, True]).count().show()

+-----------+-----+----+
|      state|false|true|
+-----------+-----+----+
|    Gujarat| 1211|1332|
|      Delhi| 1356|1222|
|  Karnataka| 1207|1276|
|  Telangana| 1294|1226|
|Maharashtra| 1260|1230|
| Tamil Nadu| 1284|1252|
|West Bengal| 1306|1197|
+-----------+-----+----+



In [40]:
from pyspark.sql.window import Window

In [44]:
# One window per state, ordered from the most recent registration date to the oldest.
newest_registration_first = col('registration_date').desc()
window_spec = Window.partitionBy('state').orderBy(newest_registration_first)

In [45]:
# Add per-state recency rankings.
#
# rank and dense_rank are tie-aware, so the date-only ordering of window_spec is enough
# for them. row_number has to give every row a distinct position, and many customers
# share a registration date, so with the date-only ordering the position inside a tie
# is arbitrary and may differ between runs. Break ties on customer_id so the numbering
# is reproducible (customer_id is presumably unique per row — confirm against the source data).
# WindowSpec.orderBy replaces only the ordering; the partitioning by state is kept.
row_number_window = window_spec.orderBy(
    col('registration_date').desc(),
    col('customer_id').asc(),
)

customer_df = (
    customer_df
    .withColumn('rank', rank().over(window_spec))
    .withColumn('dense_rank', dense_rank().over(window_spec))
    .withColumn('row_number', row_number().over(row_number_window))
)

In [47]:
# Preview the first 15 rows, now including the rank, dense_rank and row_number columns.
customer_df.show(15)

+-----------+--------------+---------+-----+-------+-----------------+---------+-----------------+------------------+----+----------+----------+
|customer_id|          name|     city|state|country|registration_date|is_active|registration_year|registration_month|rank|dense_rank|row_number|
+-----------+--------------+---------+-----+-------+-----------------+---------+-----------------+------------------+----+----------+----------+
|         61|   Customer_61|Hyderabad|Delhi|  India|       2023-12-31|    false|             2023|                12|   1|         1|         1|
|        501|  Customer_501|   Mumbai|Delhi|  India|       2023-12-31|    false|             2023|                12|   1|         1|         2|
|       2763| Customer_2763|     Pune|Delhi|  India|       2023-12-31|     true|             2023|                12|   1|         1|         3|
|      12858|Customer_12858|Ahmedabad|Delhi|  India|       2023-12-31|     true|             2023|                12|   1|        

In [50]:
# Customers who registered on or after 1 July 2023, and how many there are.
registered_since_july = col('registration_date') >= '2023-07-01'
recent_customers = customer_df.where(registered_since_july)
recent_customers.count()

9025

In [51]:
# Total number of customer rows.
customer_df.count()

17653

In [52]:
# Earliest and latest registration date among the customers of each city.
# min/max here are the Spark aggregates brought in by the wildcard import,
# not the Python builtins.
signup_date = col('registration_date')
customer_df.groupBy('city').agg(
    min(signup_date).alias('oldest'),
    max(signup_date).alias('newest'),
).show()

+---------+----------+----------+
|     city|    oldest|    newest|
+---------+----------+----------+
|    Delhi|2023-01-01|2023-12-31|
|  Kolkata|2023-01-01|2023-12-31|
|Hyderabad|2023-01-01|2023-12-31|
|Bangalore|2023-01-01|2023-12-31|
|Ahmedabad|2023-01-01|2023-12-31|
|  Chennai|2023-01-01|2023-12-31|
|   Mumbai|2023-01-01|2023-12-31|
|     Pune|2023-01-01|2023-12-31|
+---------+----------+----------+



In [53]:
# Persist the processed customers as Parquet, replacing any existing output at this path.
output_file_path = '/tmp/tables/processed_customers'
customer_df.write.parquet(output_file_path, mode='overwrite')

                                                                                

In [57]:
# List the HDFS /tmp/tables directory to check that the output directory was written.
!hadoop fs -ls -h /tmp/tables

Found 1 items
drwxr-xr-x   - root hadoop          0 2025-08-28 21:51 /tmp/tables/processed_customers
