<a href="https://colab.research.google.com/github/BHARATH077/ETL_Customer_Behavior/blob/main/ETL_Customer_Behavior.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ETL Pipeline for Customer Behavior Analytics

# Data Ingestion

In [2]:
# Install Dependencies
# Install Spark + DuckDB
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!pip install pyspark duckdb pandas seaborn matplotlib




In [3]:
# Verify Spark and DuckBD

import pyspark
from pyspark.sql import SparkSession
import duckdb

# Start Spark session
spark = SparkSession.builder.appName("ETL_Project").getOrCreate()
print("Spark Version:", spark.version)

# Connect to DuckDB (in-memory for now)
con = duckdb.connect(":memory:")
print("DuckDB connected:", con)


Spark Version: 3.5.1
DuckDB connected: <duckdb.duckdb.DuckDBPyConnection object at 0x7bc8ca713db0>


In [5]:
# Prepare Sample Data
# Since we don’t have real APIs/DBs yet, I’ll mock some CSV & JSON files directly in Colab (later we can replace them).

import pandas as pd
import json

# Mock Clickstream CSV
clickstream_data = {
    "session_id": [1,2,3,4,5],
    "customer_id": [101,102,101,103,104],
    "page_viewed": ["Home","Product","Cart","Home","Checkout"],
    "timestamp": ["2024-01-01 10:00","2024-01-01 10:05","2024-01-01 10:10","2024-01-02 12:00","2024-01-02 12:15"]
}
pd.DataFrame(clickstream_data).to_csv("clickstream.csv", index=False)

# Mock Transactions CSV
transactions_data = {
    "transaction_id": [1001,1002,1003],
    "customer_id": [101,102,104],
    "amount": [250.0, 100.0, 75.0],
    "timestamp": ["2024-01-01 11:00","2024-01-01 11:05","2024-01-02 12:30"]
}
pd.DataFrame(transactions_data).to_csv("transactions.csv", index=False)

# Mock CRM JSON
crm_data = [
    {"customer_id":101, "name":"Alice","segment":"Premium"},
    {"customer_id":102, "name":"Bob","segment":"Standard"},
    {"customer_id":103, "name":"Charlie","segment":"Standard"},
    {"customer_id":104, "name":"Diana","segment":"Premium"}
]
with open("crm.json","w") as f:
    json.dump(crm_data,f)

print("Sample data files created: clickstream.csv, transactions.csv, crm.json")


Sample data files created: clickstream.csv, transactions.csv, crm.json


In [6]:
# Read Raw Data Files
import pandas as pd
import json

# Read Clickstream CSV
clickstream_df = pd.read_csv("clickstream.csv")
print("Clickstream:")
print(clickstream_df.head())

# Read Transactions CSV
transactions_df = pd.read_csv("transactions.csv")
print("\nTransactions:")
print(transactions_df.head())

# Read CRM JSON
with open("crm.json") as f:
    crm_df = pd.json_normalize(json.load(f))
print("\nCRM:")
print(crm_df.head())


Clickstream:
   session_id  customer_id page_viewed         timestamp
0           1          101        Home  2024-01-01 10:00
1           2          102     Product  2024-01-01 10:05
2           3          101        Cart  2024-01-01 10:10
3           4          103        Home  2024-01-02 12:00
4           5          104    Checkout  2024-01-02 12:15

Transactions:
   transaction_id  customer_id  amount         timestamp
0            1001          101   250.0  2024-01-01 11:00
1            1002          102   100.0  2024-01-01 11:05
2            1003          104    75.0  2024-01-02 12:30

CRM:
   customer_id     name   segment
0          101    Alice   Premium
1          102      Bob  Standard
2          103  Charlie  Standard
3          104    Diana   Premium


In [7]:
# Load into DuckDB
import duckdb

# Create DuckDB connection
con = duckdb.connect(database=':memory:')

# Load Pandas DataFrames into DuckDB
con.register("clickstream_df", clickstream_df)
con.register("transactions_df", transactions_df)
con.register("crm_df", crm_df)

# Create DuckDB tables
con.execute("CREATE TABLE clickstream AS SELECT * FROM clickstream_df")
con.execute("CREATE TABLE transactions AS SELECT * FROM transactions_df")
con.execute("CREATE TABLE crm AS SELECT * FROM crm_df")

# Verify
print(con.execute("SHOW TABLES").fetchdf())


              name
0      clickstream
1   clickstream_df
2              crm
3           crm_df
4     transactions
5  transactions_df


In [8]:
# Run First Warehouse Queries

# Count clickstream sessions
print(con.execute("SELECT COUNT(*) as sessions FROM clickstream").fetchdf())

# Check total transactions
print(con.execute("SELECT COUNT(*) as transactions FROM transactions").fetchdf())

# Join CRM with Transactions (quick test)
query = """
SELECT c.customer_id, c.name, c.segment, SUM(t.amount) as total_spend
FROM crm c
LEFT JOIN transactions t
ON c.customer_id = t.customer_id
GROUP BY c.customer_id, c.name, c.segment
"""
print(con.execute(query).fetchdf())


   sessions
0         5
   transactions
0             3
   customer_id     name   segment  total_spend
0          102      Bob  Standard        100.0
1          101    Alice   Premium        250.0
2          104    Diana   Premium         75.0
3          103  Charlie  Standard          NaN


# Data Cleaning & Transformation with PySpark

In [9]:
# Install and import Spark

!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!pip install pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, upper

# Start Spark session
spark = SparkSession.builder.appName("ETL_Cleaning").getOrCreate()




In [10]:
# Load Data into Spark
# Load CSVs into Spark DataFrames
clickstream_spark = spark.read.csv("clickstream.csv", header=True, inferSchema=True)
transactions_spark = spark.read.csv("transactions.csv", header=True, inferSchema=True)

# Load JSON CRM into Spark
crm_spark = spark.read.json("crm.json")

print("Clickstream:")
clickstream_spark.show()
print("Transactions:")
transactions_spark.show()
print("CRM:")
crm_spark.show()


Clickstream:
+----------+-----------+-----------+-------------------+
|session_id|customer_id|page_viewed|          timestamp|
+----------+-----------+-----------+-------------------+
|         1|        101|       Home|2024-01-01 10:00:00|
|         2|        102|    Product|2024-01-01 10:05:00|
|         3|        101|       Cart|2024-01-01 10:10:00|
|         4|        103|       Home|2024-01-02 12:00:00|
|         5|        104|   Checkout|2024-01-02 12:15:00|
+----------+-----------+-----------+-------------------+

Transactions:
+--------------+-----------+------+-------------------+
|transaction_id|customer_id|amount|          timestamp|
+--------------+-----------+------+-------------------+
|          1001|        101| 250.0|2024-01-01 11:00:00|
|          1002|        102| 100.0|2024-01-01 11:05:00|
|          1003|        104|  75.0|2024-01-02 12:30:00|
+--------------+-----------+------+-------------------+

CRM:
+-----------+-------+--------+
|customer_id|   name| segment|

In [11]:
# Basic Cleaning

# Convert timestamps
clickstream_spark = clickstream_spark.withColumn("timestamp", to_timestamp("timestamp"))
transactions_spark = transactions_spark.withColumn("timestamp", to_timestamp("timestamp"))

# Standardize: make customer_id uppercase in CRM (simulate cleaning)
crm_spark = crm_spark.withColumn("segment", upper(col("segment")))

# Drop rows with missing customer_id
clickstream_spark = clickstream_spark.na.drop(subset=["customer_id"])
transactions_spark = transactions_spark.na.drop(subset=["customer_id"])
crm_spark = crm_spark.na.drop(subset=["customer_id"])


In [12]:
# Save Cleaned Data

import duckdb

# Collect Spark DF → Pandas → load into DuckDB
con = duckdb.connect(database=':memory:')
con.register("clickstream_clean", clickstream_spark.toPandas())
con.register("transactions_clean", transactions_spark.toPandas())
con.register("crm_clean", crm_spark.toPandas())

# Save as DuckDB tables
con.execute("CREATE TABLE clickstream_clean AS SELECT * FROM clickstream_clean")
con.execute("CREATE TABLE transactions_clean AS SELECT * FROM transactions_clean")
con.execute("CREATE TABLE crm_clean AS SELECT * FROM crm_clean")

print(con.execute("SHOW TABLES").fetchdf())


                 name
0   clickstream_clean
1   clickstream_clean
2           crm_clean
3           crm_clean
4  transactions_clean
5  transactions_clean


In [13]:
# Verify Cleaning

query = """
SELECT c.customer_id, c.name, c.segment, COUNT(s.session_id) as sessions, SUM(t.amount) as total_spend
FROM crm_clean c
LEFT JOIN clickstream_clean s ON c.customer_id = s.customer_id
LEFT JOIN transactions_clean t ON c.customer_id = t.customer_id
GROUP BY c.customer_id, c.name, c.segment
"""
print(con.execute(query).fetchdf())


   customer_id     name   segment  sessions  total_spend
0          104    Diana   PREMIUM         1         75.0
1          103  Charlie  STANDARD         1          NaN
2          101    Alice   PREMIUM         2        500.0
3          102      Bob  STANDARD         1        100.0
