<a href="https://colab.research.google.com/github/BHARATH077/ETL_Customer_Behavior/blob/main/customer_etl_dag_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Airflow Orchestration (Simulated)

# Step 1: DAG Setup
### Airflow models a pipeline as a Directed Acyclic Graph (DAG) — a set of tasks that run in dependency order.
### We’ll define:
- Extract → Read the raw CSV data.
- Transform → Clean & enrich Customer 360.
- Load → Save CSV for dashboarding.

In [None]:
# Step 2: Create DAG Script

from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime
import pandas as pd

# ---- ETL Task Functions ----
def extract():
    """Stage the three raw source CSVs under /tmp for the transform step.

    Reads ``clickstream.csv``, ``transactions.csv`` and ``crm.csv`` from the
    current working directory and rewrites each one, without the index
    column, to its ``/tmp/*_raw.csv`` counterpart.
    """
    # Stand-in for a real source system (API, database, ...).
    staging = (
        ("clickstream.csv", "/tmp/clickstream_raw.csv"),
        ("transactions.csv", "/tmp/transactions_raw.csv"),
        ("crm.csv", "/tmp/crm_raw.csv"),
    )

    # Read every source before writing anything, so a missing input raises
    # before any staging file is touched.
    frames = [pd.read_csv(source) for source, _ in staging]
    for frame, (_, destination) in zip(frames, staging):
        frame.to_csv(destination, index=False)

def transform():
    """Build the Customer 360 table from the staged raw files.

    Reads the three ``/tmp/*_raw.csv`` files, sums ``amount`` per
    ``customer_id`` from the transactions, counts distinct ``session_id``
    values per ``customer_id`` from the clickstream, left-joins both
    aggregates onto the CRM rows and writes the result to
    ``/tmp/customer360.csv`` without the index column.

    Because both joins are left joins, CRM customers with no transactions
    or no sessions keep their row and get missing values in the
    corresponding aggregate column.
    """
    clicks = pd.read_csv("/tmp/clickstream_raw.csv")
    txns = pd.read_csv("/tmp/transactions_raw.csv")
    profiles = pd.read_csv("/tmp/crm_raw.csv")

    # Per-customer aggregates; reset_index() turns the group key back into
    # an ordinary column so it can serve as the join key.
    spend_by_customer = txns.groupby("customer_id")["amount"].sum()
    spend = spend_by_customer.reset_index()
    sessions_by_customer = clicks.groupby("customer_id")["session_id"].nunique()
    sessions = sessions_by_customer.reset_index()

    # CRM is the left side of both joins, so every CRM customer is retained.
    with_spend = pd.merge(profiles, spend, on="customer_id", how="left")
    customer360 = pd.merge(with_spend, sessions, on="customer_id", how="left")

    customer360.to_csv("/tmp/customer360.csv", index=False)

def load():
    """Publish the Customer 360 table for the BI dashboard.

    Reads ``/tmp/customer360.csv`` and rewrites it, without the index
    column, as ``customer360_final.csv`` in the current working directory.
    """
    pd.read_csv("/tmp/customer360.csv").to_csv(
        "customer360_final.csv", index=False
    )

# ---- DAG Definition ----
# Settings applied to every task in the DAG unless a task overrides them.
default_args = {
    "owner": "airflow",
    "start_date": datetime(2024, 1, 1),
    "retries": 1,
}

# Daily pipeline; catchup is disabled, so intervals between start_date and
# the present are not backfilled.
dag = DAG(
    dag_id="customer_etl_pipeline",
    default_args=default_args,
    schedule_interval="@daily",
    catchup=False,
)

# Operators created inside the DAG context are attached to it automatically.
with dag:
    task_extract = PythonOperator(
        task_id="extract",
        python_callable=extract,
    )
    task_transform = PythonOperator(
        task_id="transform",
        python_callable=transform,
    )
    task_load = PythonOperator(
        task_id="load",
        python_callable=load,
    )

    # Linear dependency chain: extract, then transform, then load.
    task_extract >> task_transform
    task_transform >> task_load


In [None]:
# Step 3: Simulate DAG (Locally in Colab)
# Simulate Airflow tasks manually
# Run the task callables directly, in the same order the DAG would.
for step in (extract, transform, load):
    step()

# Preview the first rows of the published output.
df = pd.read_csv("customer360_final.csv")
print(df.head())
