[Reference](https://medium.com/towards-data-engineering/data-engineering-for-ml-building-a-customer-churn-prediction-pipeline-with-airflow-f6b50daf5443)

# Step 1: Data Extraction

In [1]:
# project_folder/scripts/extract_data.py
import pandas as pd
import numpy as np

def extract_data():
    """Build the sample customer table and write it to the pipeline's tmp folder.

    The table is hard-coded: 30 customers with ``customer_id``, ``age``,
    ``location``, ``last_login`` (ISO date strings) and ``num_logins``.
    ``age``, ``location`` and ``num_logins`` each contain one missing value
    (``np.nan``). The frame is written as CSV without the index column.
    """
    ages = [
        25, 45, 34, 23, 30, 28, 37, np.nan, 45, 24,
        31, 29, 41, 35, 44, 27, 32, 26, 49, 36,
        22, 39, 43, 21, 48, 33, 40, 38, 27, 55,
    ]
    locations = [
        'NY', 'CA', 'TX', 'NY', 'CA', 'TX', 'NY', 'CA', 'TX', 'NY',
        'CA', 'TX', 'NY', 'CA', 'TX', 'NY', 'CA', 'TX', 'NY', 'CA',
        'NY', 'CA', 'TX', 'NY', np.nan, 'TX', 'NY', 'CA', 'TX', 'NY',
    ]
    last_logins = [
        '2024-10-05', '2024-10-07', '2024-09-30', '2024-10-01', '2024-10-06',
        '2024-09-15', '2024-09-10', '2024-10-03', '2024-10-02', '2024-10-04',
        '2024-09-25', '2024-09-20', '2024-10-05', '2024-10-07', '2024-09-30',
        '2024-10-01', '2024-09-28', '2024-09-22', '2024-09-25', '2024-10-02',
        '2024-10-05', '2024-10-07', '2024-09-30', '2024-10-01', '2024-10-06',
        '2024-09-28', '2024-10-04', '2024-10-08', '2024-09-10', '2024-09-18',
    ]
    login_counts = [
        10, 20, 15, 8, 5, 3, 25, 40, 18, 22,
        9, 7, np.nan, 35, 12, 6, 8, 15, 21, 10,
        12, 17, 6, 8, 14, 9, 19, 23, 11, 18,
    ]

    # Key order fixes the column order of the CSV.
    customers = pd.DataFrame({
        'customer_id': range(1, 31),
        'age': ages,
        'location': locations,
        'last_login': last_logins,
        'num_logins': login_counts,
    })
    customers.to_csv(
        '<parent_path>/project_folder/tmp/customer_data.csv',
        index=False,
    )

# Step 2: Data Cleaning & Transformation


In [2]:
# project_folder/scripts/transform_data.py
import pandas as pd

def transform_data():
    """Clean the extracted customer CSV, derive features, and label churn.

    Reads ``customer_data.csv`` from the pipeline's tmp folder, fills missing
    values, adds ``days_since_login`` and a binary ``churn`` column, and
    writes the result to ``transformed_customer_data.csv`` (no index column).

    ``days_since_login`` is measured against the moment the function runs,
    so both it and the ``churn`` label depend on the execution date.
    """
    df = pd.read_csv('<parent_path>/project_folder/tmp/customer_data.csv')

    # Data Cleaning
    # Assign the filled columns back instead of calling fillna(inplace=True)
    # on df[col]: that chained in-place form is deprecated and does not
    # modify df when pandas Copy-on-Write is active.
    df['age'] = df['age'].fillna(df['age'].mean())
    df['location'] = df['location'].fillna('Unknown')
    df['num_logins'] = df['num_logins'].fillna(0)

    # Feature Engineering
    df['last_login'] = pd.to_datetime(df['last_login'])
    # Whole days elapsed between the last login and now.
    df['days_since_login'] = (pd.to_datetime('today') - df['last_login']).dt.days

    # Rule-based churn label (used for training only): a customer counts as
    # churned when inactive for more than 40 days OR with fewer than 10 logins.
    inactive = df['days_since_login'] > 40
    low_usage = df['num_logins'] < 10
    df['churn'] = (inactive | low_usage).astype(int)

    df.to_csv('<parent_path>/project_folder/tmp/transformed_customer_data.csv', index=False)

# Step 3: Model Training

In [3]:
# project_folder/scripts/train_model.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import joblib

def train_model():
    """Train a logistic-regression churn classifier and report hold-out metrics.

    Loads ``transformed_customer_data.csv`` from the pipeline's tmp folder,
    fits ``LogisticRegression`` on ``age``, ``days_since_login`` and
    ``num_logins`` against the ``churn`` label using a stratified 70/30
    split (``random_state=42``), saves the fitted model to
    ``churn_model.pkl`` and prints accuracy and F1 on the test split.
    """
    feature_columns = ['age', 'days_since_login', 'num_logins']

    customers = pd.read_csv('<parent_path>/project_folder/tmp/transformed_customer_data.csv')
    features = customers[feature_columns]
    target = customers['churn']

    # Stratify on the label so both splits keep the same churn ratio.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=0.3,
        random_state=42,
        stratify=target,
    )

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)

    # The model is persisted before evaluation, so it is saved even if
    # scoring below fails.
    joblib.dump(classifier, '<parent_path>/project_folder/tmp/churn_model.pkl')

    # Evaluate on the held-out 30%.
    predicted = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, predicted)
    f1 = f1_score(y_test, predicted)

    print(f"Model Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")

# Step 4: Prediction

In [4]:
# project_folder/scripts/predict.py
import pandas as pd
import joblib

def predict():
    """Score a hard-coded batch of new customers with the saved churn model.

    Loads ``churn_model.pkl`` from the pipeline's tmp folder, predicts churn
    for ten sample customers (ids 31-40), writes the inputs together with a
    ``predicted_churn`` column to ``predicted_churn.csv`` (no index column),
    and prints one line per customer.
    """
    # Load the trained model.
    # NOTE: joblib.load unpickles the file; only load model files produced by
    # this pipeline, never ones from an untrusted source.
    model = joblib.load('<parent_path>/project_folder/tmp/churn_model.pkl')

    # New data for prediction; columns match the features used in train_model.
    new_data = pd.DataFrame({
        'age': [29, 52, 41, 23, 45, 30, 39, 40, 22, 50],
        'days_since_login': [12, 45, 22, 10, 60, 35, 5, 90, 13, 20],
        'num_logins': [18, 3, 7, 25, 1, 15, 20, 2, 17, 8]
    })
    # Single source of truth for the ids, shared by the CSV and the console
    # output so the two always agree.
    customer_ids = range(31, 41)

    # Make predictions
    predictions = model.predict(new_data)

    # Assemble the result table: ids, the input features, and the prediction.
    output = pd.DataFrame({
        'customer_id': customer_ids,
        'age': new_data['age'],
        'days_since_login': new_data['days_since_login'],
        'num_logins': new_data['num_logins'],
        'predicted_churn': predictions
    })

    # Save predictions to a CSV file
    output.to_csv('<parent_path>/project_folder/tmp/predicted_churn.csv', index=False)

    # Optional: print predictions for verification, labelled with the same
    # customer_id that was written to the CSV (previously printed 1-10).
    for customer_id, pred in zip(customer_ids, predictions):
        print(f"Customer {customer_id} Churn Prediction: {'Churn' if pred == 1 else 'Not Churn'}")

# Step 5: Scheduling the Pipeline with Airflow

In [6]:
# project_folder/dags/churn_prediction_dag.py
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator

# Import functions from the scripts
from scripts.extract_data import extract_data
from scripts.transform_data import transform_data
from scripts.train_model import train_model
from scripts.predict import predict

# Default arguments applied to every task in the DAG
default_args = {
    'owner': 'airflow',
    # Each run is independent of the outcome of the previous run's tasks.
    'depends_on_past': False,
    # NOTE(review): e-mail notifications are enabled but no 'email' recipient
    # list is set in this dict -- confirm recipients are configured elsewhere.
    'email_on_failure': True,
    'email_on_retry': True,
    # Retry a failed task once before marking it failed.
    'retries': 1,
}

# Daily pipeline: extract -> transform -> train -> predict, bracketed by
# no-op start/end marker tasks. DummyOperator must be imported at the top of
# this module together with PythonOperator.
with DAG(
    'customer_churn_prediction',
    default_args=default_args,
    schedule_interval='@daily',
    start_date=datetime(2024, 11, 9),
    # Do not backfill the runs between start_date and the present.
    catchup=False
) as dag:
    # Start node (no-op marker)
    start = DummyOperator(task_id='start')

    # One PythonOperator per pipeline step; each wraps the function imported
    # from the matching module under scripts/.
    task_extract_data = PythonOperator(
        task_id='extract_data',
        python_callable=extract_data
    )

    task_transform_data = PythonOperator(
        task_id='transform_data',
        python_callable=transform_data
    )

    task_train_model = PythonOperator(
        task_id='train_model',
        python_callable=train_model
    )

    task_predict = PythonOperator(
        task_id='predict',
        python_callable=predict
    )

    # End node (no-op marker)
    end = DummyOperator(task_id='end')

    # Strictly linear dependencies: every step consumes the file written by
    # the step before it.
    (
        start
        >> task_extract_data
        >> task_transform_data
        >> task_train_model
        >> task_predict
        >> end
    )

# Project Structure

```
project_folder
├── __init__.py
├── dags
│   └── churn_prediction_dag.py       # The main Airflow DAG
├── scripts
│   ├── __init__.py                   # Empty file; add one in both project_folder and scripts so they are importable Python packages.
│   ├── extract_data.py               # Script for data extraction
│   ├── transform_data.py             # Script for data transformation
│   ├── train_model.py                # Script for model training
│   └── predict.py                    # Script for making predictions on new data
└── tmp
    ├── customer_data.csv             # Intermediate data file after extraction
    ├── transformed_customer_data.csv # Transformed data file
    ├── churn_model.pkl               # Trained model file
    └── predicted_churn.csv           # Churn predictions written by predict.py
```