
<!-- Feature Engineering based on EDA insights.

We create behavioural and engagement features that capture:
- Spending intensity
- Credit usage
- Customer tenure
- Low engagement signals
- Customer dissatisfaction signals

These features help models detect early churn behaviour. -->



In [11]:
"""
Feature Engineering based on EDA insights.

We create behavioural and engagement features that capture:
• Spending intensity
• Credit usage
• Customer tenure
• Low engagement signals
• Customer dissatisfaction signals

These features help models detect early churn behaviour.
"""


'\nFeature Engineering based on EDA insights.\n\nWe create behavioural and engagement features that capture:\n• Spending intensity\n• Credit usage\n• Customer tenure\n• Low engagement signals\n• Customer dissatisfaction signals\n\nThese features help models detect early churn behaviour.\n'

In [12]:
import pandas as pd
import numpy as np

In [13]:
# Thresholds for the binary engagement flags, named so they are easy to tune.
LOW_TXN_THRESHOLD = 50        # fewer transactions than this -> low engagement
HIGH_INACTIVE_THRESHOLD = 3   # at least this many inactive months -> high inactivity
HIGH_CONTACT_THRESHOLD = 3    # at least this many contacts -> high contact


def _safe_ratio(numerator, denominator):
    """Element-wise ``numerator / denominator`` that never produces inf.

    Rows whose denominator is exactly 0 get a ratio of 0.0 (no transactions
    means no average spend; no credit limit means no usable credit). Missing
    values are left as NaN, exactly as plain division would leave them.
    """
    # Hide zero denominators behind NaN so the division cannot yield +/-inf.
    ratio = numerator / denominator.where(denominator != 0)
    # Then give the zero-denominator rows an explicit value of 0.
    return ratio.mask(denominator == 0, 0.0)


df = pd.read_csv("../../../data/raw/BankChurners.csv")

# ----- Basic data cleaning -----

# Binary target: 1 = attrited customer, 0 = existing customer.
df['churn'] = df['Attrition_Flag'].map({
    'Existing Customer': 0,
    'Attrited Customer': 1
})

# Series.map() yields NaN for any label missing from the mapping; fail loudly
# here instead of carrying NaN targets forward.
unmapped = df['churn'].isna()
if unmapped.any():
    bad_labels = df.loc[unmapped, 'Attrition_Flag'].unique().tolist()
    raise ValueError(f"Unexpected Attrition_Flag values: {bad_labels}")

# Drop the identifier, the raw label (now encoded as `churn`) and the two
# Naive_Bayes_Classifier_* columns shipped with the raw file.
df = df.drop([
    'CLIENTNUM',
    'Attrition_Flag',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'
], axis=1)

# ----- Behavioural ratios -----

# Average spend per transaction
df['avg_spend_per_txn'] = _safe_ratio(df['Total_Trans_Amt'], df['Total_Trans_Ct'])

# Credit usage ratio (better than raw revolving balance)
df['credit_used_ratio'] = _safe_ratio(df['Total_Revolving_Bal'], df['Credit_Limit'])

# Tenure in years (more interpretable than months)
df['tenure_years'] = df['Months_on_book'] / 12

# ----- Binary engagement / dissatisfaction flags -----

# Low engagement flag
df['low_txn_flag'] = (df['Total_Trans_Ct'] < LOW_TXN_THRESHOLD).astype(int)

# High inactivity flag
df['high_inactive_flag'] = (df['Months_Inactive_12_mon'] >= HIGH_INACTIVE_THRESHOLD).astype(int)

# High contact flag (possible dissatisfaction)
df['high_contact_flag'] = (df['Contacts_Count_12_mon'] >= HIGH_CONTACT_THRESHOLD).astype(int)

# One-hot encode the remaining categorical columns; drop_first removes one
# level per category.
df = pd.get_dummies(df, drop_first=True)

## TRAIN-TEST SPLIT

In [14]:
from sklearn.model_selection import train_test_split

# Target vector and predictor matrix (every column except the target).
y = df['churn']
X = df.drop(columns=['churn'])

# 80/20 hold-out split with a fixed seed; stratifying on y keeps the churn
# rate the same in both partitions.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Scaling

In [15]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training partition only, then apply
# that same fitted transform to both partitions.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Save processed data

In [16]:
from pathlib import Path

import joblib

# Every artefact of this cell goes to one directory. Create it up front so the
# writes below do not fail with FileNotFoundError on a fresh checkout.
processed_dir = Path("../../../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

# Persist the fitted scaler alongside the data it was fitted on.
joblib.dump(scaler, processed_dir / "scaler.pkl")

# Wrap the scaled matrices in DataFrames so the CSVs carry the feature names.
pd.DataFrame(X_train_scaled, columns=X.columns).to_csv(processed_dir / "X_train.csv", index=False)
pd.DataFrame(X_test_scaled, columns=X.columns).to_csv(processed_dir / "X_test.csv", index=False)

# Targets are written without the index.
y_train.to_csv(processed_dir / "y_train.csv", index=False)
y_test.to_csv(processed_dir / "y_test.csv", index=False)


In [18]:
# Snapshot of the fully engineered frame: one-hot encoded, unscaled, with the
# `churn` target still included. The split cell only reads `df`, so this is
# the same data that existed before the split.
full_processed_path = "../../../data/processed/full_processed_data.csv"

df_full_processed = df.copy()
df_full_processed.to_csv(full_processed_path, index=False)