# Credit Card Churn Prediction - ML Assessment
## Tasks 1-4: Data Understanding, Feature Engineering, Pattern Discovery, and Model Development

## Task 1: Data Understanding (15 minutes)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session so cell output stays
# readable.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the plotting / modelling libraries used below - consider narrowing it.
warnings.filterwarnings('ignore')

# Set style: white-grid seaborn theme and a 10x6 default figure size
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

In [None]:
# Load dataset
df = pd.read_csv('Dataset(BankChurners)_CampusHiring_Dec2025(dataset).csv')

# Every later cell looks columns up by lowercase snake_case name
# (e.g. 'attrition_flag', 'total_trans_amt'). Normalise the headers once here
# so a CSV that ships 'Attrition_Flag'-style headers is not silently skipped by
# the `if col in df.columns` guards further down. Headers that are already
# lowercase and trimmed are left unchanged.
df.columns = df.columns.str.strip().str.lower()

print("="*60)
print("DATA UNDERSTANDING REPORT")
print("="*60)

# Basic information
n_rows, n_cols = df.shape
print(f"\n1. Dataset Shape: {df.shape}")
print(f"   - Total Records: {n_rows:,}")
print(f"   - Total Features: {n_cols}")

# Column names
print("\n2. Column Names:")
print(f"   {df.columns.tolist()}")

# Data types
print("\n3. Data Type Distribution:")
print(df.dtypes.value_counts())

# Numeric vs Categorical (kept as plain lists of column names)
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

print("\n4. Feature Types:")
print(f"   - Numeric Features: {len(numeric_cols)}")
print(f"   - Categorical Features: {len(categorical_cols)}")

# Missing values: per-column null counts; the detail table is only built when
# at least one column actually has nulls.
print("\n5. Missing Values:")
missing = df.isnull().sum()
has_missing = missing.sum() > 0
if not has_missing:
    print("   No missing values found!")
else:
    missing_pct = (missing / len(df)) * 100
    missing_df = pd.DataFrame({
        'Missing Count': missing[missing > 0],
        'Percentage': missing_pct[missing > 0]
    })
    print(missing_df)

# Basic statistics
print("\n6. Basic Statistics:")
print(df.describe())

# Model difficulty assessment: coarse labels derived from row / column counts.
if n_rows > 10000:
    size_label = 'Large'
elif n_rows > 1000:
    size_label = 'Medium'
else:
    size_label = 'Small'

if n_cols > 20:
    width_label = 'High'
elif n_cols > 10:
    width_label = 'Medium'
else:
    width_label = 'Low'

print("\n7. Model Difficulty Assessment:")
print(f"   - Dataset Size: {size_label}")
print(f"   - Feature Count: {width_label}")
print(f"   - Missing Data: {'Yes' if has_missing else 'No'}")
# The overall difficulty is a fixed label, not computed from the data.
print("   - Estimated Difficulty: Medium")

print("\n" + "="*60)

In [None]:
# Display first few rows
# `df.head()` is the last expression of the cell and is not printed explicitly;
# presumably the notebook front end renders it as the cell output.
print("\nFirst 5 rows of the dataset:")
df.head()

In [None]:
# Name of the label column; later cells refer to it through `target_col`.
# UPDATE COLUMN NAME IF DIFFERENT
target_col = 'attrition_flag'  # or 'Churn', 'Exited', etc.

# Report the class balance as raw counts and as percentages. Nothing is printed
# when the column is absent from the frame.
if target_col in df.columns:
    target_values = df[target_col]
    class_counts = target_values.value_counts()
    class_share = target_values.value_counts(normalize=True) * 100

    print(f"\nTarget Variable Distribution ({target_col}):")
    print(class_counts)
    print("\nPercentage Distribution:")
    print(class_share)

## Task 2: Feature Engineering (20 minutes)

In [None]:
print("="*60)
print("FEATURE ENGINEERING")
print("="*60)

# 1. Conditional Column - High/Low transaction amount category
# UPDATE COLUMN NAMES BASED ON YOUR DATASET
if 'total_trans_amt' in df.columns:
    threshold = df['total_trans_amt'].mean()
    # Vectorised comparison instead of a per-row Python lambda. NaN compares
    # False against the threshold and therefore lands in 'Low', exactly as the
    # row-wise version did.
    df['transaction_category'] = np.where(
        df['total_trans_amt'] > threshold, 'High', 'Low'
    )
    print("\n1. Created 'transaction_category' column")
    print(f"   Threshold: {threshold:.2f}")
    print(df['transaction_category'].value_counts())

# 2. Another conditional column - Credit utilization risk
if 'avg_utilization_ratio' in df.columns:
    utilization = df['avg_utilization_ratio']
    # Conditions are checked in order, so > 0.7 wins over > 0.3; anything else
    # (including NaN) falls through to the default.
    df['utilization_risk'] = np.select(
        [utilization > 0.7, utilization > 0.3],
        ['High Risk', 'Medium Risk'],
        default='Low Risk',
    )
    print("\n2. Created 'utilization_risk' column")
    print(df['utilization_risk'].value_counts())

# 3. Customer activity score: transactions per (inactive months + 1).
# The +1 keeps the denominator non-zero for customers with 0 inactive months.
if 'total_trans_ct' in df.columns and 'months_inactive_12_mon' in df.columns:
    df['activity_score'] = df['total_trans_ct'] / (df['months_inactive_12_mon'] + 1)
    print("\n3. Created 'activity_score' feature")
    print(f"   Mean: {df['activity_score'].mean():.2f}")
    print(f"   Median: {df['activity_score'].median():.2f}")

In [None]:
# 4. Grouped Aggregation - Statistics by churn status
# Only columns that actually exist in the frame are aggregated; the whole step
# is skipped when the target column or all candidate columns are missing.
if target_col in df.columns:
    wanted_stats = {
        'total_trans_amt': ['mean', 'count'],
        'total_trans_ct': 'mean',
        'avg_utilization_ratio': 'mean',
        'customer_age': 'mean',
    }
    agg_cols = {
        column: stats
        for column, stats in wanted_stats.items()
        if column in df.columns
    }

    if agg_cols:
        grouped = df.groupby(target_col)
        agg_summary = grouped.agg(agg_cols).round(2)
        print(f"\n4. Grouped Aggregation by {target_col}:")
        print(agg_summary)

In [None]:
# 5. Compound Boolean Filter - High-risk customers
# A customer is flagged when all three conditions hold: transaction count in
# the bottom quartile, utilization above 0.5, and more than 2 inactive months.
required_cols = {'total_trans_ct', 'avg_utilization_ratio', 'months_inactive_12_mon'}
if required_cols.issubset(df.columns):
    low_activity = df['total_trans_ct'] < df['total_trans_ct'].quantile(0.25)
    heavy_utilization = df['avg_utilization_ratio'] > 0.5
    long_inactive = df['months_inactive_12_mon'] > 2
    high_risk_customers = df[low_activity & heavy_utilization & long_inactive]

    n_high_risk = len(high_risk_customers)
    print("\n5. Compound Filter - High Risk Customers:")
    print(f"   Total high-risk customers: {n_high_risk}")
    print(f"   Percentage of total: {n_high_risk/len(df)*100:.2f}%")

    if target_col in df.columns:
        # Share of each target class inside the flagged subset, in percent.
        risk_class_share = high_risk_customers[target_col].value_counts(normalize=True) * 100
        print("\n   Churn rate among high-risk customers:")
        print(risk_class_share)

In [None]:
# Summary of new features
# Counts how many of the three engineered columns were actually created above
# (each one is only added when its source columns exist).
print("\n" + "="*60)
print("FEATURE ENGINEERING SUMMARY")
print("="*60)

engineered_names = {'transaction_category', 'utilization_risk', 'activity_score'}
new_features = [name for name in df.columns if name in engineered_names]

n_total = df.shape[1]
n_new = len(new_features)
print(f"Original features: {n_total - n_new}")
print(f"New features created: {n_new}")
print(f"Total features: {n_total}")
print(f"\nNew feature columns: {new_features}")

## Task 3: Pattern Discovery (15 minutes)

In [None]:
# Pattern 1: Transaction Count by Churn Status
# Box plot of total_trans_ct per target class, saved to disk and shown inline.
# NOTE(review): newer seaborn releases may warn about `palette` without `hue`;
# confirm against the installed version.
if target_col in df.columns and 'total_trans_ct' in df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=df, x=target_col, y='total_trans_ct', palette='Set2', ax=ax)
    ax.set_title('Pattern 1: Transaction Count by Churn Status', fontsize=14, fontweight='bold')
    ax.set_xlabel('Customer Status', fontsize=12)
    ax.set_ylabel('Total Transaction Count', fontsize=12)
    fig.tight_layout()
    fig.savefig('pattern1_transaction_behavior.png', dpi=100, bbox_inches='tight')
    plt.show()

    # The narrative below is fixed text; it is not derived from the plotted data.
    print(
        "\nPattern 1 Analysis:",
        "Churned customers show significantly lower transaction counts compared to retained customers.",
        "Root Cause: Declining customer engagement and product usage.",
        "Business Impact: Transaction frequency is a strong early warning signal for churn risk.",
        sep="\n",
    )

In [None]:
# Pattern 2: Credit Utilization Distribution by Churn
# Overlaid 30-bin histograms (with KDE) of avg_utilization_ratio, one per
# target class; saved to disk and shown inline.
if target_col in df.columns and 'avg_utilization_ratio' in df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(
        data=df,
        x='avg_utilization_ratio',
        hue=target_col,
        bins=30,
        kde=True,
        palette='viridis',
        ax=ax,
    )
    ax.set_title('Pattern 2: Credit Utilization Distribution by Churn Status', fontsize=14, fontweight='bold')
    ax.set_xlabel('Average Utilization Ratio', fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    fig.tight_layout()
    fig.savefig('pattern2_utilization.png', dpi=100, bbox_inches='tight')
    plt.show()

    # The narrative below is fixed text; it is not derived from the plotted data.
    print(
        "\nPattern 2 Analysis:",
        "Churned customers tend to have either very low or very high utilization ratios.",
        "Root Cause: Low utilization indicates disengagement; high utilization suggests financial stress.",
        "Business Impact: Both extremes require different retention strategies.",
        sep="\n",
    )

In [None]:
# Pattern 3: Feature Correlation Heatmap
# Only the first 12 numeric columns (in frame order) are plotted, presumably to
# keep the annotated matrix legible. A dedicated name is used so the full
# `numeric_cols` list built in the data-understanding cell is not overwritten
# with this truncated selection.
heatmap_cols = df.select_dtypes(include=[np.number]).columns[:12]

plt.figure(figsize=(12, 10))
correlation_matrix = df[heatmap_cols].corr()
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
plt.title('Pattern 3: Feature Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('pattern3_correlation.png', dpi=100, bbox_inches='tight')
plt.show()

# The narrative below is fixed text; it is not derived from the matrix above.
# NOTE(review): it names total_trans_amt and total_trans_ct - verify both fall
# inside the 12 plotted columns for the dataset in use.
print("\nPattern 3 Analysis:")
print("Strong positive correlation between total_trans_amt and total_trans_ct.")
print("Root Cause: Customer spending behavior is consistent.")
print("Business Impact: Transaction-based features are key predictors.")

## Task 4: Model Development (30 minutes)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    precision_score, recall_score, f1_score
)
import joblib

# Section banner for the modelling part of the notebook output:
# a 60-character rule, the title, and a second rule, one per line.
print("="*60, "MODEL DEVELOPMENT", "="*60, sep="\n")

In [None]:
# Prepare target variable
# Fit a LabelEncoder on the target column and keep the integer-coded labels in
# `y`. The mapping printed below shows which class received which code.
le = LabelEncoder()
y = le.fit_transform(df[target_col])

class_codes = le.transform(le.classes_)
class_to_code = dict(zip(le.classes_, class_codes))

print("\nTarget Variable Encoding:")
print(f"Classes: {le.classes_}")
print(f"Encoded as: {class_to_code}")

In [None]:
# Select features for model
# Candidate numeric predictors, listed in the order they should appear in X.
potential_features = [
    'customer_age',
    'dependent_count',
    'months_on_book',
    'total_relationship_count',
    'months_inactive_12_mon',
    'contacts_count_12_mon',
    'credit_limit',
    'total_revolving_bal',
    'avg_open_to_buy',
    'total_amt_chng_q4_q1',
    'total_trans_amt',
    'total_trans_ct',
    'total_ct_chng_q4_q1',
    'avg_utilization_ratio',
]

# Keep only the candidates that exist in the frame.
present_columns = set(df.columns)
feature_cols = [name for name in potential_features if name in present_columns]

# Fallback: with fewer than 5 matches, use every numeric column except the
# target instead.
# NOTE(review): the fallback can pull in identifier-like numeric columns -
# inspect the printed list before trusting the model.
if len(feature_cols) < 5:
    numeric_names = df.select_dtypes(include=[np.number]).columns
    feature_cols = [name for name in numeric_names if name != target_col]

print(f"\nSelected Features ({len(feature_cols)}):")
print(feature_cols)

# Missing feature values are replaced with 0 before modelling.
X = df[feature_cols].fillna(0)
print(f"\nFeature Matrix Shape: {X.shape}")
print(f"Target Vector Shape: {y.shape}")

In [None]:
# Train-test split
# 80/20 split, stratified on the encoded target, with a fixed seed so the
# partition is reproducible.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("\nTrain-Test Split:")
print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

In [None]:
# Feature scaling
# The scaler is fitted on the training rows only and then applied to both
# partitions, so no test-set statistics leak into the transform.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nFeature scaling completed using StandardScaler")

In [None]:
# Train Random Forest model
print("\nTraining Random Forest Classifier...")

# Hyperparameters gathered in one place; `class_weight='balanced'` is passed so
# the classifier re-weights the classes, and the seed is fixed.
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
    'class_weight': 'balanced',
}
model = RandomForestClassifier(**rf_params)

model.fit(X_train_scaled, y_train)
print("Model training completed!")

In [None]:
# Make predictions
# Hard class labels for the scaled held-out rows.
y_pred = model.predict(X_test_scaled)
# Per-class probabilities for the same rows.
# NOTE(review): `y_prob` is not read by any later cell visible in this notebook.
y_prob = model.predict_proba(X_test_scaled)

print("\nPredictions generated for test set")

In [None]:
# Evaluate model
import json

print("\n" + "="*60)
print("MODEL EVALUATION RESULTS")
print("="*60)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

# Headline numbers: plain accuracy plus support-weighted precision / recall / F1.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    score_fn(y_test, y_pred, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

print("\nKey Performance Metrics:")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

# Save metrics
# Values are cast to built-in floats so the dict is JSON-serialisable.
metrics_dict = {
    metric_name: float(metric_value)
    for metric_name, metric_value in (
        ('accuracy', accuracy),
        ('precision', precision),
        ('recall', recall),
        ('f1_score', f1),
    )
}

with open('model_metrics.json', 'w') as f:
    f.write(json.dumps(metrics_dict, indent=2))

print("\nMetrics saved to model_metrics.json")

In [None]:
# Confusion Matrix
# Rows are actual classes, columns are predicted classes; tick labels use the
# original class names from the label encoder.
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    cbar_kws={'label': 'Count'},
    ax=ax,
)
ax.set_title('Confusion Matrix', fontsize=14, fontweight='bold')
ax.set_ylabel('Actual', fontsize=12)
ax.set_xlabel('Predicted', fontsize=12)
fig.tight_layout()
fig.savefig('confusion_matrix.png', dpi=100, bbox_inches='tight')
plt.show()

print("\nConfusion Matrix saved")

In [None]:
# Feature Importance
# Pair each feature name with the forest's importance score and rank them from
# most to least important.
importance_table = pd.DataFrame({
    'feature': feature_cols,
    'importance': model.feature_importances_,
})
feature_importance = importance_table.sort_values('importance', ascending=False)
top_features = feature_importance.head(10)

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=top_features, x='importance', y='feature', palette='viridis', ax=ax)
ax.set_title('Top 10 Feature Importance', fontsize=14, fontweight='bold')
ax.set_xlabel('Importance Score', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
fig.tight_layout()
fig.savefig('feature_importance.png', dpi=100, bbox_inches='tight')
plt.show()

print("\nTop 10 Most Important Features:")
print(top_features)

In [None]:
# Save model artifacts
# Everything needed to score new data is persisted: the fitted model, the
# fitted scaler, the ordered feature list and the label encoder.
print("\nSaving model artifacts...")

artifacts = (
    ('Model', model, 'churn_model.pkl'),
    ('Scaler', scaler, 'scaler.pkl'),
    ('Feature columns', feature_cols, 'feature_cols.pkl'),
    ('Label encoder', le, 'label_encoder.pkl'),
)

# Write all files first, then report them, so the messages only appear once
# every dump has succeeded.
for _, artifact, filename in artifacts:
    joblib.dump(artifact, filename)

for label, _, filename in artifacts:
    print(f"{label} saved as: {filename}")

print("\n" + "="*60)
print("MODEL DEVELOPMENT COMPLETED SUCCESSFULLY!")
print("="*60)

## Summary

### Completed Tasks:
- Data Understanding - Analyzed dataset structure and quality
- Feature Engineering - Created new features and transformations
- Pattern Discovery - Generated 3 visualizations with insights
- Model Development - Trained Random Forest classifier

### Files Generated:
1. pattern1_transaction_behavior.png
2. pattern2_utilization.png
3. pattern3_correlation.png
4. confusion_matrix.png
5. feature_importance.png
6. churn_model.pkl
7. scaler.pkl
8. feature_cols.pkl
9. label_encoder.pkl
10. model_metrics.json

### Next Steps:
- Build FastAPI backend
- Create Streamlit frontend
- Test API and UI integration
- Create architecture diagram
- Prepare final documentation