# Pipeline for Credit Card Approval Risk Prediction

## Data Load & Understanding

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
import joblib

import os, sys
from pathlib import Path
from IPython.display import display, Image 
sys.path.append("../../scripts")


from preprocessor import *
from predict_risk import *
from data_processing_framework import *


In [None]:
# Fetch the Kaggle dataset and render the file picker (dropdown + load button) shown below.
# NOTE(review): the file is loaded through a button click, so whatever frame that produces
# is interactive state that "Restart & Run All" will not recreate on its own — confirm how
# the helper exposes the loaded frame.
path = "rikdifos/credit-card-approval-prediction"
result = fetch_kaggle_dataset_by_path(path, temp_dir=".temp_kaggle")

📦 Fetching Kaggle dataset: rikdifos/credit-card-approval-prediction
Dataset URL: https://www.kaggle.com/datasets/rikdifos/credit-card-approval-prediction


Dropdown(description='Select File:', options=('application_record.csv', 'credit_record.csv'), value='applicati…

Button(description='📥 Load Selected File', style=ButtonStyle())

Output()

🔄 Select a file to load individually. You can rerun to load others.


In [3]:
# load dataset 1: application records (one row per applicant ID)
# Use the frame loaded through the Kaggle file picker when it exists; on a fresh kernel
# where no file was picked, fall back to the local CSV so "Restart & Run All" still works
# instead of failing with a NameError.
# NOTE(review): assumes `raw_df`, when present, holds the application file and that
# appl.csv is its local copy — confirm.
if "raw_df" in globals():
    appl_df = raw_df
else:
    appl_df = pd.read_csv("appl.csv")
display(appl_df.head(3))

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0


In [3]:
# load dataset 2: credit history (one row per ID per month)
# Read from the local CSV; the "Unnamed: 0*" columns in the preview are presumably saved index columns.
crdt_df = pd.read_csv("crdt.csv")
display(crdt_df.head(3))

Unnamed: 0.1,Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,0,5001711,0,X
1,1,5001711,-1,0
2,2,5001711,-2,0


###  Dataset Overview

appl_df — Application Data (per customer): Contains:
- Demographics and application info
- Key columns: ID, CODE_GENDER, AMT_INCOME_TOTAL, DAYS_BIRTH, DAYS_EMPLOYED, etc.

crdt_df — Credit History Data (per record/month): Contains:
- Credit status over time for each ID
- Columns: ID, MONTHS_BALANCE, STATUS (e.g., '0', '1', 'C', 'X')

## Project Goal

1. Merge both datasets
2. Create a target: is_high_risk (from latest STATUS per ID)
3. Preprocess application data
4. Build and train a pipeline model
5. Save the pipeline and model
6. Make predictions on raw application forms

## Data Preparation + Target Creation.

### Step 1: Map credit STATUS codes to risk levels

In [None]:
# Step 1.1: Severity ranking of the credit STATUS codes (higher = worse).
# Reference only: the target below is derived from the STATUS codes directly.
status_severity = {
    '5': 7,  # Overdue >150 days
    '4': 6,
    '3': 5,
    '2': 4,
    '1': 3,
    '0': 2,  # Some delay
    'C': 1,  # Closed
    'X': 0,  # No loan
}

# Step 1.2: Most recent record per ID.
# Within each ID the rows are ordered newest-first (MONTHS_BALANCE descending),
# so keeping the first row of every ID keeps its latest status.
latest_status = (
    crdt_df
    .sort_values(['ID', 'MONTHS_BALANCE'], ascending=[True, False])
    .drop_duplicates(subset=['ID'], keep='first')
)

# Step 1.4 (input): account age in months = |oldest MONTHS_BALANCE| per ID
account_age = crdt_df.groupby('ID')['MONTHS_BALANCE'].min().abs()

# Step 1.3 + 1.4: flag IDs whose latest status is '1'..'5' and attach the account age
latest_status = latest_status.assign(
    is_high_risk=latest_status['STATUS'].isin(['1', '2', '3', '4', '5']).astype(int),
    account_age_months=latest_status['ID'].map(account_age),
)

# Final target table
target_df = latest_status[['ID', 'is_high_risk', 'account_age_months']]

### Step 2: Merge target into application data

In [None]:
# Step 2: Attach the target columns to the application data.
# The inner join keeps only applicants that also appear in the credit history;
# the stray 'Unnamed: 0' column is dropped when it is present.
merged_df = (
    appl_df
    .merge(target_df, on='ID', how='inner')
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

# Preview
print(merged_df.shape)
merged_df.head()

## Feature Selection

### Ensure binary target and Select Features using CHI_SQUARE and ANOVA Tests

In [None]:
# The target is cast to integer 0/1 before running the tests
merged_df['is_high_risk'] = merged_df['is_high_risk'].astype(int)

# Step 1: Detect column types
detected = detect_columns(merged_df)
display(detected)

# Pull each detected column group out of the summary table
category_labels = {
    "binary": "Binary",
    "multi_cat": "Multi-Category",
    "numeric": "Numerical",
}
config = {
    key: detected.loc[detected["Category"] == label, "Columns"].values[0]
    for key, label in category_labels.items()
}

# Step 2: Chi-square test for the categorical columns (binary + multi-category)
# NOTE(review): the whole merged frame is passed to both tests, so ID, the target and
# account_age_months may be among the tested columns — confirm how the helpers treat them.
cat_cols = config["binary"] + config["multi_cat"]
chi_results = chi_square_test_batch(
    merged_df,
    cat_cols,
    target_column="is_high_risk",
    plot=False,
)
significant_cat = chi_results[chi_results["significant"]]["feature"].tolist()

# Step 3: ANOVA for the numerical columns
anova_results = anova_test_numerical_features(
    merged_df,
    target_column="is_high_risk",
    plot=False,
)
significant_num = anova_results[anova_results["significant"]]["feature"].tolist()

# Categorical hits first, numerical hits second
selected_features = [*significant_cat, *significant_num]
print("✅ Selected features:", selected_features)

In [None]:
# Rank features by combined importance, leaving out the identifier, the sparsely
# filled OCCUPATION_TYPE column and the leakage-prone account age.
importance_input = merged_df.drop(columns=['ID', 'OCCUPATION_TYPE', 'account_age_months'])
importance_df = compute_combined_feature_importance(
    importance_input,
    target_column='is_high_risk',
    top_n=10,
    plot=True,
)

### Include Domain Knowledge Features with Significant Features

Since `account_age_months` is derived from the same credit history used to generate `is_high_risk`, using it as a feature would leak information from the target-generation process. **Remove `account_age_months` from the training features to prevent label leakage and potential overfitting, and also remove `FLAG_MOBIL`, which has only one unique value.**

✅ Best practice: Add domain-informed features that are: Easy to compute, Not derived from the credit history (i.e., safe to use), Known to influence credit risk

Recommended Additional Features:
[
    'AMT_INCOME_TOTAL',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'NAME_INCOME_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_EDUCATION_TYPE',
    'CNT_CHILDREN',
     'FLAG_OWN_CAR', 
    'FLAG_PHONE',
    'FLAG_OWN_REALTY'
]

In [None]:
# Domain-knowledge features added on top of the statistically selected ones
domain_features = [
    'AMT_INCOME_TOTAL',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'NAME_INCOME_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_EDUCATION_TYPE',
    'CNT_CHILDREN',
    'FLAG_OWN_CAR',
    'FLAG_PHONE',
    'FLAG_OWN_REALTY',
]

# Columns that must not be used as features: the target itself (appended once below),
# the leakage-prone account age and the constant FLAG_MOBIL flag.
excluded_columns = {'is_high_risk', 'account_age_months', 'FLAG_MOBIL'}

# Order-preserving union of both lists. Without de-duplication a feature present in
# both lists would appear twice in `df`, and dropping 'account_age_months' afterwards
# would raise a KeyError whenever the statistical tests did not select it.
features = [
    col
    for col in dict.fromkeys(selected_features + domain_features)
    if col not in excluded_columns
]

df = merged_df[features + ['is_high_risk']].copy()
df.head()


## Data Preprocessing

### Initial EDA / Data Quality Checks

In [None]:
# EDA report using ydata_profiling, rendered inline in the notebook
profile = ProfileReport(df, title="Credit Card Approval EDA", explorative=True)
# Optional export to disk (forward slash keeps the path portable and avoids a stray "\C" escape):
#profile.to_file("reports_html/Credit_Card_Approval_eda_report.html")
profile.to_notebook_iframe()

📊 Profile-Driven Cleaning Required:

| Column                        | Issue                | Action                       |
| ----------------------------- | -------------------- | ---------------------------- |
| `CNT_CHILDREN`                | Outliers, high corr. | Cap, keep                    |
| `AMT_INCOME_TOTAL`            | Skewed, outliers     | Cube root + cap              |
| `DAYS_BIRTH`, `DAYS_EMPLOYED` | Skewed               | Convert to years, cap        |
| `NAME_*` features             | Rare categories      | Group into "Other"           |
| `is_high_risk`                | Imbalanced target    | Use class weighting or SMOTE |


### Clean the raw data

In [None]:
# Clean the raw data
def clean_credit_data(df):
    """Return a cleaned copy of a raw application frame.

    The input frame is not modified. On the copy this:

    * derives ``AGE_YEARS`` from ``DAYS_BIRTH`` and ``EMPLOYMENT_YEARS`` from
      ``DAYS_EMPLOYED`` (a positive ``DAYS_EMPLOYED`` counts as 0 years), then
      drops the two ``DAYS_*`` columns;
    * passes ``CNT_CHILDREN`` through ``cap_outliers``;
    * passes income, age and employment years through
      ``transform_skewed_columns`` with ``method='cbrt'``;
    * replaces income/education/housing categories seen in fewer than 1% of
      the rows with ``'Other'``.

    Note: the 1% cut-off is relative to the frame passed in, so the set of
    categories treated as rare depends on the size and content of that frame.
    """
    cleaned = df.copy()

    def _employment_years(days):
        # Positive day counts are mapped to zero years of employment
        if days > 0:
            return 0
        return round(abs(days) / 365, 1)

    # DAYS_* hold negative day offsets; express them as positive years
    cleaned['AGE_YEARS'] = (-cleaned['DAYS_BIRTH'] / 365).round(1)
    cleaned['EMPLOYMENT_YEARS'] = cleaned['DAYS_EMPLOYED'].apply(_employment_years)
    cleaned = cleaned.drop(columns=['DAYS_BIRTH', 'DAYS_EMPLOYED'])

    # Outlier capping (based on profile insight)
    cleaned = cap_outliers(cleaned, "CNT_CHILDREN")

    # Cube-root transform for the skewed numerical columns
    skewed_columns = ['AMT_INCOME_TOTAL', 'AGE_YEARS', 'EMPLOYMENT_YEARS']
    cleaned = transform_skewed_columns(cleaned, skewed_columns, method='cbrt')

    # Fold categories below 1% of the rows into 'Other'
    min_count = 0.01 * len(cleaned)
    for column in ('NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_HOUSING_TYPE'):
        counts = cleaned[column].value_counts()
        rare_values = counts.loc[counts < min_count].index
        cleaned[column] = cleaned[column].replace(rare_values, 'Other')

    return cleaned

## Build PreprocessingPipeline using ColumnTransformer

### Define Feature Columns & Select Models for Training

In [None]:
# Feature columns, grouped by the encoder each group is routed to
binary_features = [
    'CODE_GENDER',
    'FLAG_EMAIL',
]
categorical_features = [
    'NAME_INCOME_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_EDUCATION_TYPE',
    'NAME_HOUSING_TYPE',
]
numeric_features = [
    'AMT_INCOME_TOTAL',
    'AGE_YEARS',
    'EMPLOYMENT_YEARS',
    'CNT_CHILDREN',
]
all_features = [*binary_features, *categorical_features, *numeric_features]

# Candidate models to train and compare
selected_models = [
    "Logistic Regression",  # baseline, interpretable
    "Random Forest",        # robust, handles nonlinearity
    "XGBoost",              # highly performant, handles imbalance
    "LightGBM",             # fast and scalable, similar to XGBoost
    "CatBoost",             # categorical-friendly
    "Gradient Boosting",    # solid traditional boosting
]

### Define Train Pipeline Function

> The objective is to minimize the risk of credit default for the financial institution, and there is always a trade-off between precision and recall. Here, approving a high-risk applicant (a false negative) is costlier than rejecting a low-risk one (a false positive), so we select the model by recall on the high-risk class and fall back to the F1-score to break ties.

In [None]:
def train_and_save_best_pipeline(df: pd.DataFrame, features, binary_features, categorical_features, numeric_features, selected_models, target: str = 'is_high_risk'):
    """Clean the data, fit the preprocessing, train the candidate models and save the best one.

    The frame is cleaned with ``clean_credit_data``, split into train/test
    (stratified on the target, ``random_state=42``), encoded/scaled with a
    ``ColumnTransformer`` fitted on the training part, and the training part is
    balanced with ``balance_classes_smote``. The models named in
    ``selected_models`` are then trained via ``train_and_evaluate_models`` and
    the winner is the model with the highest "Recall (1)", ties being broken by
    the highest "F1-score (1)".

    Parameters
    ----------
    df : pd.DataFrame
        Raw features plus the target column.
    features : list of str
        Column names (as they exist after cleaning) selected as model input.
    binary_features, categorical_features, numeric_features : list of str
        Columns routed to ``OrdinalEncoder``, ``OneHotEncoder`` and
        ``MinMaxScaler`` respectively.
    selected_models : list of str
        Model names forwarded to ``train_and_evaluate_models``.
    target : str, default 'is_high_risk'
        Name of the target column.

    Returns
    -------
    tuple
        ``(results, best_model_name)`` where ``results`` is the first value
        returned by ``train_and_evaluate_models``.

    Side effects
    ------------
    Writes ``cc_risk_preprocessor.joblib``, ``cc_risk_model_<name>.joblib`` and
    ``cc_risk_model_path.txt`` to the current working directory and prints a
    progress log plus the results table.
    """
    print("🔄 Cleaning and preprocessing input data...")
    df_cleaned = clean_credit_data(df)

    print("✅ Data cleaning complete. Preparing features and target...")

    X = df_cleaned[features]
    y = df_cleaned[target]

    preprocessor = ColumnTransformer([
        ('bin', OrdinalEncoder(), binary_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
        ('num', MinMaxScaler(), numeric_features)
    ])

    print("\n=== Splitting data and applying preprocessing...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
    # Fit on the training part only so no statistics leak in from the test part
    X_train_prep = pd.DataFrame(
        preprocessor.fit_transform(X_train),
        columns=preprocessor.get_feature_names_out(),
        index=X_train.index
    )
    # NOTE(review): X_test_prep / y_test are never handed to the evaluation below, so the
    # reported metrics do not come from this hold-out split — confirm what
    # train_and_evaluate_models evaluates on and pass the hold-out data if it supports it.
    X_test_prep = pd.DataFrame(
        preprocessor.transform(X_test),
        columns=preprocessor.get_feature_names_out(),
        index=X_test.index
    )

    print("\n=== Applying SMOTE to balance training data...")
    X_train_bal, y_train_bal = balance_classes_smote(X_train_prep, y_train)
    print(y_train_bal.value_counts())

    print("\n=== Training and evaluating models...")
    results, trained_models = train_and_evaluate_models(X_train_bal, y_train_bal, selected_models=selected_models)

    # Single selection pass: highest recall on the positive class wins, ties are broken
    # by F1-score. (An earlier duplicate pass that picked by F1 alone and was immediately
    # overwritten has been removed.)
    print("\n=== Model training complete. Selecting best model...")
    results_df = results_to_dataframe(results).sort_values("Recall (1)", ascending=False)
    top_recall = results_df.iloc[0]['Recall (1)']
    top_models = results_df[results_df['Recall (1)'] == top_recall]
    top_models = top_models.sort_values("F1-score (1)", ascending=False)
    best_model_name = top_models.iloc[0]['Model']
    best_model = trained_models[best_model_name]

    print("\n=== Saving best model...")
    joblib.dump(preprocessor, "cc_risk_preprocessor.joblib")
    model_path = f"cc_risk_model_{best_model_name.lower().replace(' ', '_')}.joblib"
    joblib.dump(best_model, model_path)
    # The model file name depends on the winner, so its path is recorded for later loading
    with open("cc_risk_model_path.txt", "w") as f:
        f.write(model_path)

    print("\n=== Preprocessor and best model saved.")
    print(f"🏆 Best model: {best_model_name}")
    print(f"📦 Model path: {model_path}")
    print("\n*** Top models by Recall (1):")
    print(top_models[['Model', 'Recall (1)', 'F1-score (1)']].to_string(index=False))
    print("\n*** All model's output")
    display(results_df)

    return results, best_model_name

### Train & Save Best Model

In [None]:
# Train all candidate models and persist the winner together with the preprocessor
results_dict, best_model_name = train_and_save_best_pipeline(
    df=df,
    features=all_features,
    binary_features=binary_features,
    categorical_features=categorical_features,
    numeric_features=numeric_features,
    selected_models=selected_models,
    target='is_high_risk',
)


### Read best model result to generate insights

In [None]:
# Sanity check: the selected model name must be one of the keys of the results dict
print("Available model keys:", list(results_dict.keys()))
print("Best model selected:", best_model_name)


In [None]:
# Load the best saved risk model; its file path was recorded in a text file at training time
with open("cc_risk_model_path.txt", "r") as path_file:
    model_path = path_file.read().strip()
best_model = joblib.load(model_path)

# Unpack the confusion matrix of the best model into its four cells
conf_matrix = results_dict[best_model_name]["confusion_matrix"]
if hasattr(conf_matrix, "ravel"):
    # array-like: flatten row by row
    tn, fp, fn, tp = conf_matrix.ravel()
else:
    # nested list: concatenate the rows
    tn, fp, fn, tp = sum(conf_matrix, [])

## Business Findings and Recommendations

---
- **Typical profile of an applicant is:**
> *a Female in her early 40s, married with a partner and no child. She has been employed for 5 years with a salary of 157,500. She has completed her secondary education. She does not own a car but owns a property (a house/apartment). Her account is 26 months old.*
---

In [None]:
# Turn the four confusion-matrix cells of the best model into plain-language actions
print(f"\n🏆 Best Model: {best_model_name}\n")
print(f"""
📊 Actionable Insights (Credit Card Approval Risk Prediction)

✅ Approve low-risk customers confidently (TN): {tn}
   → Correctly predicted as low-risk. Proceed with approval.

🚫 Block high-risk applications effectively (TP): {tp}
   → Correctly flagged as high-risk. Decline or refer for manual review.

⚠️ Missed high-risk approvals (FN): {fn}
   → High-risk customers approved. Investigate features or retrain to reduce false negatives.

📉 Unnecessary rejections (FP): {fp}
   → Low-risk applicants incorrectly flagged. Consider tuning threshold or feature set.
""")

## Predict New Data Using the Best Model

### Define predict_on_raw Function

In [None]:
def predict_on_raw(raw_df: pd.DataFrame):
    """Score raw application rows with the saved preprocessor and model.

    Loads ``cc_risk_preprocessor.joblib`` and the model whose path is stored in
    ``cc_risk_model_path.txt``, cleans ``raw_df`` with ``clean_credit_data``,
    transforms it and returns a frame with the model's predicted label, the
    positive-class probability in percent and a "Low Risk"/"High Risk" category.

    NOTE: a later cell of this notebook defines ``predict_on_raw`` again (with a
    custom probability threshold); when the notebook is run top to bottom that
    later definition replaces this one.
    """
    print("\n=== Loading saved model and preprocessor...")
    fitted_preprocessor = joblib.load("cc_risk_preprocessor.joblib")
    with open("cc_risk_model_path.txt", "r") as path_file:
        saved_model_path = path_file.read().strip()
    risk_model = joblib.load(saved_model_path)

    print("\n=== Cleaning and transforming raw input...")
    cleaned = clean_credit_data(raw_df)

    # Every cleaned column except the target is offered to the preprocessor
    feature_columns = cleaned.columns.difference(['is_high_risk']).tolist()
    model_input = cleaned[feature_columns]

    transformed = fitted_preprocessor.transform(model_input)
    X_prepped = pd.DataFrame(
        transformed,
        columns=fitted_preprocessor.get_feature_names_out(),
        index=model_input.index,
    )

    print("\n🔮 Making predictions...")
    labels = risk_model.predict(X_prepped)
    positive_proba = risk_model.predict_proba(X_prepped)[:, 1]

    # Labeled and formatted output
    scored = pd.DataFrame({
        "prediction_label": labels,
        "risk_probability (%)": (positive_proba * 100).round(2),
    })
    return scored.assign(
        risk_category=scored["prediction_label"].map({0: "Low Risk", 1: "High Risk"})
    )


In [None]:
def predict_on_raw(raw_df: pd.DataFrame, threshold: float = 0.9):
    """Score raw application rows and label them with a custom probability threshold.

    Loads ``cc_risk_preprocessor.joblib`` and the model whose path is stored in
    ``cc_risk_model_path.txt``, cleans ``raw_df`` with ``clean_credit_data``,
    transforms it and labels a row as high risk when its positive-class
    probability is at least ``threshold``.

    Parameters
    ----------
    raw_df : pd.DataFrame
        Raw application rows (same layout as the training input, target optional).
    threshold : float, default 0.9
        Minimum positive-class probability for a "High Risk" label. Lower it to
        flag more applicants (higher recall), raise it to flag fewer.

    Returns
    -------
    pd.DataFrame
        Columns ``prediction_label`` (0/1), ``risk_probability (%)`` and
        ``risk_category`` ("Low Risk"/"High Risk"), one row per input row.

    Raises
    ------
    ValueError
        If ``threshold`` is outside ``[0, 1]``.
    """
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(f"threshold must be within [0, 1], got {threshold!r}")

    print("\n📥 Loading saved model and preprocessor...")
    preprocessor = joblib.load("cc_risk_preprocessor.joblib")
    with open("cc_risk_model_path.txt", "r") as f:
        model_path = f.read().strip()
    model = joblib.load(model_path)

    print("\n🧼 Cleaning and transforming raw input...")
    df_cleaned = clean_credit_data(raw_df)

    # Every cleaned column except the target is offered to the preprocessor
    features = df_cleaned.columns.difference(['is_high_risk']).tolist()
    X_input = df_cleaned[features]

    X_prepped = pd.DataFrame(
        preprocessor.transform(X_input),
        columns=preprocessor.get_feature_names_out(),
        index=X_input.index
    )

    print("\n🔮 Making predictions...")
    probabilities = model.predict_proba(X_prepped)[:, 1]
    print("\nPredicted probabilities:", probabilities.round(4))

    # Labels come from the thresholded probability, not from model.predict()
    predictions = (probabilities >= threshold).astype(int)

    result = pd.DataFrame({
        "prediction_label": predictions,
        "risk_probability (%)": (probabilities * 100).round(2)
    })
    result["risk_category"] = result["prediction_label"].map({0: "Low Risk", 1: "High Risk"})
    return result

### Define raw input data (or read it from a file / DataFrame)

In [None]:
# Example applicants in the raw application-form layout (before cleaning).
# Category strings must match the dataset's spelling exactly: a value the one-hot
# encoder has not seen is silently encoded as all zeros. The dataset's family-status
# category is "Single / not married", not "Single".
sample_applicants = [
    {
        "CODE_GENDER": "M",
        "FLAG_EMAIL": 0,
        "NAME_INCOME_TYPE": "Working",
        "NAME_FAMILY_STATUS": "Single / not married",
        "NAME_EDUCATION_TYPE": "Higher education",
        "NAME_HOUSING_TYPE": "House / apartment",
        "AMT_INCOME_TOTAL": 120000,
        "DAYS_BIRTH": -12000,
        "DAYS_EMPLOYED": -3000,
        "CNT_CHILDREN": 5
    },
    {
        "CODE_GENDER": "F",
        "FLAG_EMAIL": 1,
        "NAME_INCOME_TYPE": "Pensioner",
        "NAME_FAMILY_STATUS": "Widow",
        "NAME_EDUCATION_TYPE": "Secondary / secondary special",
        "NAME_HOUSING_TYPE": "Municipal apartment",
        "AMT_INCOME_TOTAL": 80000,
        "DAYS_BIRTH": -24000,
        "DAYS_EMPLOYED": -10000,
        "CNT_CHILDREN": 0
    },
    {
        "CODE_GENDER": "M",
        "FLAG_EMAIL": 1,
        "NAME_INCOME_TYPE": "Student",
        "NAME_FAMILY_STATUS": "Single / not married",
        "NAME_EDUCATION_TYPE": "Incomplete higher",
        "NAME_HOUSING_TYPE": "With parents",
        "AMT_INCOME_TOTAL": 30000,
        "DAYS_BIRTH": -9000,
        "DAYS_EMPLOYED": -200,
        "CNT_CHILDREN": 0
    },
    {
        "CODE_GENDER": "F",
        "FLAG_EMAIL": 0,
        "NAME_INCOME_TYPE": "Commercial associate",
        "NAME_FAMILY_STATUS": "Married",
        "NAME_EDUCATION_TYPE": "Higher education",
        "NAME_HOUSING_TYPE": "Rented apartment",
        "AMT_INCOME_TOTAL": 200000,
        "DAYS_BIRTH": -18000,
        "DAYS_EMPLOYED": -4000,
        "CNT_CHILDREN": 2
    },
    {
        "CODE_GENDER": "F",
        "FLAG_EMAIL": 1,
        "NAME_INCOME_TYPE": "Student",
        "NAME_FAMILY_STATUS": "Single / not married",
        "NAME_EDUCATION_TYPE": "Secondary / secondary special",
        "NAME_HOUSING_TYPE": "With parents",
        "AMT_INCOME_TOTAL": 0,
        "DAYS_BIRTH": -5000,
        "DAYS_EMPLOYED": 0,
        "CNT_CHILDREN": 0
    },
]

# Single-applicant example: the first record only
raw_data = pd.DataFrame(sample_applicants[:1])

# Batch example: all five records
mraw_data = pd.DataFrame(sample_applicants)



### Predict is_high_risk

In [None]:
# Score the single-applicant example
result_df = predict_on_raw(raw_data)
display(result_df)

In [None]:
# Score the five-applicant batch example
result1_df = predict_on_raw(mraw_data)
display(result1_df)