In [1]:
import warnings
from datetime import datetime
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import ParameterSampler, cross_val_score, train_test_split
from xgboost import XGBClassifier
from joblib import dump, load

warnings.filterwarnings("ignore")

Reading the batch file from experian

In [2]:
# Define path to the Experian batch request file
file_path = (
    r"C:\Users\payju\Downloads\PJN_notebook\2024\adhoc\Mario\HC approval\input\PayJustNow_Experian_Batch_request_2025_05_09_Target_20250514_20836.txt"
)

# Read pipe-separated values into a DataFrame
# utf-8 encoding ensures proper handling of special characters
# low_memory=False helps infer column types more accurately
df = pd.read_csv(file_path, sep="|", encoding="utf-8", low_memory=False)

# Rename columns for clarity and consistency
df.rename(
    columns={
        'VAP_PrismScore_TM': 'prism_score',
        'ID_Number': 'identity_number'
    },
    inplace=True
)
# Convert identity numbers to string type to preserve leading zeros
df['identity_number'] = df['identity_number'].astype(str)
# Ensure all identity numbers are 13 characters long by adding leading zeros
df['identity_number'] = df['identity_number'].str.zfill(13)
# Display the first few rows to verify changes
df.head()

Unnamed: 0,identity_number,prism_score
0,8180791082,583.0
1,9008100867088,631.0
2,209065530088,587.0
3,8005300847085,598.0
4,8912181188085,590.0


In [3]:
def calculate_age(id_no: str) -> int | None:
    """
    Derive age from a South African ID number.

    Args:
        id_no (str): 13-digit ZA ID string (YYMMDDxxxxxxx).

    Returns:
        int | None: Age in years if valid; otherwise None.
    """
    s = str(id_no).zfill(13)
    if len(s) != 13 or not s.isdigit():
        return None

    yy, mm, dd = map(int, (s[:2], s[2:4], s[4:6]))
    year = (2000 if yy <= datetime.today().year % 100 else 1900) + yy
    today = datetime.today()

    try:
        birth = datetime(year, mm, dd)
        age = (
            today.year - birth.year
            - ((today.month, today.day) < (birth.month, birth.day))
        )
    except ValueError:
        age = today.year - year

    return age


def predict_approval_probabilities(model: XGBClassifier, data: pd.DataFrame) -> pd.DataFrame:
    """
    Generate approval probability scores for new applicants.

    Args:
        model (XGBClassifier): Trained XGBoost classifier.
        data (pd.DataFrame): New dataset with required columns.

    Returns:
        pd.DataFrame: Contains identity_number, features, and approval_probability.

    Raises:
        ValueError: If any of 'prism_score', 'age', or 'prism_band_encoded' is missing.
    """
    required = ['prism_score', 'age', 'prism_band_encoded']
    missing = [col for col in required if col not in data.columns]
    if missing:
        raise ValueError(f"Missing columns: {', '.join(missing)}")

    probs = model.predict_proba(data[required])[:, 1]
    out = data.copy()
    out['approval_probability'] = probs

    return out[['identity_number', *required, 'approval_probability']]


def save_approved_users(data: pd.DataFrame, threshold: float, output_dir: str) -> str:
    """
    Filter and save approved users to CSV.

    Args:
        data (pd.DataFrame): DataFrame with 'approval_probability'.
        threshold (float): Minimum probability to mark as approved.
        output_dir (str): Directory path to save output CSV.

    Returns:
        str: File path of the saved CSV.

    Raises:
        OSError: If the directory cannot be created.
    """
    approved = data[data['approval_probability'] > threshold].copy()
    approved['approved'] = 1
    os.makedirs(output_dir, exist_ok=True)
    today = datetime.now().strftime('%Y-%m-%d')
    file_path = os.path.join(output_dir, f"HC_approved_users_{today}.csv")
    approved[['identity_number', 'prism_score', 'approved']].to_csv(file_path, index=False)
    return file_path


def main():
    # Load DataFrame 'df' from previous processing steps
    # Assumes 'df' already loaded with 'identity_number' and 'prism_score'

    # Load trained model
    model = load(r"model\best_model.joblib")

    # Preprocess IDs and compute age
    valid_mask = df['identity_number'].astype(str).str.fullmatch(r"\d{13}")
    data = df[valid_mask].copy()
    data['age'] = data['identity_number'].apply(calculate_age)

    # Encode prism score bands
    bins = [-np.inf, 4, *range(560, 721, 10), np.inf]
    labels = ['Thin files', '<=560'] + [f"{i+1}–{i+10}" for i in range(560, 720, 10)] + ['721+']
    band_map = {band: idx for idx, band in enumerate(labels)}
    data['prism_score_band'] = pd.cut(data['prism_score'], bins=bins, labels=labels, include_lowest=True)
    data['prism_band_encoded'] = data['prism_score_band'].map(band_map)
    data = data.dropna(subset=['prism_band_encoded'])

    # Predict and save
    predictions = predict_approval_probabilities(model, data)
    output_dir = r"\output_data"
    path = save_approved_users(predictions, threshold=0.6, output_dir=output_dir)
    print(f"Saved approved users to: {path}")

    # Show first 20 approved users
    approved = predictions[predictions['approval_probability'] > 0.6].copy()
    approved['approved'] = 1
    print(approved[['identity_number', 'prism_score', 'approved']].head(20))


In [4]:
if __name__ == "__main__":
    main()

Saved approved users to: \output_data\HC_approved_users_2025-05-16.csv
   identity_number  prism_score  approved
1    9008100867088        631.0         1
3    8005300847085        598.0         1
4    8912181188085        590.0         1
5    6112250539084        605.0         1
10   8807150413086        592.0         1
11   9912125911088        594.0         1
13   9002121307083        619.0         1
14   8301235394081        617.0         1
19   8308120939086        587.0         1
22   9007061095085        591.0         1
23   7309121066086        619.0         1
25   9901110832086        657.0         1
26   9605271376086        590.0         1
29   9808145688088        628.0         1
31   6806250890088        609.0         1
32   8302200461087        619.0         1
33   9301015905089        597.0         1
35   6405010319080        609.0         1
36   5901130753084        605.0         1
37   7303140681082        596.0         1
