In [17]:
# Import necessary library
import pandas as pd
import numpy as np
import random
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter, MaxNLocator
from IPython.display import Markdown, display

In [18]:
# Load CSV file from your local path
file_path = r"C:\Users\danie\Downloads\Git\cr_loan2.csv"  # Use raw string (r"") to avoid path errors
cr_loan = pd.read_csv(file_path)

In [19]:
# Check the structure of the data
print(cr_loan.dtypes)

# Display first few rows
display(cr_loan.head())

person_age                      int64
person_income                   int64
person_home_ownership          object
person_emp_length             float64
loan_intent                    object
loan_grade                     object
loan_amnt                       int64
loan_int_rate                 float64
loan_status                     int64
loan_percent_income           float64
cb_person_default_on_file      object
cb_person_cred_hist_length      int64
dtype: object


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [20]:
# Creating an expanded, UK-based, bootstrapped dataset (~500k rows)
# - Reads the uploaded file at /mnt/data/cr_loan2.csv
# - Bootstraps (resamples with replacement) up to 500,000 rows
# - Preserves original distributions where possible, adds small noise to numeric columns for realism
# - Detects an interest-rate-like column and remaps percentiles to Lloyds personal loan APR range (6.4% - 14.9%)
# - Adds UK-specific synthetic columns: country, region, city
# - Clips numeric columns to reasonable UK-realistic bounds where applicable and saves output CSV
# - Shows a preview and basic summaries

import pandas as pd
import numpy as np
import os

RANDOM_STATE = 42
INPUT_PATH = r"C:\Users\danie\Downloads\Git\cr_loan2.csv" 
OUTPUT_PATH = r"C:\Users\danie\Downloads\Git\credit_loan_generation.csv" 
TARGET_N = 500_000

# Read file
df = pd.read_csv(INPUT_PATH)
orig_n = len(df)
print(f"Original rows: {orig_n:,}  |  Target rows: {TARGET_N:,}")

# Bootstrap sample to reach ~500k rows (with replacement)
np.random.seed(RANDOM_STATE)
if orig_n == 0:
    raise ValueError("Input file seems empty. Please upload a valid CSV with data.")

sampled = df.sample(n=TARGET_N, replace=True, random_state=RANDOM_STATE).reset_index(drop=True)

# Add small gaussian noise to numeric columns for realism (without breaking categorical identity)
num_cols = sampled.select_dtypes(include=[np.number]).columns.tolist()

# Avoid touching id-like columns if present (heuristic: 'id' in name)
num_cols = [c for c in num_cols if 'id' not in c.lower()]

# ⚠️ FIX START: Exclude loan_status column from noise injection
if 'loan_status' in num_cols:
    num_cols.remove('loan_status')
# ⚠️ FIX END

for c in num_cols:
    col = sampled[c]
    std = col.std(skipna=True)
    if pd.isna(std) or std == 0:
        continue
    noise_scale = std * 0.01  # 1% of original std
    sampled[c] = col + np.random.normal(loc=0, scale=noise_scale, size=len(sampled))

# ⚠️ FIX ADDITION: Enforce loan_status as binary 0/1 integers
if 'loan_status' in sampled.columns:
    sampled['loan_status'] = sampled['loan_status'].round().clip(0, 1).astype(int)
# ⚠️ FIX END

# Detect interest-rate-like column
interest_col = None
candidates = [c for c in sampled.columns if any(k in c.lower() for k in ['interest','int_rate','apr','rate'])]
if len(candidates) == 1:
    interest_col = candidates[0]
elif len(candidates) > 1:
    for cand in candidates:
        if 'interest' in cand.lower() and 'rate' in cand.lower():
            interest_col = cand
            break
    interest_col = interest_col or candidates[0]

if interest_col is None:
    print("No obvious interest-rate column found. Creating synthetic 'interest_rate' from loan amount or uniform distribution.")
    if any(k for k in sampled.columns if 'loan' in k.lower() and sampled[k].dtype.kind in 'biufc'):
        loan_col = [k for k in sampled.columns if 'loan' in k.lower() and sampled[k].dtype.kind in 'biufc'][0]
        loan_amt = sampled[loan_col].fillna(sampled[loan_col].median())
        pct = (loan_amt.rank(method='average', pct=True)).values
        sampled['interest_rate'] = 6.4 + pct * (14.9 - 6.4) + np.random.normal(0, 0.2, size=len(sampled))
        interest_col = 'interest_rate'
    else:
        sampled['interest_rate'] = np.random.uniform(6.4, 14.9, size=len(sampled))
        interest_col = 'interest_rate'

print(f"Using interest column: {interest_col}")

# Map interest ranks to Lloyds APR band (6.4% - 14.9%)
ranks = sampled[interest_col].rank(method='average', pct=True).values
sampled[interest_col] = 6.4 + ranks * (14.9 - 6.4) + np.random.normal(0, 0.15, size=len(sampled))
sampled[interest_col] = sampled[interest_col].clip(6.4, 14.9).round(3)

# Add UK-specific fields: country, region, city
regions_cities = {
    "England": ["London", "Manchester", "Birmingham", "Leeds", "Liverpool", "Bristol", "Newcastle", "Sheffield", "Southampton", "Nottingham"],
    "Scotland": ["Edinburgh", "Glasgow", "Aberdeen", "Dundee", "Inverness"],
    "Wales": ["Cardiff", "Swansea", "Newport", "Wrexham"],
    "Northern Ireland": ["Belfast", "Derry", "Lisburn", "Newry"]
}

regions = list(regions_cities.keys())
region_weights = np.array([0.84, 0.08, 0.05, 0.03])
region_choices = np.random.choice(regions, size=TARGET_N, p=region_weights)

cities = [np.random.choice(regions_cities[r]) for r in region_choices]
sampled['country'] = "United Kingdom"
sampled['region'] = region_choices
sampled['city'] = cities

# === ADD NEW SYNTHETIC DEMOGRAPHICS ===
sampled['gender'] = np.random.choice(['Male', 'Female'], size=TARGET_N, p=[0.49, 0.51]) # Gender distribution
sampled['marriage_status'] = np.random.choice([0, 1], size=TARGET_N, p=[0.45, 0.55])  # 0=single, 1=married

# === CLIP / SANITIZE NUMERIC COLUMNS ===
if 'person_age' in sampled.columns:
    sampled['person_age'] = sampled['person_age'].round().clip(18, 85)

# === NEW SECTION: HARD-CODED EMPLOYMENT LENGTH DISTRIBUTION ===
if 'person_emp_length' in sampled.columns:
    # Create piecewise distribution based on specified proportions
    n = len(sampled)
    emp_length = np.zeros(n)

    # 50% <15 years
    idx_15 = int(0.50 * n)
    emp_length[:idx_15] = np.random.uniform(0, 15, idx_15)

    # Next 25% (15–30)
    idx_30 = int(0.75 * n)
    emp_length[idx_15:idx_30] = np.random.uniform(15, 30, idx_30 - idx_15)

    # Next 24% (30–45)
    idx_45 = int(0.99 * n)
    emp_length[idx_30:idx_45] = np.random.uniform(30, 45, idx_45 - idx_30)

    # Top 1% (45–60)
    emp_length[idx_45:] = np.random.uniform(45, 60, n - idx_45)

    np.random.shuffle(emp_length)

    # Add missing (NaN) values (~7%)
    nan_mask = np.random.rand(n) < 0.07
    emp_length[nan_mask] = np.nan

    # Clip and round
    emp_length = np.round(np.clip(emp_length, 0, 60), 1)

    # Assign
    sampled['person_emp_length'] = emp_length

    # Ensure not exceeding age - 18 for realism
    if 'person_age' in sampled.columns:
        max_possible_exp = (sampled['person_age'] - 18).clip(lower=0)
        sampled.loc[~sampled['person_emp_length'].isna(), 'person_emp_length'] = np.minimum(
            sampled.loc[~sampled['person_emp_length'].isna(), 'person_emp_length'], 
            max_possible_exp.loc[~sampled['person_emp_length'].isna()]
        )

    # Display distribution diagnostics
    desc = sampled['person_emp_length'].describe(percentiles=[0.5, 0.75, 0.99])
    print("\n-- Employment Length Distribution Summary --")
    print(desc)

# Create a simple interest band column aligned to Lloyds-like bands
bins = [0, 7.5, 9.5, 11.5, 14.9]
labels = ['6.4-7.5', '7.5-9.5', '9.5-11.5', '11.5-14.9']
sampled['interest_rate_band'] = pd.cut(sampled[interest_col], bins=bins, labels=labels, include_lowest=True, right=True)

# Save to CSV
sampled.to_csv(OUTPUT_PATH, index=False)
print(f"Saved expanded dataset to: {OUTPUT_PATH} (rows: {len(sampled):,})")

# Show a preview and summaries
preview = sampled.head(5)
summary = {
    "interest_rate_describe": sampled[interest_col].describe().to_dict(),
    "interest_rate_band_counts": sampled['interest_rate_band'].value_counts().to_dict(),
    "regions_counts": sampled['region'].value_counts().head(10).to_dict(),
    "gender_counts": sampled['gender'].value_counts().to_dict(),
    "marriage_status_counts": sampled['marriage_status'].value_counts().to_dict(),
    "numeric_sample_preview": sampled[num_cols].describe().loc[['mean','std']].to_dict() if num_cols else {}
}

try:
    from caas_jupyter_tools import display_dataframe_to_user
    display_dataframe_to_user("Expanded sample preview (first 100 rows)", sampled.head(100))
except Exception:
    pass

print("\n-- Interest rate summary --")
print(pd.Series(sampled[interest_col]).describe())
print("\n-- Gender distribution --")
print(sampled['gender'].value_counts(normalize=True).round(3))
print("\n-- Marriage status distribution --")
print(sampled['marriage_status'].value_counts(normalize=True).round(3))
print("\n-- Interest rate band counts --")
print(sampled['interest_rate_band'].value_counts())
print(f"\nDownload: sandbox:{OUTPUT_PATH}")
print("\n-- Columns in expanded dataset --")
print(sampled.columns.tolist())

sampled.head(10)

Original rows: 32,581  |  Target rows: 500,000
Using interest column: loan_int_rate

-- Employment Length Distribution Summary --
count    465133.000000
mean          7.612071
std           5.009417
min           0.000000
50%           6.000000
75%          10.000000
99%          25.000000
max          57.000000
Name: person_emp_length, dtype: float64
Saved expanded dataset to: C:\Users\danie\Downloads\Git\credit_loan_generation.csv (rows: 500,000)

-- Interest rate summary --
count    452026.000000
mean         10.650164
std           2.456047
min           6.400000
25%           8.526000
50%          10.649000
75%          12.775000
max          14.900000
Name: loan_int_rate, dtype: float64

-- Gender distribution --
gender
Female    0.51
Male      0.49
Name: proportion, dtype: float64

-- Marriage status distribution --
marriage_status
1    0.55
0    0.45
Name: proportion, dtype: float64

-- Interest rate band counts --
interest_rate_band
11.5-14.9    180805
7.5-9.5      106565
9.5-

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,country,region,city,gender,marriage_status,interest_rate_band
0,27.0,64364.269183,MORTGAGE,9.0,VENTURE,A,7991.286447,7.896,0,0.119503,N,7.955148,United Kingdom,England,Southampton,Male,0,7.5-9.5
1,26.0,174509.456983,MORTGAGE,8.0,PERSONAL,C,15002.972029,,0,0.091129,N,2.928321,United Kingdom,England,Birmingham,Female,1,
2,26.0,148408.598456,RENT,8.0,DEBTCONSOLIDATION,E,19962.726928,14.9,1,0.14034,N,2.986315,United Kingdom,England,London,Male,0,11.5-14.9
3,25.0,69249.920195,RENT,,HOMEIMPROVEMENT,A,8518.077401,7.295,1,0.119508,N,4.020352,United Kingdom,Wales,Swansea,Female,0,6.4-7.5
4,39.0,38752.646731,MORTGAGE,21.0,MEDICAL,C,3344.44028,13.259,0,0.089805,Y,17.017967,United Kingdom,England,Manchester,Male,0,11.5-14.9
5,30.0,34709.26723,RENT,3.2,MEDICAL,A,6055.390144,7.96,0,0.169961,N,8.979475,United Kingdom,England,London,Female,0,7.5-9.5
6,24.0,77877.054269,MORTGAGE,6.0,DEBTCONSOLIDATION,A,3639.235228,6.557,0,0.049156,N,3.998463,United Kingdom,England,Birmingham,Male,0,6.4-7.5
7,26.0,72223.483043,MORTGAGE,8.0,EDUCATION,A,10676.569453,6.932,0,0.149945,N,3.958605,United Kingdom,Northern Ireland,Newry,Male,1,6.4-7.5
8,31.0,53315.551075,MORTGAGE,13.0,PERSONAL,B,15298.048182,12.155,0,0.290319,N,8.04493,United Kingdom,England,Sheffield,Male,0,11.5-14.9
9,26.0,30749.572854,RENT,8.0,DEBTCONSOLIDATION,A,5984.79579,8.221,0,0.20172,N,3.000758,United Kingdom,England,London,Female,1,7.5-9.5


In [21]:
sampled.shape

(500000, 18)

In [22]:
sampled['person_emp_length'].describe

<bound method NDFrame.describe of 0          9.0
1          8.0
2          8.0
3          NaN
4         21.0
          ... 
499995     4.0
499996     3.6
499997     5.0
499998     5.0
499999     8.0
Name: person_emp_length, Length: 500000, dtype: float64>