In [3]:
import os, json, pandas as pd, numpy as np
from pathlib import Path

# Notebook configuration: location of the cached raw dataset, plus the output
# folders (relative to the current working directory) created up front.
DATA_PATH = Path("data/raw/german_credit.csv")
for _out_dir in ("reports", "splits"):
    Path(_out_dir).mkdir(parents=True, exist_ok=True)


In [4]:
from sklearn.datasets import fetch_openml

# Load the German Credit data into `df`. A local CSV acts as a cache; when it
# is absent the data is fetched from OpenML, reshaped, and written to the cache.
DATA_PATH = Path("data/raw/german_credit.csv")

if DATA_PATH.exists():
    # Cached copy from an earlier run; expected to already hold the `default` column.
    df = pd.read_csv(DATA_PATH)
else:
    try:
        # Look the dataset up by name. The version is pinned because OpenML
        # lists more than one active version of "credit-g"; version 1 is the
        # one with data_id 31, so both lookups address the same data.
        data = fetch_openml(name="credit-g", version=1, as_frame=True, parser="pandas")
    except Exception:
        # Best-effort fallback: address the same dataset by its numeric ID.
        data = fetch_openml(data_id=31, as_frame=True, parser="pandas")

    fetched = data.frame
    if "class" not in fetched.columns:
        # Fail before anything is persisted, so a frame without a target
        # never ends up in the on-disk cache.
        raise KeyError("expected a 'class' column in the OpenML frame to build the 'default' target")

    df = (
        fetched
        # binary target: 1 where the original label is "bad", else 0
        .assign(default=(fetched["class"] == "bad").astype(int))
        .drop(columns=["class"])
        # tidy names: dots become underscores, everything lower-case
        .rename(columns=lambda c: c.replace(".", "_").lower())
    )

    # persist a clean copy for subsequent runs
    DATA_PATH.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(DATA_PATH, index=False)

df.shape, df["default"].mean()



- version 1, status: active
  url: https://www.openml.org/search?type=data&id=31
- version 2, status: active
  url: https://www.openml.org/search?type=data&id=44096



((1000, 21), 0.3)

In [7]:
from pandas.api.types import is_numeric_dtype, is_categorical_dtype, is_object_dtype, is_bool_dtype

# Structural EDA: split the feature columns into numeric vs. categorical,
# compute per-column missing counts and cardinality, and show a short summary.
print("shape:", df.shape)
print("target rate:", df["default"].mean())

feature_cols = [c for c in df.columns if c != "default"]

# is_numeric_dtype() is also True for bool columns. Bools are treated as
# categorical below, so they are excluded here to keep the two lists disjoint.
numeric = [
    c for c in feature_cols
    if is_numeric_dtype(df[c]) and not is_bool_dtype(df[c])
]
# Categorical-like columns: category dtype, object, string dtype, or bool.
# The string check covers frames whose text columns are not plain `object`.
categorical = [
    c for c in feature_cols
    if (
        isinstance(df[c].dtype, pd.CategoricalDtype)
        or is_object_dtype(df[c])
        or pd.api.types.is_string_dtype(df[c])
        or is_bool_dtype(df[c])
    )
]

# Both series cover every column (target included), largest values first.
missing = df.isna().sum().sort_values(ascending=False)
cardinality = df.nunique().sort_values(ascending=False)

summary = {
    "rows": int(df.shape[0]),
    "cols": int(df.shape[1]),
    "positive_rate": float(df["default"].mean()),
    "n_numeric": len(numeric),
    "n_categorical": len(categorical),
}

display(df.head(3))
summary

shape: (1000, 21)
target rate: 0.3


Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,default
0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,0
1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,1
2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,0


{'rows': 1000,
 'cols': 21,
 'positive_rate': 0.3,
 'n_numeric': 7,
 'n_categorical': 13}

In [8]:
# Write the headline EDA numbers, the feature lists, and the ten columns with
# the most missing values / highest cardinality to reports/eda_summary.json.
os.makedirs("reports", exist_ok=True)

eda = dict(
    rows=int(df.shape[0]),
    cols=int(df.shape[1]),
    positive_rate=float(df["default"].mean()),
    numeric=numeric,
    categorical=categorical,
    top_missing=missing.head(10).to_dict(),
    top_cardinality=cardinality.head(10).to_dict(),
)

with open("reports/eda_summary.json", "w") as eda_file:
    json.dump(eda, eda_file, indent=2)

print("saved: reports/eda_summary.json")

saved: reports/eda_summary.json


In [9]:
from sklearn.model_selection import train_test_split
# Stratified 60/20/20 train/valid/test split. Only the row indices of each
# part are written to disk, one CSV per split.
os.makedirs("splits", exist_ok=True)

X = df.drop(columns=["default"])
y = df["default"]

# Step 1: hold out 40% of the rows, keeping the target rate via stratification.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.40,
    stratify=y,
    random_state=42,
)
# Step 2: cut the held-out 40% in half to get validation and test parts.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=42,
)

for split_name, split_X in (("train", X_train), ("valid", X_valid), ("test", X_test)):
    idx_frame = pd.DataFrame({"index": split_X.index})
    idx_frame.to_csv(f"splits/{split_name}_idx.csv", index=False)

print("splits saved")


splits saved


In [10]:
# Write the numeric / categorical feature lists and the target column name
# to reports/feature_schema.json.
schema = dict(numeric=numeric, categorical=categorical, target="default")

with open("reports/feature_schema.json", "w") as schema_file:
    json.dump(schema, schema_file, indent=2)

print("saved: reports/feature_schema.json")


saved: reports/feature_schema.json


In [1]:
from pathlib import Path
# Diagnostic: show the working directory and whether the project folder
# exists under the user's Documents directory.
repo_candidate = Path.home().joinpath("Documents", "credit-risk-scorecard")
print("CWD:", Path.cwd())
print("Repo exists here? ", repo_candidate.exists())


CWD: C:\Users\balla\Documents\credit-risk-scorecard\notebooks
Repo exists here?  True


In [None]:
import shutil

# Move artifacts that were written next to the notebook (under notebooks/)
# to the matching folders at the repository root.
#
# The repo root is derived from the working directory instead of a fixed
# location under the user's home folder: when the notebook runs from
# <repo>/notebooks the root is the parent directory, otherwise the working
# directory itself is taken as the root.
_cwd = Path.cwd()
REPO = _cwd.parent if _cwd.name == "notebooks" else _cwd
NB   = REPO / "notebooks"

# sources (written relative to the notebooks/ folder)
src_csv     = NB / "data" / "raw" / "german_credit.csv"
src_splits  = NB / "splits"
src_reports = NB / "reports"

# destinations (repo root)
dst_csv     = REPO / "data" / "raw" / "german_credit.csv"
dst_splits  = REPO / "splits"
dst_reports = REPO / "reports"

for _dst_dir in (dst_csv.parent, dst_splits, dst_reports):
    _dst_dir.mkdir(parents=True, exist_ok=True)

# move CSV
if src_csv.exists():
    shutil.move(str(src_csv), str(dst_csv))

# move split CSVs and report JSONs; each source folder is optional
for _src_dir, _pattern, _dst_dir in (
    (src_splits, "*.csv", dst_splits),
    (src_reports, "*.json", dst_reports),
):
    if _src_dir.exists():
        # list() materialises the matches before any file is moved
        for p in list(_src_dir.glob(_pattern)):
            shutil.move(str(p), str(_dst_dir / p.name))

print("moved files (if any) to repo root")
