# Credit Scoring Model — Starter Notebook
This notebook walks you through building a **credit scoring** classifier that predicts whether a customer will **default (1)** or **not default (0)**. You can use the provided synthetic dataset `credit_synthetic.csv` or swap in a real dataset with similar columns.

**Steps:** data loading → EDA → preprocessing → baseline models → evaluation → tuning → saving model → inference.

In [None]:
# Imports
import warnings; warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
import joblib

# Path of the CSV that the data-loading cell passes to pd.read_csv. The file
# must contain a `default` column, which later cells use as the target.
DATA_PATH = 'credit_synthetic.csv'  # change to your CSV path if needed


In [None]:
# Read the dataset from DATA_PATH into a DataFrame, report its
# (rows, columns) shape, and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print(df.shape)
df.head(n=5)


In [None]:
# Quick EDA: column names, per-column missing-value counts, and the
# relative frequency of each class in the `default` target column.
print("Columns:", df.columns.tolist())
print("\nMissing values per column:\n", df.isnull().sum())
print("\nClass balance (0=non-default, 1=default):\n", df['default'].value_counts(normalize=True))

# Summary statistics for every column (include='all' covers non-numeric
# columns too). Transposed so each row describes one column of df; only the
# first 20 rows of that table are shown.
df.describe(include='all').transpose().head(20)


In [None]:
# Separate the label column from the predictor columns.
target = 'default'
y = df[target]
X = df.drop(target, axis=1)

# Partition the predictors by dtype: numeric columns versus everything else.
# The two lists drive the ColumnTransformer built in the preprocessing cell.
numeric_cols = list(X.select_dtypes(include='number').columns)
categorical_cols = list(X.select_dtypes(exclude='number').columns)
numeric_cols, categorical_cols


In [None]:
# Preprocessing, applied per column group by a ColumnTransformer.

# Numeric columns: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' keeps transform from raising on
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocess = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Hold out 20% of the rows as a test set. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y,
)
X_train.shape, X_test.shape


In [None]:
# Baseline comparison: score each candidate with cross-validated ROC-AUC
# on the training split only (the test split is not touched here).
scoring = 'roc_auc'
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# All candidates use class_weight='balanced' and a fixed seed.
models = {
    'LogisticRegression': LogisticRegression(
        max_iter=400, class_weight='balanced', random_state=42,
    ),
    'DecisionTree': DecisionTreeClassifier(
        class_weight='balanced', random_state=42,
    ),
    'RandomForest': RandomForestClassifier(
        n_estimators=200, class_weight='balanced', random_state=42,
    ),
}

for name, clf in models.items():
    # Preprocessing lives inside the pipeline, so it is re-fitted on each
    # training fold rather than on the full training split.
    pipe = Pipeline([('preprocess', preprocess), ('clf', clf)])
    scores = cross_val_score(
        pipe, X_train, y_train, scoring=scoring, cv=cv, n_jobs=-1,
    )
    print(f"{name}: {scoring} CV = {scores.mean():.3f} ± {scores.std():.3f}")


In [None]:
# Fit a 300-tree RandomForest pipeline on the training split and evaluate it
# on the held-out test split.
best_clf = RandomForestClassifier(n_estimators=300, class_weight='balanced', random_state=42)
pipe = Pipeline([('preprocess', preprocess), ('clf', best_clf)]).fit(X_train, y_train)

# Hard class predictions, plus the probability in column 1 of predict_proba
# (the default=1 class, assuming the labels are 0/1 as described in the intro).
y_pred = pipe.predict(X_test)
y_proba = pipe.predict_proba(X_test)[:, 1]
cm = confusion_matrix(y_test, y_pred)

# Text metrics: per-class report, ROC-AUC from the probabilities, confusion matrix.
print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print("Confusion Matrix:\n", cm)

# ROC curve plotted from the same test-set probabilities.
RocCurveDisplay.from_predictions(y_test, y_proba)
plt.title("ROC Curve — RandomForest")
plt.show()


In [None]:
# Hyperparameter tuning: a small grid over the RandomForest step of the pipeline.
# The `clf__` prefix routes each parameter to the pipeline step named 'clf'.
# The grid has 2 * 3 * 3 * 2 = 36 combinations, each scored with 3-fold CV.
param_grid = {
    'clf__n_estimators': [200, 400],
    'clf__max_depth': [None, 8, 12],
    'clf__min_samples_split': [2, 10, 20],
    'clf__max_features': ['sqrt', 'log2'],
}

grid = GridSearchCV(
    estimator=Pipeline([
        ('preprocess', preprocess),
        ('clf', RandomForestClassifier(class_weight='balanced', random_state=42)),
    ]),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV ROC-AUC:", grid.best_score_)

# Score the best pipeline found by the search on the held-out test split.
# Note that y_proba from the earlier evaluation cell is overwritten here.
best_model = grid.best_estimator_
y_proba = best_model.predict_proba(X_test)[:, 1]
print("Test ROC-AUC with tuned model:", roc_auc_score(y_test, y_proba))


In [None]:
# Save the tuned model
# Imported here so this cell runs on its own: `Path` is not among the
# notebook's top-level imports, so using it without this line raises NameError.
from pathlib import Path

# Single source of truth for the artifact location, used for both dump and load.
model_path = Path('models/credit_scoring_model.joblib')
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)

# Example: load and predict on a single applicant
# NOTE: joblib.load unpickles arbitrary Python objects; only load model files
# that you created yourself or otherwise trust.
loaded = joblib.load(model_path)

# One-row DataFrame whose keys are the feature names the pipeline expects.
# These presumably mirror the columns of credit_synthetic.csv (minus `default`);
# adjust them if you trained on a different dataset.
example = pd.DataFrame([{
    'age': 30, 'annual_income_inr': 800000, 'debt_to_income': 0.35, 'credit_utilization': 0.4,
    'late_payments_12m': 1, 'accounts_open': 5, 'loan_amount_inr': 150000,
    'employment_length_years': 3, 'has_mortgage': 0, 'is_self_employed': 0,
    'region': 'South', 'payment_history_score': 0.72
}])

# Row 0, column 1 of predict_proba: the default=1 probability for the single
# applicant (assuming the 0/1 labels described in the intro).
prob_default = loaded.predict_proba(example)[0, 1]
print(f"Predicted probability of default: {prob_default:.3f}")
