# Loan Approval Prediction — ML Pipeline Notebook

**Goal:** Build a reproducible ML pipeline (preprocessing → model → tuning → save) for the Loan Approval classification task.

In [None]:
# 1) Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, classification_report)
from sklearn.inspection import permutation_importance

# Notebook-wide configuration.
RANDOM_STATE = 42  # single seed reused by the split, the model and permutation importance
DATA_PATH = "../data/train.csv"  # training CSV, relative to the working directory
sns.set(style='whitegrid')  # seaborn theme applied to every plot below

## Load data

In [None]:
# Load the raw dataset from DATA_PATH.
df = pd.read_csv(DATA_PATH)

# The target column is mandatory: fail early with a clear message if absent.
if 'Loan_Status' not in df.columns:
    raise KeyError("Loan_Status not found. Make sure your CSV includes it.")

df = df.copy()

# Encode the target: 'Y' -> 1, 'N' -> 0.
# Series.map turns every value outside the mapping (including missing
# labels) into NaN. Validate here so a bad label is reported immediately
# instead of surfacing later as an obscure error in the split or the fit.
encoded_target = df['Loan_Status'].map({'Y': 1, 'N': 0})
unmapped = encoded_target.isna()
if unmapped.any():
    bad_labels = df.loc[unmapped, 'Loan_Status'].unique().tolist()
    raise ValueError(
        f"Loan_Status must contain only 'Y' or 'N'; found {bad_labels} "
        f"in {int(unmapped.sum())} row(s)."
    )
df['Loan_Status'] = encoded_target

# 'Dependents' uses the string '3+' for three or more; collapse it to 3 so
# the column can be numeric. Anything still non-numeric becomes NaN
# (errors='coerce') and is left for the imputer to handle.
if 'Dependents' in df.columns:
    df['Dependents'] = df['Dependents'].replace('3+', 3)
    df['Dependents'] = pd.to_numeric(df['Dependents'], errors='coerce')

print('Loaded:', df.shape)

## X and y and column types


In [None]:
# Remove the Loan_ID identifier column when the CSV carries one.
df = df.drop(columns=['Loan_ID'], errors='ignore')

# Separate the features (X) from the encoded target (y).
X = df.drop(columns=['Loan_Status'])
y = df['Loan_Status']

# Group feature columns by dtype so each group gets its own preprocessing.
# NOTE: a column that is neither numeric nor object/category (e.g. bool or
# datetime) ends up in neither list.
numeric_cols = list(X.select_dtypes(include=[np.number]).columns)
cat_cols = list(X.select_dtypes(include=['object', 'category']).columns)

print('Numeric cols:', numeric_cols)
print('Categorical cols:', cat_cols)

## Build preprocessing pipelines
Numeric: median imputation + scaling (useful for linear models).  
Categorical: mode imputation + one-hot encoding.  
We combine them with `ColumnTransformer`.

In [None]:
# Numeric features: fill missing values with the column median, then
# standardise (zero mean, unit variance).
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Dense one-hot output. scikit-learn renamed the `sparse` keyword to
# `sparse_output` in 1.2 and removed the old name in 1.4, so try the new
# keyword first and fall back to the old one on older installations.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at predict
# time (handle_unknown='ignore') instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehot_encoder)
])

# Route each column group to its transformer; columns listed in neither
# group are dropped (remainder='drop').
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, cat_cols)
], remainder='drop')

print('Preprocessor ready')

## Define the full pipeline (preprocessing + classifier)
We create a pipeline containing the preprocessing step and a `RandomForestClassifier`, whose hyperparameters are tuned later with `GridSearchCV`.

In [None]:
# End-to-end estimator: raw rows go in, preprocessing runs first, then the
# random forest (seeded with RANDOM_STATE) makes the prediction.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(random_state=RANDOM_STATE)),
])

print(pipe)

## Train-test split
Use stratify to preserve class ratios.

In [None]:
# Hold out 20% of the rows for testing; stratify on y and seed the shuffle
# with RANDOM_STATE.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)
print('Train:', X_train.shape, 'Test:', X_test.shape)

## Quick cross-validation on training set
This gives a baseline for how the pipeline performs before tuning.

In [None]:
# Baseline: 5-fold cross-validated accuracy of the untuned pipeline,
# computed on the training split only.
cv_scores = cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print('CV scores:', cv_scores.round(4))
print('CV mean:', np.mean(cv_scores))

## GridSearchCV (small grid)
We tune a small set of Random Forest hyperparameters to keep compute low.

In [None]:
# Candidate hyperparameters for the 'clf' step of the pipeline
# (the `clf__` prefix targets that step's parameters).
param_grid = dict(
    clf__n_estimators=[100, 200],
    clf__max_depth=[None, 6, 12],
    clf__min_samples_leaf=[1, 3],
)

# Exhaustive search over the grid: 3-fold CV, accuracy as the selection
# metric, n_jobs=-1 to parallelise the fits.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

print('Best params:', grid.best_params_)
print('Best CV score:', grid.best_score_)

## Evaluate best model on test set
We print common metrics and show a confusion matrix.

In [None]:
# Take the best pipeline found by the grid search and predict on the test split.
best_pipe = grid.best_estimator_
y_pred = best_pipe.predict(X_test)

# Column 1 of predict_proba feeds ROC-AUC; stays None when the estimator
# does not expose predict_proba.
if hasattr(best_pipe, 'predict_proba'):
    y_proba = best_pipe.predict_proba(X_test)[:, 1]
else:
    y_proba = None

# Headline metrics; zero_division=0 avoids warnings when a class is never predicted.
print('Test Accuracy:', accuracy_score(y_test, y_pred))
for label, metric in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1:', f1_score),
):
    print(label, metric(y_test, y_pred, zero_division=0))
if y_proba is not None:
    print('ROC-AUC:', roc_auc_score(y_test, y_proba))

print('\nClassification Report:\n', classification_report(y_test, y_pred, digits=4))

# Confusion matrix heatmap (rows: actual class, columns: predicted class).
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(4, 3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

## Permutation importance on the test set (optional)
Computed on the held-out test set, this gives a model-agnostic idea of which input features matter most.

In [None]:
# Compute permutation importance on the test set.
#
# The whole pipeline (preprocessing included) is passed as the estimator, so
# permutation_importance shuffles the raw columns of X_test. importances_mean
# therefore has one entry per raw input column -- not per one-hot encoded
# feature -- and must be indexed by X_test.columns.
try:
    r = permutation_importance(
        best_pipe,
        X_test,
        y_test,
        n_repeats=10,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )
    feat_names = np.array(X_test.columns)
    imp = pd.Series(r.importances_mean, index=feat_names).sort_values(ascending=False)
    print(imp.head(15))
except Exception as e:
    # Optional diagnostic: report the problem without stopping the notebook.
    print('Permutation importance failed:', e)

## Save the best pipeline
This saves preprocessing + model together so raw data rows can be predicted later.

In [None]:
# Persist the fitted pipeline (preprocessing + model) as a single artifact
# in the current working directory.
joblib.dump(value=best_pipe, filename='loan_pipeline_v1.joblib')
print('Pipeline saved to loan_pipeline_v1.joblib')

## Sample usage
After loading the saved pipeline, pass a new raw-record DataFrame with the same columns (no preprocessing required).

In [None]:
# Example: load and predict on one row (uncomment and edit a sample row to try)
# loaded = joblib.load('loan_pipeline_v1.joblib')
# sample = X_test.iloc[[0]]  # replace with a new raw row shaped like X
# print('Pred:', loaded.predict(sample), 'Prob:', loaded.predict_proba(sample)[:,1] if hasattr(loaded,'predict_proba') else None)

## Notes / Next steps
- Broaden the hyperparameter search (e.g., with `RandomizedSearchCV`) to cover more hyperparameters.  
- Consider using class_weight or resampling if classes are imbalanced.  
- For production, add validation on data drift and fairness checks (e.g., performance by gender or area).