## CHAVAN ADVAIT GURUNATH 
## advaitchavan135@gmail.com
## Task 3: Credit risk analysis

### The risk manager has collected data on the loan borrowers. The data is in tabular format, with each row providing details of the borrower, including their income, total loans outstanding, and a few other metrics. There is also a column indicating if the borrower has previously defaulted on a loan. You must use this data to build a model that, given details for any loan described above, will predict the probability that the borrower will default (also known as PD: the probability of default). Use the provided data to train a function that will estimate the probability of default for a borrower. Assuming a recovery rate of 10%, this can be used to give the expected loss on a loan.

### You should produce a function that can take in the properties of a loan and output the expected loss.
### You can explore any technique ranging from a simple regression or a decision tree to something more advanced. You can also use multiple methods and provide a comparative analysis.

In [7]:
import pandas as pd
import numpy as np

In [8]:
from sklearn.model_selection       import train_test_split
from sklearn.pipeline              import Pipeline
from sklearn.compose               import ColumnTransformer
from sklearn.preprocessing         import StandardScaler, OneHotEncoder
from sklearn.linear_model          import LogisticRegression
from sklearn.tree                  import DecisionTreeClassifier
from sklearn.ensemble              import RandomForestClassifier
from sklearn.calibration           import CalibratedClassifierCV
from sklearn.metrics               import roc_auc_score, brier_score_loss

In [2]:
# Load the borrower table from the CSV file into a DataFrame.
df = pd.read_csv('Loan_data.csv')

In [3]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


In [4]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               10000 non-null  int64  
 1   credit_lines_outstanding  10000 non-null  int64  
 2   loan_amt_outstanding      10000 non-null  float64
 3   total_debt_outstanding    10000 non-null  float64
 4   income                    10000 non-null  float64
 5   years_employed            10000 non-null  int64  
 6   fico_score                10000 non-null  int64  
 7   default                   10000 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 625.1 KB


In [5]:
# Summary statistics of every numeric column.
df.describe()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4974577.0,1.4612,4159.677034,8718.916797,70039.901401,4.5528,637.5577,0.1851
std,2293890.0,1.743846,1421.399078,6627.164762,20072.214143,1.566862,60.657906,0.388398
min,1000324.0,0.0,46.783973,31.652732,1000.0,0.0,408.0,0.0
25%,2977661.0,0.0,3154.235371,4199.83602,56539.867903,3.0,597.0,0.0
50%,4989502.0,1.0,4052.377228,6732.407217,70085.82633,5.0,638.0,0.0
75%,6967210.0,2.0,5052.898103,11272.26374,83429.166133,6.0,679.0,0.0
max,8999789.0,5.0,10750.67781,43688.7841,148412.1805,10.0,850.0,1.0


## Create ratio and risk-band features

In [9]:
# Ratio features. A zero denominator produces inf here (same as the `/`
# operator on Series); those values are dealt with before model fitting.
df["loan_to_income"] = df["loan_amt_outstanding"].div(df["income"])
df["debt_to_credit"] = df["total_debt_outstanding"].div(
    df["credit_lines_outstanding"]
)

# Bucket FICO scores into right-closed intervals: (300, 580], (580, 670], ...
fico_edges = [300, 580, 670, 740, 800, 850]
fico_names = ["Poor", "Fair", "Good", "VeryGood", "Exceptional"]
df["fico_band"] = pd.cut(df["fico_score"], bins=fico_edges, labels=fico_names)

In [10]:
# Preview the data again, now including the engineered columns.
df.head()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default,loan_to_income,debt_to_credit,fico_band
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0,0.066909,inf,Fair
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1,0.07351,1645.750504,Poor
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0,0.051058,inf,Fair
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0,0.064105,inf,Fair
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0,0.057395,1768.826187,Fair


## Prepare X, y and Train/Test Split

In [19]:
# Separate the label column from the predictors.
TARGET = "default"
y = df[TARGET]
X = df.drop(columns=TARGET)

In [20]:
# 80/20 hold-out split, stratified on the label so both parts keep the same
# default rate; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [24]:
from sklearn.impute import SimpleImputer

# Numeric model inputs. Declared in this cell so the notebook runs
# top-to-bottom on a fresh kernel: the imputation below needs the list, and
# previously it was only defined in a later cell (NameError on first run).
numeric_features = [
    "fico_score",
    "credit_lines_outstanding",
    "loan_amt_outstanding",
    "total_debt_outstanding",
    "income",
    "years_employed",
    "loan_to_income",
    "debt_to_credit",
]

# Ratio features are +/-inf when their denominator is zero (e.g. borrowers
# with no credit lines). Turn those into NaN so the imputer can fill them.
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# Fit the medians on the training split only and reuse them for the test
# split, so no information from the test set leaks into preprocessing.
imputer = SimpleImputer(strategy="median")
X_train[numeric_features] = imputer.fit_transform(
    X_train[numeric_features]
)
X_test[numeric_features] = imputer.transform(
    X_test[numeric_features]
)


## Preprocessing Pipeline

In [25]:
# Column groups consumed by the preprocessing step.
categorical_features = ["fico_band"]
numeric_features = [
    "fico_score",
    "credit_lines_outstanding",
    "loan_amt_outstanding",
    "total_debt_outstanding",
    "income",
    "years_employed",
    "loan_to_income",
    "debt_to_credit",
]

# Standardise the numeric columns and one-hot encode the FICO band
# (dropping the first level to avoid a redundant dummy column).
numeric_step = ("num", StandardScaler(), numeric_features)
categorical_step = ("cat", OneHotEncoder(drop="first"), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

## Define and Train Models

In [26]:
from sklearn.base import clone


def _with_preprocessing(classifier):
    """Return a Pipeline of a private copy of `preprocessor` plus `classifier`.

    Pipeline.fit fits its steps in place, so handing the very same
    `preprocessor` object to several pipelines makes them share (and
    overwrite) one fitted state. Cloning gives every pipeline its own
    unfitted transformer with identical parameters.
    """
    return Pipeline([
        ("prep", clone(preprocessor)),
        ("clf", classifier),
    ])


# Candidate models, all fed by the same (cloned) preprocessing definition.
models = {
    "LogisticRegression": _with_preprocessing(
        LogisticRegression(solver="liblinear")
    ),
    "DecisionTree": _with_preprocessing(
        DecisionTreeClassifier(max_depth=5, random_state=42)
    ),
    # The forest is wrapped in isotonic calibration (5-fold) because its
    # output is used downstream as a probability.
    "RandomForest": _with_preprocessing(
        CalibratedClassifierCV(
            estimator=RandomForestClassifier(
                n_estimators=100, random_state=42),
            method="isotonic", cv=5)
    ),
}

In [27]:
# Fit every candidate on the training split and score it on the hold-out
# split: AUC for ranking quality, Brier score for probability accuracy.
results = []
for model_name, pipeline in models.items():
    pipeline.fit(X_train, y_train)
    # Second column of predict_proba = probability of the positive class.
    default_prob = pipeline.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, default_prob)
    brier = brier_score_loss(y_test, default_prob)
    results.append({"model": model_name, "AUC": auc, "Brier": brier})

In [28]:
# Tabulate the scores, best AUC first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="AUC", ascending=False)
print(results_df)

                model       AUC     Brier
0  LogisticRegression  0.999990  0.003895
2        RandomForest  0.999826  0.003671
1        DecisionTree  0.999087  0.004071


## Select Best Model

In [29]:
# The table is sorted by AUC descending, so its first row is the winner.
best_model_name = results_df["model"].iloc[0]
best_model = models[best_model_name]
print(f"\nSelected model: {best_model_name}")


Selected model: LogisticRegression


### Creating a function

In [30]:
RECOVERY_RATE = 0.10


def expected_loss(credit_lines_outstanding: int,
                  loan_amt_outstanding: float,
                  total_debt_outstanding: float,
                  income: float,
                  years_employed: int,
                  fico_score: int,
                  recovery_rate: float = RECOVERY_RATE) -> float:
    """Return the expected loss on a single loan.

    The loss is ``PD * EAD * (1 - recovery_rate)``, where PD is the
    probability of default predicted by ``best_model`` and EAD (exposure at
    default) is taken to be ``loan_amt_outstanding``.

    Args:
        credit_lines_outstanding: Number of open credit lines.
        loan_amt_outstanding: Outstanding amount on this loan (used as EAD).
        total_debt_outstanding: Borrower's total outstanding debt.
        income: Borrower's income.
        years_employed: Years in employment.
        fico_score: FICO score; must lie in (300, 850], the range covered by
            the FICO bands.
        recovery_rate: Fraction of the exposure recovered after a default.
            Defaults to ``RECOVERY_RATE``.

    Returns:
        The expected loss, in the same units as ``loan_amt_outstanding``.

    Raises:
        ValueError: If ``fico_score`` falls outside (300, 850] or
            ``recovery_rate`` is outside [0, 1].
    """
    # Fail early with a clear message; otherwise the band would be NaN and
    # the one-hot encoder would reject it with a far less helpful error.
    if not 300 < fico_score <= 850:
        raise ValueError(
            f"fico_score must be in (300, 850], got {fico_score!r}"
        )
    if not 0.0 <= recovery_rate <= 1.0:
        raise ValueError(
            f"recovery_rate must be in [0, 1], got {recovery_rate!r}"
        )

    row = pd.DataFrame([{
        "credit_lines_outstanding": credit_lines_outstanding,
        "loan_amt_outstanding": loan_amt_outstanding,
        "total_debt_outstanding": total_debt_outstanding,
        "income": income,
        "years_employed": years_employed,
        "fico_score": fico_score
    }])
    row["loan_to_income"] = row["loan_amt_outstanding"] / row["income"]
    row["debt_to_credit"] = (
        row["total_debt_outstanding"] / row["credit_lines_outstanding"]
    )
    row["fico_band"] = pd.cut(
        row["fico_score"],
        bins=[300, 580, 670, 740, 800, 850],
        labels=["Poor", "Fair", "Good", "VeryGood", "Exceptional"]
    )

    # Mirror the training-time cleaning: a zero denominator (no credit lines,
    # zero income) makes a ratio inf/NaN, which the scaler cannot accept.
    # Replace such values with the medians learned on the training split.
    cleaned = row[numeric_features].replace([np.inf, -np.inf], np.nan)
    row[numeric_features] = imputer.transform(cleaned)

    pd_prob = best_model.predict_proba(row)[:, 1][0]
    exposure = loan_amt_outstanding  # EAD is the outstanding loan amount
    return pd_prob * exposure * (1 - recovery_rate)

In [31]:
# Worked example: price the expected loss for one hypothetical borrower.
# NOTE(review): credit_lines_outstanding=10 is above the maximum of 5 seen in
# the data summary, so this prediction is an extrapolation.
example_loan = {
    "credit_lines_outstanding": 10,
    "loan_amt_outstanding": 1500,
    "total_debt_outstanding": 3200,
    "income": 55000,
    "years_employed": 4,
    "fico_score": 710,
}
loss_example = expected_loss(**example_loan)
print(f"\nExpected Loss = {loss_example:.2f}")


Expected Loss = 1350.00
