<a href="https://colab.research.google.com/github/AbhishekA87/ML_Files/blob/main/Credit_Risk_scoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Credit Risk Scoring (Core Banking Use Case)
Objective: Predicting whether a loan applicant is high or low risk.
Dataset: German Credit Dataset (UCI)
Models: Logistic Regression, Random Forest, XGBoost
Business Impact: Automates loan approvals, reduces default risk

In [2]:
#Connecting the Dataset
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


Part 1. Importing the libraries and datasets

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [64]:
ucidata = pd.read_csv('/content/drive/MyDrive/ML/Portfolio/statlog+german+credit+data/german.data-numeric', sep='\s+', header = None)

  ucidata = pd.read_csv('/content/drive/MyDrive/ML/Portfolio/statlog+german+credit+data/german.data-numeric', sep='\s+', header = None)


In [65]:
ucidata.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,1,6,4,12,5,5,3,4,1,67,...,0,0,1,0,0,1,0,0,1,1
1,2,48,2,60,1,3,2,2,1,22,...,0,0,1,0,0,1,0,0,1,2
2,4,12,4,21,1,4,3,3,1,49,...,0,0,1,0,0,1,0,1,0,1
3,1,42,2,79,1,4,3,4,2,45,...,0,0,0,0,0,0,0,0,1,1
4,1,24,3,49,1,3,3,4,4,53,...,1,0,1,0,0,0,0,0,1,2


In [66]:
#Rename the columns
feature_names = [f"feature_{i}" for i in range(1, ucidata.shape[1])]
ucidata.columns = feature_names + ['target']

In [67]:
print(ucidata.head())
print(ucidata['target'].value_counts())


   feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0          1          6          4         12          5          5   
1          2         48          2         60          1          3   
2          4         12          4         21          1          4   
3          1         42          2         79          1          4   
4          1         24          3         49          1          3   

   feature_7  feature_8  feature_9  feature_10  ...  feature_16  feature_17  \
0          3          4          1          67  ...           0           0   
1          2          2          1          22  ...           0           0   
2          3          3          1          49  ...           0           0   
3          3          4          2          45  ...           0           0   
4          3          4          4          53  ...           1           0   

   feature_18  feature_19  feature_20  feature_21  feature_22  feature_23  \
0           1        

In [68]:
#Defining the Features and Target
X = ucidata.drop('target', axis=1)
y = ucidata['target'].map({1:0, 2:1})  # 0 = Good, 1 = Bad

#Split data into training, test set

In [69]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.2, random_state = 0)

In [19]:
#handling the categorical data
cat_clmn = ucidata.select_dtypes(include=["object"]).columns
for col in cat_clmn:
    le = LabelEncoder()
    ucidata[col] = le.fit_transform(ucidata[col])

In [70]:
#Using the StandardScaler for standardisation
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#TRAIN MODELS

In [72]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

In [75]:
models = {
    "Logistic Regression" : LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=200, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder = False, eval_metric= "logloss")
}
for name, model in models.items():
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  auc=roc_auc_score(y_test, model.predict_proba(X_test)[:,1] )
  print(f"{name} AUC: {auc}")
  accuracy = accuracy_score(y_test, y_pred)
  print(f"{name} accuracy: {accuracy}")
  print(classification_report(y_test, y_pred))

Logistic Regression AUC: 0.7219524040796503
Logistic Regression accuracy: 0.76
              precision    recall  f1-score   support

           0       0.83      0.83      0.83       142
           1       0.59      0.59      0.59        58

    accuracy                           0.76       200
   macro avg       0.71      0.71      0.71       200
weighted avg       0.76      0.76      0.76       200

Random Forest AUC: 0.753763963088878
Random Forest accuracy: 0.75
              precision    recall  f1-score   support

           0       0.80      0.87      0.83       142
           1       0.59      0.47      0.52        58

    accuracy                           0.75       200
   macro avg       0.69      0.67      0.68       200
weighted avg       0.74      0.75      0.74       200

XGBoost AUC: 0.7109033511413309
XGBoost accuracy: 0.72
              precision    recall  f1-score   support

           0       0.81      0.80      0.80       142
           1       0.52      0.53    

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [81]:
# Evaluating the data on train, test data
for name, model in models.items():
  y_train_pred = model.predict(X_train)
print(y_train_pred[4])

0


# Exporting the Predictions to PowerBI

In [83]:
best_model = models["Logistic Regression"]

predictions = pd.DataFrame({
    "ApplicantID": X.index[y_test.index],  # keep original index
    "Actual": y_test,
    "Predicted": best_model.predict(X_test),
    "Probability_Bad": best_model.predict_proba(X_test)[:,1]
})

predictions.to_csv("credit_risk_predictions.csv", index=False)
