# Credit Scoring Model

#### **Goal:** Predict the likelihood of loan repayment or default (binary classification).
#### **Output:** A credit score or risk label (e.g., "good credit" or "bad credit").

In [5]:
pip install xlrd

Note: you may need to restart the kernel to use updated packages.


In [6]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

df = pd.read_excel("/kaggle/input/ccard-classification/default of credit card clients.xls", header=1)

df

Unnamed: 0.1,Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000,1,3,1,39,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
29996,29997,150000,1,3,2,43,-1,-1,-1,-1,...,8979,5190,0,1837,3526,8998,129,0,0,0
29997,29998,30000,1,2,2,37,4,3,2,-1,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
29998,29999,80000,1,3,1,41,1,-1,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1


### Data Cleaning

In [7]:
#dropping UID column since it will not be trained on
df = df.drop("Unnamed: 0", axis=1)

In [8]:
# Cleaning column names (Removing extra spaces and encoding issues)
df.columns = df.columns.str.strip()

In [9]:
print(df.columns)

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')


In [11]:
# Ensuring all numeric columns are properly converted/formatted
numeric_columns = [
    'LIMIT_BAL', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'
]

for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

### Feature Engineering

In [14]:
X = df.drop("default payment next month", axis=1)  # Features
y = df["default payment next month"]  # Target variable

In [15]:
# Defining the main categorical features
# X3 represents SEX
# X4 represents EDUCATION
# X5 represents MARRIAGE STATUS
categorical_features = ["EDUCATION", "MARRIAGE", "AGE"]

### Splitting the Data

In [16]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to CatBoost Pool
train_pool = Pool(X_train, label=y_train, cat_features=categorical_features)
test_pool = Pool(X_test, label=y_test, cat_features=categorical_features)

### Training the Model

In [31]:
# Train CatBoost model
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function="Logloss",
    eval_metric="AUC",
    cat_features=categorical_features,
    verbose=100
)
model.fit(train_pool, eval_set=test_pool, early_stopping_rounds=50)

0:	test: 0.7351760	best: 0.7351760 (0)	total: 23.4ms	remaining: 23.4s
100:	test: 0.7793896	best: 0.7793896 (100)	total: 2.2s	remaining: 19.6s
200:	test: 0.7829582	best: 0.7829655 (194)	total: 4.31s	remaining: 17.1s
300:	test: 0.7840253	best: 0.7840539 (299)	total: 6.53s	remaining: 15.2s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7846153196
bestIteration = 349

Shrink model to first 350 iterations.


<catboost.core.CatBoostClassifier at 0x7b5f126c7100>

### Predicting and Evaluating Model Accuracy

In [32]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

### Metrics

In [33]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8198333333333333
ROC-AUC: 0.784615319617337

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.95      0.89      4687
           1       0.67      0.35      0.46      1313

    accuracy                           0.82      6000
   macro avg       0.75      0.65      0.68      6000
weighted avg       0.80      0.82      0.80      6000



In [34]:
model.save_model("/kaggle/working/catboost_credit_model.cbm", format="cbm")