In [1]:
import pandas as pd

df = pd.read_csv('GiveMeSomeCredit/cs-training.csv', index_col=0)  
print("Shape:", df.shape)
print("Missing values:\n", df.isnull().sum())
print("Target distribution:\n", df['SeriousDlqin2yrs'].value_counts())

Shape: (150000, 11)
Missing values:
 SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64
Target distribution:
 SeriousDlqin2yrs
0    139974
1     10026
Name: count, dtype: int64


In [2]:
df['MonthlyIncome'].fillna(df['MonthlyIncome'].median(), inplace=True)
df['NumberOfDependents'].fillna(df['NumberOfDependents'].median(), inplace=True)

print("Missing after fill:\n", df.isnull().sum())

Missing after fill:
 SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['MonthlyIncome'].fillna(df['MonthlyIncome'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['NumberOfDependents'].fillna(df['NumberOfDependents'].median(), inplace=True)


In [4]:
!pip install imblearn --quiet


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

X = df.drop('SeriousDlqin2yrs', axis=1)
y = df['SeriousDlqin2yrs']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_train_resampled.value_counts())


Before SMOTE: SeriousDlqin2yrs
0    111930
1      8070
Name: count, dtype: int64
After SMOTE: SeriousDlqin2yrs
0    111930
1    111930
Name: count, dtype: int64


In [6]:
import numpy as np

X_train_resampled['DebtPerIncome'] = X_train_resampled['DebtRatio'] / (X_train_resampled['MonthlyIncome'] + 1)
X_test['DebtPerIncome'] = X_test['DebtRatio'] / (X_test['MonthlyIncome'] + 1)

X_train_resampled['AgeGroup'] = pd.cut(X_train_resampled['age'], bins=[0, 30, 50, 100], labels=[0, 1, 2])
X_test['AgeGroup'] = pd.cut(X_test['age'], bins=[0, 30, 50, 100], labels=[0, 1, 2])

X_train_resampled['TotalLate'] = (
    X_train_resampled['NumberOfTime30-59DaysPastDueNotWorse'] +
    X_train_resampled['NumberOfTime60-89DaysPastDueNotWorse'] +
    X_train_resampled['NumberOfTimes90DaysLate']
)
X_test['TotalLate'] = (
    X_test['NumberOfTime30-59DaysPastDueNotWorse'] +
    X_test['NumberOfTime60-89DaysPastDueNotWorse'] +
    X_test['NumberOfTimes90DaysLate']
)

In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

cols_to_scale = ['RevolvingUtilizationOfUnsecuredLines', 'age', 'DebtRatio',
                 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans',
                 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines',
                 'NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTime60-89DaysPastDueNotWorse',
                 'NumberOfDependents', 'DebtPerIncome', 'TotalLate']

X_train_resampled[cols_to_scale] = scaler.fit_transform(X_train_resampled[cols_to_scale])
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

In [8]:
print(X_train_resampled[['DebtPerIncome', 'AgeGroup', 'TotalLate']].head())
print(X_train_resampled.describe()[['DebtPerIncome', 'TotalLate']])

   DebtPerIncome AgeGroup  TotalLate
0      -0.058547        0   -0.11708
1      -0.058546        2   -0.11708
2      -0.058547        1   -0.11708
3      -0.058547        1   -0.11708
4      -0.058547        1   -0.11708
       DebtPerIncome     TotalLate
count   2.238600e+05  2.238600e+05
mean    6.982909e-19  1.104569e-17
std     1.000002e+00  1.000002e+00
min    -5.854747e-02 -1.170798e-01
25%    -5.854735e-02 -1.170798e-01
50%    -5.854719e-02 -1.170798e-01
75%    -5.854648e-02 -7.642783e-02
max     1.810809e+02  1.183459e+01


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_resampled, y_train_resampled)

y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

print("Random Forest Results:")
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
print("ROC-AUC Score:", roc_auc_score(y_test, y_proba_rf))

Random Forest Results:
[[25633  2411]
 [  992   964]]
              precision    recall  f1-score   support

           0       0.96      0.91      0.94     28044
           1       0.29      0.49      0.36      1956

    accuracy                           0.89     30000
   macro avg       0.62      0.70      0.65     30000
weighted avg       0.92      0.89      0.90     30000

ROC-AUC Score: 0.8280018140497302


In [11]:
X_train_resampled['DebtPerIncome'].replace([np.inf, -np.inf], np.nan, inplace=True)
X_test['DebtPerIncome'].replace([np.inf, -np.inf], np.nan, inplace=True)

X_train_resampled.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train_resampled['DebtPerIncome'].replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test['DebtPerIncome'].replace([np.inf, -np.inf], np.nan, inplace=True)


In [12]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train_resampled, y_train_resampled)

y_pred_gb = gb.predict(X_test)
y_proba_gb = gb.predict_proba(X_test)[:, 1]

print("Gradient Boosting Results:")
print(confusion_matrix(y_test, y_pred_gb))
print(classification_report(y_test, y_pred_gb))
print("ROC-AUC Score:", roc_auc_score(y_test, y_proba_gb))

Gradient Boosting Results:
[[24960  3084]
 [  800  1156]]
              precision    recall  f1-score   support

           0       0.97      0.89      0.93     28044
           1       0.27      0.59      0.37      1956

    accuracy                           0.87     30000
   macro avg       0.62      0.74      0.65     30000
weighted avg       0.92      0.87      0.89     30000

ROC-AUC Score: 0.8407389104296813


In [13]:
!pip install xgboost --quiet


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [15]:
X_train_resampled['AgeGroup'] = X_train_resampled['AgeGroup'].astype(int)
X_test['AgeGroup'] = X_test['AgeGroup'].astype(int)

In [16]:
from xgboost import XGBClassifier

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train_resampled, y_train_resampled)

y_pred_xgb = xgb.predict(X_test)
y_proba_xgb = xgb.predict_proba(X_test)[:, 1]

print("XGBoost Results:")
print(confusion_matrix(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))
print("ROC-AUC Score:", roc_auc_score(y_test, y_proba_xgb))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Results:
[[25324  2720]
 [  980   976]]
              precision    recall  f1-score   support

           0       0.96      0.90      0.93     28044
           1       0.26      0.50      0.35      1956

    accuracy                           0.88     30000
   macro avg       0.61      0.70      0.64     30000
weighted avg       0.92      0.88      0.89     30000

ROC-AUC Score: 0.8214542864134916
