In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.ensemble import HistGradientBoostingClassifier 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler

In [3]:
from sklearn.datasets import fetch_openml

In [24]:
# Download OpenML dataset id 31. as_frame=True makes the pandas return type
# explicit, which the column assignment and category handling below rely on.
credit_risk = fetch_openml( data_id = 31 , as_frame = True )

# Build the working frame as a new object (assign returns a new DataFrame)
# so the fetched Bunch is left untouched, and append the target as integer
# category codes in a `type_code` column.
credit_risk_data = credit_risk.data.assign(
    type_code = credit_risk.target.astype('category').cat.codes
)

In [26]:
# Preview five random rows; the fixed seed keeps the displayed rows identical
# across kernel restarts.
credit_risk_data.sample( 5 , random_state = 42 )

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,type_code
969,<0,11,critical/other existing credit,new car,3939,<100,1<=X<4,1,male single,none,...,real estate,40,none,own,2,unskilled resident,2,none,yes,1
337,<0,15,existing paid,domestic appliance,1275,no known savings,1<=X<4,4,female div/dep/mar,none,...,car,24,none,rent,1,skilled,1,none,yes,0
211,no checking,36,existing paid,radio/tv,3835,no known savings,>=7,2,female div/dep/mar,none,...,real estate,45,none,own,1,unskilled resident,1,yes,yes,1
391,0<=X<200,12,existing paid,furniture/equipment,983,>=1000,<1,1,female div/dep/mar,none,...,real estate,19,none,rent,1,unskilled resident,1,none,yes,1
352,no checking,18,critical/other existing credit,used car,3229,no known savings,unemployed,2,male single,none,...,no known property,38,none,own,1,high qualif/self emp/mgmt,1,yes,yes,1


In [27]:
# Print each column's dtype and non-null count to see which columns are
# categorical and which are numeric.
credit_risk_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   checking_status         1000 non-null   category
 1   duration                1000 non-null   int64   
 2   credit_history          1000 non-null   category
 3   purpose                 1000 non-null   category
 4   credit_amount           1000 non-null   int64   
 5   savings_status          1000 non-null   category
 6   employment              1000 non-null   category
 7   installment_commitment  1000 non-null   int64   
 8   personal_status         1000 non-null   category
 9   other_parties           1000 non-null   category
 10  residence_since         1000 non-null   int64   
 11  property_magnitude      1000 non-null   category
 12  age                     1000 non-null   int64   
 13  other_payment_plans     1000 non-null   category
 14  housing                 1

In [28]:
# Sub-frame restricted to the category-dtype columns; only its column names
# are used later, to decide which columns get an integer-coded twin.
credit_risk_data_category_columns = credit_risk_data.select_dtypes( include = 'category' )

In [31]:
def cat_encoder( df ,
                 columns ):
    """Return a copy of ``df`` with an integer-coded twin of each listed column.

    For every name in ``columns`` a column ``"<name>_code"`` is added that
    holds the pandas category codes of that column. The input frame is left
    unmodified, so the calling cell can be re-run without side effects.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    columns : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the ``*_code``
        columns. Missing values are coded as -1 (pandas convention).

    Raises
    ------
    KeyError
        If a name in ``columns`` is not a column of ``df``.
    """
    code_columns = { f"{column}_code" : df[column].astype('category').cat.codes
                     for column in columns }

    # assign() builds a new DataFrame instead of writing into the caller's frame.
    return df.assign( **code_columns )


In [32]:
# Add a "<column>_code" integer column for every categorical column found above.
credit_risk_data = cat_encoder(
    df = credit_risk_data ,
    columns = credit_risk_data_category_columns.columns ,
)

In [34]:
# Preview five random rows after encoding; the fixed seed keeps the displayed
# rows identical across kernel restarts.
credit_risk_data.sample( 5 , random_state = 42 )

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,savings_status_code,employment_code,personal_status_code,other_parties_code,property_magnitude_code,other_payment_plans_code,housing_code,job_code,own_telephone_code,foreign_worker_code
167,0<=X<200,11,existing paid,furniture/equipment,1577,>=1000,<1,4,female div/dep/mar,none,...,3,2,0,2,2,1,1,3,0,1
909,0<=X<200,9,existing paid,new car,3195,no known savings,1<=X<4,1,female div/dep/mar,none,...,4,0,0,2,2,1,1,2,0,1
798,no checking,24,delayed previously,new car,717,no known savings,>=7,4,male mar/wid,none,...,4,3,2,2,3,1,1,3,1,1
185,no checking,15,critical/other existing credit,radio/tv,1360,<100,1<=X<4,4,male single,none,...,2,0,3,2,0,1,1,3,0,1
947,no checking,12,existing paid,new car,2859,no known savings,unemployed,4,male single,none,...,4,4,3,2,1,1,1,0,1,1


In [45]:
# Target vector: the integer-coded class label.
credit_risk_data_dependent = credit_risk_data['type_code']

# Feature matrix: every numeric column (the original numeric columns plus the
# generated *_code columns), with the target removed.
credit_risk_data_independent = (
    credit_risk_data
    .select_dtypes( include = 'number' )
    .drop( columns = 'type_code' )
)

In [46]:
# Split first, so the held-out rows never influence any fitted preprocessing.
# The split is stratified on the target and seeded, so it is reproducible.
( credit_risk_data_independent_train ,
  credit_risk_data_independent_test ,
  credit_risk_data_dependent_train ,
  credit_risk_data_dependent_test ) = train_test_split(
    credit_risk_data_independent ,
    credit_risk_data_dependent ,
    train_size=0.9,
    stratify=credit_risk_data_dependent ,
    random_state=42
)

# Learn the scaling statistics from the training rows only, then apply that
# same transform to the test rows. Fitting the scaler on the full dataset
# would leak test-set information into training.
credit_risk_feature_scaler = StandardScaler()
credit_risk_data_independent_train = credit_risk_feature_scaler.fit_transform( credit_risk_data_independent_train )
credit_risk_data_independent_test = credit_risk_feature_scaler.transform( credit_risk_data_independent_test )

In [50]:
# Histogram-based gradient boosting with a budget of 1500 boosting iterations,
# fitted on the training split.
credit_risk_hgb = HistGradientBoostingClassifier( max_iter = 1500 )

credit_risk_hgb.fit(
    X = credit_risk_data_independent_train ,
    y = credit_risk_data_dependent_train ,
)


In [51]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training split, scored with macro-averaged F1.
# cross_val_score refits a fresh clone of the estimator on each fold.
credit_risk_hgb_cross_val_scores = cross_val_score(
    credit_risk_hgb ,
    credit_risk_data_independent_train ,
    credit_risk_data_dependent_train ,
    scoring = "f1_macro" ,
    cv = 10 ,
)

In [52]:
# Average macro-F1 over the cross-validation folds.
credit_risk_hgb_cross_val_scores.mean()

0.682687743874042

In [54]:
# Per-fold macro-F1 scores, to see the spread behind the mean.
credit_risk_hgb_cross_val_scores

array([0.57887701, 0.69538462, 0.71486175, 0.6875    , 0.73360337,
       0.70252404, 0.62139423, 0.67878001, 0.71920963, 0.69474279])

In [55]:
# Same model with the boosting budget raised to 4500 iterations, refitted on
# the training split to compare against the 1500-iteration run.
credit_risk_hgb = HistGradientBoostingClassifier( max_iter = 4500 )

credit_risk_hgb.fit(
    X = credit_risk_data_independent_train ,
    y = credit_risk_data_dependent_train ,
)


In [56]:
# Repeat the 10-fold, macro-F1 cross-validation for the 4500-iteration model.
credit_risk_hgb_cross_val_scores = cross_val_score(
    credit_risk_hgb ,
    credit_risk_data_independent_train ,
    credit_risk_data_dependent_train ,
    scoring = "f1_macro" ,
    cv = 10 ,
)

In [57]:
# Average macro-F1 over the folds for the 4500-iteration model.
credit_risk_hgb_cross_val_scores.mean()

0.674857857967076

In [58]:
# Per-fold macro-F1 scores for the 4500-iteration model.
credit_risk_hgb_cross_val_scores

array([0.56521739, 0.69538462, 0.68253968, 0.65909091, 0.73360337,
       0.69246769, 0.61224187, 0.67878001, 0.71920963, 0.71004342])