# AdaBoost

Load packages

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report

Load dataset

In [5]:
# Location of the pre-cleaned credit-risk CSV.
# NOTE(review): absolute path; `/content` looks like a Colab workspace — confirm before running elsewhere.
DATASET_CSV = "/content/credit_risk_dataset_cleaned.csv"
df = pd.read_csv(DATASET_CSV)

In [10]:
# Only the last bare expression of a cell is rendered, so the row preview needs an
# explicit display() call; otherwise its result is silently discarded.
display(df.head())

# Column index of the loaded frame (rendered as the cell's output).
df.columns

Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_status', 'loan_percent_income',
       'cb_person_default_on_file', 'cb_person_cred_hist_length'],
      dtype='object')

Split the dataset into train and test subsets. The random state is not specified, so 42 is used, as it has been the most common choice in my experience.

In [14]:
# One-hot encode the categorical columns (drop_first=True omits the first level of each).
df_split = pd.get_dummies(df, drop_first=True)

# NOTE(review): the label is the "default on file" indicator, while `loan_status`
# stays among the features — confirm this matches the setup being reproduced.
target_column = "cb_person_default_on_file_Y"
y = df_split[target_column]
X = df_split.drop(columns=[target_column])

# Hold out 30% of the rows for testing; the seed is fixed so reruns give the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

AdaBoost model

In [15]:
# Fit AdaBoost with default hyper-parameters; only the seed is pinned.
model = AdaBoostClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for both partitions so train and test scores can be compared.
train_pred, test_pred = (model.predict(features) for features in (X_train, X_test))

Outputting required metrics

In [16]:
# Build one classification report per partition; output_dict=True returns nested dicts
# instead of a formatted string.
metrics_train, metrics_test = (
    classification_report(actual, predicted, output_dict=True)
    for actual, predicted in ((y_train, train_pred), (y_test, test_pred))
)

In [17]:
# Show the raw training report dictionary (per-class scores, accuracy and averages).
print(metrics_train)

{'False': {'precision': 0.9567664522513607, 'recall': 0.8289480735223193, 'f1-score': 0.8882827528783485, 'support': 18661.0}, 'True': {'precision': 0.5077112893275756, 'recall': 0.8248559258331245, 'f1-score': 0.62854415274463, 'support': 3991.0}, 'accuracy': 0.8282270881158397, 'macro avg': {'precision': 0.7322388707894681, 'recall': 0.8269019996777218, 'f1-score': 0.7584134528114892, 'support': 22652.0}, 'weighted avg': {'precision': 0.8776485308656631, 'recall': 0.8282270881158397, 'f1-score': 0.8425200496674325, 'support': 22652.0}}


Train subset

In [18]:
# Output label -> key path inside the training report.
# The class keys are the string forms of the boolean labels ('False' / 'True').
train_rows = (
    ("accuracy", ('accuracy',)),
    ("precision_0", ('False', 'precision')),
    ("precision_1", ('True', 'precision')),
    ("recall_0", ('False', 'recall')),
    ("recall_1", ('True', 'recall')),
    ("f1_0", ('False', 'f1-score')),
    ("f1_1", ('True', 'f1-score')),
)

for row_label, key_path in train_rows:
    score = metrics_train
    for report_key in key_path:
        score = score[report_key]
    print(f"{row_label}: {score:.3f}")

accuracy: 0.828
precision_0: 0.957
precision_1: 0.508
recall_0: 0.829
recall_1: 0.825
f1_0: 0.888
f1_1: 0.629


Test subset

In [19]:
# Output label -> key path inside the test report.
# The class keys are the string forms of the boolean labels ('False' / 'True').
test_rows = (
    ("accuracy", ('accuracy',)),
    ("precision_0", ('False', 'precision')),
    ("precision_1", ('True', 'precision')),
    ("recall_0", ('False', 'recall')),
    ("recall_1", ('True', 'recall')),
    ("f1_0", ('False', 'f1-score')),
    ("f1_1", ('True', 'f1-score')),
)

for row_label, key_path in test_rows:
    score = metrics_test
    for report_key in key_path:
        score = score[report_key]
    print(f"{row_label}: {score:.3f}")

accuracy: 0.822
precision_0: 0.953
precision_1: 0.501
recall_0: 0.824
recall_1: 0.814
f1_0: 0.884
f1_1: 0.620


The results from the research paper and from our attempt vary greatly. This is caused by the lack of precise information about how the dataset was prepared and which random state was used for the data split.

# XGBoost

Load packages

In [20]:
import xgboost as xgb

Training model

In [31]:
# `use_label_encoder` is no longer passed: the installed xgboost ignores it and only
# emits the warning 'Parameters: { "use_label_encoder" } are not used.'.
# The evaluation metric stays explicit; all other hyper-parameters are library defaults.
xgb_model = xgb.XGBClassifier(eval_metric='logloss')
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



Predicting results

In [30]:
# Label predictions from the fitted XGBoost model for both partitions.
# NOTE(review): this rebinds the prediction names used in the AdaBoost section,
# so the values depend on which cell ran last.
train_pred, test_pred = (xgb_model.predict(features) for features in (X_train, X_test))

In [29]:
# Build one classification report per partition as nested dicts (output_dict=True).
metrics_train, metrics_test = (
    classification_report(actual, predicted, output_dict=True)
    for actual, predicted in ((y_train, train_pred), (y_test, test_pred))
)

# Show the raw training report dictionary.
print(metrics_train)

{'False': {'precision': 0.9701420271142672, 'recall': 0.9663469267456192, 'f1-score': 0.9682407581411582, 'support': 18661.0}, 'True': {'precision': 0.8454724409448819, 'recall': 0.8609371084941118, 'f1-score': 0.8531346989447548, 'support': 3991.0}, 'accuracy': 0.9477750309023486, 'macro avg': {'precision': 0.9078072340295746, 'recall': 0.9136420176198654, 'f1-score': 0.9106877285429564, 'support': 22652.0}, 'weighted avg': {'precision': 0.9481768002732811, 'recall': 0.9477750309023486, 'f1-score': 0.9479605055253695, 'support': 22652.0}}


Training results

In [28]:
# Output label -> key path inside the training report.
# The class keys are the string forms of the boolean labels ('False' / 'True').
train_rows = (
    ("accuracy", ('accuracy',)),
    ("precision_0", ('False', 'precision')),
    ("precision_1", ('True', 'precision')),
    ("recall_0", ('False', 'recall')),
    ("recall_1", ('True', 'recall')),
    ("f1_0", ('False', 'f1-score')),
    ("f1_1", ('True', 'f1-score')),
)

for row_label, key_path in train_rows:
    score = metrics_train
    for report_key in key_path:
        score = score[report_key]
    print(f"{row_label}: {score:.3f}")

accuracy: 0.948
precision_0: 0.970
precision_1: 0.845
recall_0: 0.966
recall_1: 0.861
f1_0: 0.968
f1_1: 0.853


Testing results

In [27]:
# Output label -> key path inside the test report.
# The class keys are the string forms of the boolean labels ('False' / 'True').
test_rows = (
    ("accuracy", ('accuracy',)),
    ("precision_0", ('False', 'precision')),
    ("precision_1", ('True', 'precision')),
    ("recall_0", ('False', 'recall')),
    ("recall_1", ('True', 'recall')),
    ("f1_0", ('False', 'f1-score')),
    ("f1_1", ('True', 'f1-score')),
)

for row_label, key_path in test_rows:
    score = metrics_test
    for report_key in key_path:
        score = score[report_key]
    print(f"{row_label}: {score:.3f}")

accuracy: 0.824
precision_0: 0.895
precision_1: 0.506
recall_0: 0.891
recall_1: 0.517
f1_0: 0.893
f1_1: 0.511


The results from the research paper and from our attempt vary greatly. This is caused by the lack of precise information about how the dataset was prepared and which random state was used for the data split.