In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path

# Import libararies for metrics
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import classification_report 


# import StandardSclaer and OneHotEncoder
from sklearn.preprocessing import StandardScaler,OneHotEncoder

# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Import Adaboost library
from sklearn.ensemble import AdaBoostClassifier


import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import Credit Risk information to Google Colab
url = 'https://raw.githubusercontent.com/DIsaacman/Dream-Team/dataset/Resources/credit_risk_dataset.csv'

# Read CSV
df = pd.read_csv(url)

# Review the DataFrame
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [3]:
# drop the columns we don't want to use in our models
consolidated_df = df.drop(columns=[
       'person_emp_length', 'loan_intent', 'loan_grade',
       'loan_int_rate', 'loan_percent_income',
       'cb_person_default_on_file', 'cb_person_cred_hist_length'])

In [4]:
# Create a list of categorical variables 
categorical_variables = list(consolidated_df.dtypes[consolidated_df.dtypes=="object"].index)

# Display the categorical variables list
categorical_variables

['person_home_ownership']

In [5]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

In [6]:
# fit and transform the categorial variable
encoded_data = enc.fit_transform(consolidated_df[categorical_variables])

In [7]:
# Create a DataFrame with the encoded variables
encoded_df = pd.DataFrame(
    encoded_data,
columns=enc.get_feature_names_out(categorical_variables)
)

# Review the DataFrame
encoded_df.head()

Unnamed: 0,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT
0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0


In [8]:
# create a new df that does not include those columns we converted with one_hot_encoder (all numerical variables from original dataset)

numerical_variables_df = consolidated_df.drop(columns=categorical_variables)

#review the dataframe
numerical_variables_df.head()

Unnamed: 0,person_age,person_income,loan_amnt,loan_status
0,22,59000,35000,1
1,21,9600,1000,0
2,25,9600,5500,1
3,23,65500,35000,1
4,24,54400,35000,1


In [9]:
# Add the numerical variables from the original DataFrame to the one-hot encoding DataFrame

encoded_df = pd.concat([encoded_df, numerical_variables_df], axis=1)

# Review the Dataframe
encoded_df.head()

Unnamed: 0,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,person_age,person_income,loan_amnt,loan_status
0,0.0,0.0,0.0,1.0,22,59000,35000,1
1,0.0,0.0,1.0,0.0,21,9600,1000,0
2,1.0,0.0,0.0,0.0,25,9600,5500,1
3,0.0,0.0,0.0,1.0,23,65500,35000,1
4,0.0,0.0,0.0,1.0,24,54400,35000,1


In [10]:
# separate the data into features and target

y = encoded_df["loan_status"]

In [11]:
y[:7]


0    1
1    0
2    1
3    1
4    1
5    1
6    1
Name: loan_status, dtype: int64

In [12]:
X = encoded_df.drop(columns="loan_status")


In [13]:
X.head()


Unnamed: 0,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,person_age,person_income,loan_amnt
0,0.0,0.0,0.0,1.0,22,59000,35000
1,0.0,0.0,1.0,0.0,21,9600,1000
2,1.0,0.0,0.0,0.0,25,9600,5500
3,0.0,0.0,0.0,1.0,23,65500,35000
4,0.0,0.0,0.0,1.0,24,54400,35000


In [14]:
# check the value_counts to see whether there is an oversampling issue
y.value_counts()

0    25473
1     7108
Name: loan_status, dtype: int64

In [15]:
# Split the data using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [16]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the features training dataset
X_scaler = scaler.fit(X_train)

# Fit the scaler to the features training dataset
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [17]:
# Create adaboost classifer object
abc = AdaBoostClassifier(n_estimators=100,
                         learning_rate=1)

In [18]:
# Train Adaboost Classifer
abc_model = abc.fit(X_train_scaled, y_train)

In [19]:
#Predict the response for test dataset
abc_pred = abc_model.predict(X_test_scaled)

In [20]:
from sklearn import metrics

print("Accuracy:",metrics.accuracy_score(y_test, abc_pred))

Accuracy: 0.8423766265651854


In [21]:
print(classification_report(y_test, abc_pred)) 


              precision    recall  f1-score   support

           0       0.84      0.98      0.91      6360
           1       0.84      0.35      0.49      1786

    accuracy                           0.84      8146
   macro avg       0.84      0.67      0.70      8146
weighted avg       0.84      0.84      0.82      8146



In [22]:
balanced_accuracy_score(abc_pred,y_test)

0.8397170494891675