In [17]:
import warnings

# Suppress the Deprecation Warnings.
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Load in the necessary libraries.
import eli5
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from eli5.sklearn import PermutationImportance
from IPython.display import display
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OrdinalEncoder
from xgboost import XGBClassifier

In [18]:
# Read the train, validation, and test splits from the Resources directory.
df_train, df_val, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('Resources/train.csv', 'Resources/valid.csv', 'Resources/X_test.csv')
)

In [None]:
# Preview the first rows of the training data, followed by its (rows, columns) shape.
display(df_train.head(), df_train.shape)

## Handling Missing Values


In [None]:
# Count nulls per column and keep only the columns that contain at least one.
missing = df_train.isnull().sum().loc[lambda counts: counts > 0]

# Show every affected column, then set the display limit to 10 rows.
pd.set_option('display.max_rows', None)
display(missing)
pd.set_option('display.max_rows', 10)

In [19]:
# Keep only the columns in which at least half of the rows are non-null
# (i.e. drop the columns where more than 50% of the data is missing).
min_non_null = len(df_train)/2
df_train.dropna(axis='columns', thresh=min_non_null, inplace=True)

In [None]:
# Recompute the per-column null counts and keep the columns that still contain nulls.
null_counts = df_train.isnull().sum()
missing = null_counts[null_counts.gt(0)]

# Show every remaining column with nulls, then set the display limit to 10 rows.
pd.set_option('display.max_rows', None)
display(missing)
pd.set_option('display.max_rows', 10)

# Shape of the training frame at this point.
display(df_train.shape)

## Detecting and Removing Outliers


In [None]:
# List the int64 / float64 columns of the training data.
numeric_cols = df_train.select_dtypes(include=['int64', 'float64']).columns
display(numeric_cols)

In [None]:
# How outliers are defined and handled should follow from the business context:
# the meaning of a finding depends on that context, not on the number alone.

display(df_train.select_dtypes(include=['int', 'float']).columns)

# Box plot of a single feature, followed by its 1.5 * IQR fences (Tukey's rule).
feature = 'last_pymnt_amnt'
feature_values = df_train[feature]
sns.boxplot(x=feature_values)

q1, q3 = (feature_values.quantile(q) for q in (0.25, 0.75))
iqr = q3 - q1
lower_bound, upper_bound = q1 - (1.5 * iqr), q3 + (1.5 * iqr)
display(lower_bound, upper_bound)

# Count the values that fall strictly outside the fences.
outside_bounds = (feature_values < lower_bound) | (feature_values > upper_bound)
num_values_outside_bounds = outside_bounds.sum()
display(num_values_outside_bounds)

## Feature Scaling


In [None]:
# def scale_features(df):
#     # Scale numerical columns using MinMaxScaler, excluding the target column if it exists.
#     scaler = MinMaxScaler()

#     numerical_columns = df.select_dtypes(include=['int', 'float']).columns
#     if 'loan_status' in numerical_columns:
#         numerical_columns = numerical_columns.drop('loan_status')
#     df[numerical_columns] = scaler.fit_transform(df[numerical_columns])


# scale_features(df_train)
# scale_features(df_val)
# scale_features(df_test)

## Feature Encoding


In [20]:
# List the object-dtype (categorical) columns of the training data.
display(df_train.select_dtypes(include=['object']).columns)

# Encoder instances shared by the cells below.
ordinal_encoder = OrdinalEncoder()
label_encoder = LabelEncoder()


# Month abbreviations, as used in the "Mon-YYYY" date strings, mapped to month numbers.
_MONTH_TO_NUMBER = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
}

# Ordinal codes for the 35 sub-grades: A1 -> 0, A2 -> 1, ..., G5 -> 34.
_SUB_GRADE_TO_CODE = {
    f'{letter}{number}': 5 * letter_idx + (number - 1)
    for letter_idx, letter in enumerate('ABCDEFG')
    for number in range(1, 6)
}


def _split_month_year(df, source_col, prefix, month_fill=None, year_fill=None):
    """Replace a "Mon-YYYY" string column by integer ``<prefix>_m`` / ``<prefix>_y`` columns.

    ``month_fill`` / ``year_fill``, when given, replace missing values before the
    integer cast; without them a missing value makes the cast fail.
    The source column is dropped from ``df`` afterwards.
    """
    month_col, year_col = f'{prefix}_m', f'{prefix}_y'
    df[[month_col, year_col]] = df[source_col].str.split('-', expand=True)
    df[month_col] = df[month_col].map(_MONTH_TO_NUMBER)
    # Compare against None explicitly: 0 is a legitimate fill value.
    if month_fill is not None:
        df[month_col] = df[month_col].fillna(month_fill)
    if year_fill is not None:
        df[year_col] = df[year_col].fillna(year_fill)
    df[month_col] = df[month_col].astype(int)
    df[year_col] = df[year_col].astype(int)
    df.drop(columns=source_col, inplace=True)


def encode_cat_cols(df):
    """Encode the string-valued loan columns of ``df`` as numbers, in place.

    Fixed-vocabulary columns are translated with explicit dictionaries (values
    missing from a dictionary become NaN), free-text columns are label-encoded,
    and each "Mon-YYYY" date column is replaced by a pair of integer month /
    year columns appended to the frame.

    The function is not idempotent: it expects the raw string columns, so call
    it once per frame.

    Args:
        df: DataFrame containing the raw columns handled below; modified in place.

    Returns:
        The same ``df`` object.
    """
    # NOTE(review): ``label_encoder.fit_transform`` refits the shared encoder on
    # every column of every call, so the integer assigned to a given label is
    # specific to the frame being encoded. Codes for emp_title, purpose, title,
    # zip_code and addr_state are therefore not comparable between frames
    # encoded by separate calls.

    # Loan term (the keys include a leading space).
    df['term'] = df['term'].map({' 60 months': 0, ' 36 months': 1})

    # Grade A..G -> 0..6 and sub-grade A1..G5 -> 0..34.
    df['grade'] = df['grade'].map(dict(zip('ABCDEFG', range(7))))
    df['sub_grade'] = df['sub_grade'].map(_SUB_GRADE_TO_CODE)

    # Employer title: case-folded, then label-encoded.
    df['emp_title'] = label_encoder.fit_transform(df['emp_title'].str.lower())

    # Employment length: strip the " year(s)" suffix, then map to 0..10.
    # Missing or unrecognised values end up as 0, the same code as '< 1'.
    emp_length_codes = {'< 1': 0, '10+': 10, **{str(years): years for years in range(1, 10)}}
    emp_length = df['emp_length'].str.replace(' years?', '', regex=True)
    df['emp_length'] = emp_length.map(emp_length_codes).fillna(0).astype(int)

    # Home ownership: ANY / OTHER / NONE share a single code.
    df['home_ownership'] = df['home_ownership'].map(
        {'OWN': 0, 'RENT': 1, 'MORTGAGE': 2, 'ANY': 4, 'OTHER': 4, 'NONE': 4}
    )

    df['verification_status'] = df['verification_status'].map(
        {'Verified': 0, 'Source Verified': 1, 'Not Verified': 2}
    )

    # Issue date -> issue_m / issue_y (no fill values: a missing date raises).
    _split_month_year(df, 'issue_d', 'issue')

    df['pymnt_plan'] = df['pymnt_plan'].map({'y': 0, 'n': 1})

    df['purpose'] = label_encoder.fit_transform(df['purpose'])

    # Title and zip code: case-folded, then label-encoded.
    df['title'] = label_encoder.fit_transform(df['title'].str.lower())
    df['zip_code'] = label_encoder.fit_transform(df['zip_code'].str.lower())

    df['addr_state'] = label_encoder.fit_transform(df['addr_state'])

    # Earliest credit line -> earliest_cr_m / earliest_cr_y (no fill values).
    _split_month_year(df, 'earliest_cr_line', 'earliest_cr')

    df['initial_list_status'] = df['initial_list_status'].map({'f': 0, 'w': 1})

    # Last payment date: missing values become month 99 / year 9999.
    _split_month_year(df, 'last_pymnt_d', 'last_pymnt', month_fill=99, year_fill='9999')

    # Last credit pull date: missing values become month 0 / year 0.
    _split_month_year(df, 'last_credit_pull_d', 'last_credit_pull', month_fill=0, year_fill='0000')

    df['application_type'] = df['application_type'].map({'Joint App': 0, 'Individual': 1})
    df['hardship_flag'] = df['hardship_flag'].map({'N': 0, 'Y': 1})
    df['disbursement_method'] = df['disbursement_method'].map({'DirectPay': 0, 'Cash': 1})
    df['debt_settlement_flag'] = df['debt_settlement_flag'].map({'Y': 0, 'N': 1})

    return df


# Encode all three splits in place. The last call stays as the cell's final
# expression, so its return value (the encoded test frame) is displayed.
for _split in (df_train, df_val):
    encode_cat_cols(_split)
encode_cat_cols(df_test)

Index(['term', 'grade', 'sub_grade', 'emp_title', 'emp_length',
       'home_ownership', 'verification_status', 'issue_d', 'pymnt_plan',
       'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line',
       'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d',
       'application_type', 'hardship_flag', 'disbursement_method',
       'debt_settlement_flag'],
      dtype='object')

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,settlement_percentage,settlement_term,issue_m,issue_y,earliest_cr_m,earliest_cr_y,last_pymnt_m,last_pymnt_y,last_credit_pull_m,last_credit_pull_y
0,,,8100,8100,8100.0,1,11.47,267.00,1,9,...,,,3,2016,3,2010,4,2016,5,2018
1,,,10000,10000,10000.0,1,12.99,336.90,2,11,...,,,5,2016,9,2005,9,2016,2,2017
2,,,9450,9450,9450.0,1,13.67,321.47,1,9,...,,,11,2013,8,1998,8,2014,5,2018
3,,,25000,25000,24975.0,1,17.49,897.43,3,19,...,,,6,2011,11,1990,6,2014,6,2014
4,,,16000,16000,16000.0,0,14.99,380.56,2,14,...,,,2,2015,6,2001,7,2017,1,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172591,,,7000,7000,7000.0,1,18.25,253.95,4,20,...,,,5,2015,4,2012,11,2016,2,2019
172592,,,12500,12500,12500.0,1,12.49,418.12,1,9,...,,,8,2014,9,2006,1,2016,1,2019
172593,,,30000,30000,30000.0,1,7.89,938.57,0,4,...,,,6,2015,1,1999,7,2018,11,2018
172594,,,3500,3500,3500.0,1,8.59,110.64,0,4,...,,,7,2016,10,2004,12,2017,2,2019


In [26]:
def train_xgboost_classifier(df, features, target):
    """Fit a default-configured ``XGBClassifier`` on the given columns of ``df``.

    Args:
        df: DataFrame holding both the feature columns and the target column.
        features: Column labels used as model inputs.
        target: Label of the column to predict.

    Returns:
        The fitted classifier.
    """
    X_train, y_train = df[features], df[target]

    # All rows of ``df`` are used for fitting; no hold-out split happens here.
    classifier = XGBClassifier()
    classifier.fit(X_train, y_train)
    return classifier


def select_features(df, target, feature_importance_threshold=0.01):
    """Select the columns whose XGBoost feature importance reaches a threshold.

    Args:
        df: DataFrame containing the candidate feature columns and the target column.
        target: Label of the target column; it is excluded from the candidates.
        feature_importance_threshold: Passed to ``SelectFromModel`` as ``threshold``.

    Returns:
        A pandas ``Index`` with the labels of the selected feature columns.
    """
    # Candidate features: every column except the target.
    features = df.drop(target, axis=1)

    # Fit an XGBoost classifier inside SelectFromModel to score the candidates.
    selector = SelectFromModel(XGBClassifier(), threshold=feature_importance_threshold)
    selector.fit(features, df[target])

    # Translate the selector's positional support into column labels. This
    # assignment used to be commented out, which made the function raise
    # NameError (or return a stale global of the same name).
    selected_feature_indices = selector.get_support(indices=True)
    selected_features = features.columns[selected_feature_indices]

    return selected_features


def evaluate_model(model, X, y):
    """Print and return the accuracy of ``model`` on ``X`` against the labels ``y``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature data passed to ``model.predict``.
        y: True labels compared with the predictions.

    Returns:
        The value returned by ``accuracy_score``.
    """
    accuracy = accuracy_score(y, model.predict(X))
    print("Accuracy:", accuracy)
    return accuracy


def save_predictions(df, selected_features, model, output_file='test.csv'):
    """Predict ``loan_status`` for ``df`` and write the result to a CSV file.

    The file contains the predicted ``loan_status`` as its first column,
    followed by the selected feature columns; the index is not written.

    Args:
        df: DataFrame containing at least the selected feature columns.
        selected_features: Column labels fed to the model.
        model: Fitted estimator exposing ``predict``.
        output_file: Destination path of the CSV file.
    """
    features = df[selected_features]
    predictions = model.predict(features)

    # ``assign`` returns a new frame, so ``df`` itself is left unmodified.
    with_predictions = features.assign(loan_status=predictions)

    # Put the prediction column first, then the features in their original order.
    column_order = ['loan_status', *features.columns]
    with_predictions[column_order].to_csv(output_file, index=False)


# Name of the column the classifier predicts.
target = 'loan_status'

# Take copies of the three splits for the modelling steps below.
_df_train, _df_val, _df_test = df_train.copy(), df_val.copy(), df_test.copy()

# Keep the features whose importance reaches the selector's threshold.
selected_features = select_features(df_train, target)
display(selected_features)

# Fit the classifier on the selected features only.
model = train_xgboost_classifier(_df_train, selected_features, target)

# Report accuracy on the validation split.
evaluate_model(model, X=_df_val[selected_features], y=_df_val[target])

# Write the test-set predictions to the default output file.
save_predictions(_df_test, selected_features, model)

Index(['last_pymnt_amnt', 'funded_amnt', 'total_rec_prncp', 'recoveries'], dtype='object')

Accuracy: 0.999623397993001


In [24]:
# Fit permutation importances for the trained model on the validation split,
# then render the per-feature weights.
perm = PermutationImportance(model, random_state=1)
perm.fit(df_val[selected_features], df_val[target])

feature_names = selected_features.tolist()
display(eli5.show_weights(perm, feature_names=feature_names))

Weight,Feature
0.4649  ± 0.0017,total_rec_prncp
0.3443  ± 0.0015,funded_amnt
0.1413  ± 0.0007,recoveries
0.0014  ± 0.0001,last_pymnt_amnt
0.0000  ± 0.0000,term


In [23]:
# Spot-check the first rows of the `term` column of the training frame.
df_train['term'].head()

0    1
1    1
2    1
3    1
4    1
Name: term, dtype: int64