In [None]:
import warnings

# Suppress the Deprecation Warnings.
# The filter is registered before the third-party imports below, so it is already
# active while those libraries are being imported.
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Load in the necessary libraries.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from IPython.display import display
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OrdinalEncoder
from xgboost import XGBClassifier

In [None]:
# Read the training, validation and test splits from the Resources folder.
df_train, df_val, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('Resources/train.csv', 'Resources/valid.csv', 'Resources/X_test.csv')
)

In [None]:
# Preview the first rows of the training split and report its (rows, columns) shape.
display(df_train.head(), df_train.shape)

## Handling Missing Values


In [None]:
# Keep only the columns that are populated in at least half of the training rows,
# i.e. drop every column where more than 50% of the values are missing.
min_non_null = len(df_train) / 2
df_train.dropna(axis='columns', thresh=min_non_null, inplace=True)

print(df_train.shape)

## Detecting and Removing Outliers


In [None]:
# List the training columns stored as 64-bit integers or floats.
numeric_dtypes = ['int64', 'float64']
display(df_train.select_dtypes(include=numeric_dtypes).columns)

In [None]:
# Outlier handling is a business decision: what counts as an outlier, and how to
# react to it, depends on the underlying context rather than on the number alone.

# Box plot of the 'int_rate' column of the training split.
interest_rates = df_train['int_rate']
sns.boxplot(x=interest_rates)

## Feature Encoding


In [None]:
# Instantiate the scikit-learn encoders.
label_encoder, ordinal_encoder = LabelEncoder(), OrdinalEncoder()

# Show which training columns are still stored with the object dtype.
object_columns = df_train.select_dtypes(include=['object']).columns
display(object_columns)


# Month abbreviation -> month number for the "<Mon>-<year>" date columns.
_MONTH_NUMBERS = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
}

# Columns encoded with a fixed value -> integer lookup. Values that are not listed
# in a lookup become NaN (that is how ``Series.map`` treats unmapped values).
_VALUE_MAPS = {
    'term': {'60 months': 0, '36 months': 1},
    'grade': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6},
    # 'A1' -> 0, 'A2' -> 1, ..., 'A5' -> 4, 'B1' -> 5, ..., 'G5' -> 34.
    'sub_grade': {
        grade + str(level): 5 * grade_idx + level - 1
        for grade_idx, grade in enumerate('ABCDEFG')
        for level in range(1, 6)
    },
    # 'ANY', 'OTHER' and 'NONE' deliberately share a single code.
    'home_ownership': {'OWN': 0, 'RENT': 1, 'MORTGAGE': 2, 'ANY': 4, 'OTHER': 4, 'NONE': 4},
    'verification_status': {'Verified': 0, 'Source Verified': 1, 'Not Verified': 2},
    'pymnt_plan': {'y': 0, 'n': 1},
    'initial_list_status': {'f': 0, 'w': 1},
    'application_type': {'Joint App': 0, 'Individual': 1},
    'hardship_flag': {'N': 0, 'Y': 1},
    'disbursement_method': {'DirectPay': 0, 'Cash': 1},
    'debt_settlement_flag': {'Y': 0, 'N': 1},
}

# Columns that receive an integer label code, and whether their text is
# lower-cased before the code is assigned.
_LABEL_ENCODED_COLUMNS = {
    'emp_title': True,
    'purpose': False,
    'title': True,
    'zip_code': True,
    'addr_state': False,
}

# (source column, prefix of the new "<prefix>_m" / "<prefix>_y" columns,
#  fill value for a missing month, fill value for a missing year).
# A fill value of None means missing dates are not filled for that column.
_DATE_COLUMNS = (
    ('issue_d', 'issue', None, None),
    ('earliest_cr_line', 'earliest_cr', None, None),
    ('last_pymnt_d', 'last_pymnt', 99, '9999'),
    ('last_credit_pull_d', 'last_credit_pull', 0, '0000'),
)


def _encode_labels(series, categories=None):
    """Return ``(codes, categories)`` for ``series``; unknown or missing values get -1."""
    if categories is None:
        categories = sorted(series.dropna().unique())
    codes = pd.Categorical(series, categories=categories).codes
    # Categorical codes use the smallest integer dtype that fits; widen them to
    # int64 so dtype-based column selection treats them like the other encoded ints.
    return codes.astype('int64'), categories


def _split_month_year(df, source_col, prefix, month_fill=None, year_fill=None):
    """Replace ``source_col`` with integer ``<prefix>_m`` and ``<prefix>_y`` columns, in place."""
    parts = df[source_col].str.split('-', expand=True)
    month = parts[0].map(_MONTH_NUMBERS)
    year = parts[1]
    if month_fill is not None:
        month = month.fillna(month_fill)
    if year_fill is not None:
        year = year.fillna(year_fill)
    df[prefix + '_m'] = month.astype(int)
    df[prefix + '_y'] = year.astype(int)
    df.drop(columns=source_col, inplace=True)


def encode_cat_cols(df, label_categories=None):
    """Encode the categorical and date columns of ``df`` as numbers, in place.

    Three kinds of columns are handled:

    * fixed lookups (``_VALUE_MAPS``), plus ``emp_length`` whose " year(s)"
      suffix is stripped before the lookup and whose unmapped values become 0;
    * label-encoded text columns (``_LABEL_ENCODED_COLUMNS``), coded by the
      position of each value in the sorted list of known categories;
    * "<Mon>-<year>" date columns (``_DATE_COLUMNS``), each replaced by an
      integer month column and an integer year column.

    The label categories used to be re-learned on every frame, so the same
    string could map to a different integer in the training, validation and
    test splits. Passing one shared ``label_categories`` dict to every call
    keeps the codes consistent.

    Args:
        df: DataFrame to encode. It is modified in place and must still hold
            the raw columns, so call this once per freshly loaded frame.
        label_categories: Optional dict mapping column name to its list of
            known categories. Columns missing from the dict are learned from
            ``df`` and stored in it; columns already present are reused, and
            values outside the stored list (or missing values) are coded -1.
            With the default ``None`` every column is learned from ``df``.

    Returns:
        The same ``df`` object, for convenience.

    Raises:
        KeyError: If ``df`` lacks one of the expected columns.
        ValueError: If ``issue_d`` or ``earliest_cr_line`` contains a missing or
            unrecognised date, because those columns have no fill value before
            the integer cast.
    """
    if label_categories is None:
        label_categories = {}

    for column, lookup in _VALUE_MAPS.items():
        df[column] = df[column].map(lookup)

    # '10+ years' -> '10+', '< 1 year' -> '< 1', '3 years' -> '3', ...
    emp_length = df['emp_length'].str.replace(' years?', '', regex=True)
    emp_length_codes = {'< 1': 0, '10+': 10}
    emp_length_codes.update({str(years): years for years in range(1, 10)})
    df['emp_length'] = emp_length.map(emp_length_codes).fillna(0).astype(int)

    for column, lowercase in _LABEL_ENCODED_COLUMNS.items():
        values = df[column].str.lower() if lowercase else df[column]
        codes, categories = _encode_labels(values, label_categories.get(column))
        df[column] = codes
        label_categories[column] = categories

    for source_col, prefix, month_fill, year_fill in _DATE_COLUMNS:
        _split_month_year(df, source_col, prefix, month_fill, year_fill)

    return df


# Learn the label categories on the training split only, then reuse them so that
# a given string is encoded as the same integer in every split.
label_categories = {}
encode_cat_cols(df_train, label_categories)
encode_cat_cols(df_val, label_categories)
encode_cat_cols(df_test, label_categories)

## Feature Scaling


In [None]:
def scale_features(df, scaler=None):
    """Min-max scale the numeric feature columns of ``df`` in place.

    Fitting a separate scaler on every split maps the same raw value to
    different scaled values in the training, validation and test data. To keep
    the splits comparable, fit once on the training frame and pass the returned
    scaler to the calls for the other frames.

    Args:
        df: DataFrame whose columns are overwritten with their scaled values.
        scaler: Optional ``MinMaxScaler`` already fitted on a DataFrame. When
            given, the columns it was fitted on are transformed with its
            learned ranges. When ``None``, a new scaler is fitted on the
            int/float columns of ``df``, excluding 'loan_status' if present.

    Returns:
        The scaler that was applied (newly fitted, or the one passed in).

    Raises:
        KeyError: If ``scaler`` is given and ``df`` lacks one of the columns it
            was fitted on.
    """
    if scaler is None:
        numerical_columns = df.select_dtypes(include=['int', 'float']).columns
        if 'loan_status' in numerical_columns:
            # The target column must keep its original values.
            numerical_columns = numerical_columns.drop('loan_status')
        scaler = MinMaxScaler()
        df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
    else:
        # Reuse exactly the columns (and their order) seen when the scaler was fitted.
        numerical_columns = list(scaler.feature_names_in_)
        df[numerical_columns] = scaler.transform(df[numerical_columns])

    return scaler


# Fit the scaler on the training split only and apply the same ranges everywhere.
feature_scaler = scale_features(df_train)
scale_features(df_val, feature_scaler)
# The trailing semicolon keeps the notebook from echoing the returned scaler.
scale_features(df_test, feature_scaler);

In [None]:
def train_xgboost_classifier(df_train, features, target):
    """Fit an ``XGBClassifier`` with default settings.

    Args:
        df_train: DataFrame holding both the feature columns and the target column.
        features: Column labels used as model inputs.
        target: Label of the column to predict.

    Returns:
        The fitted classifier.
    """
    X_train = df_train[features]
    y_train = df_train[target]

    classifier = XGBClassifier()
    classifier.fit(X_train, y_train)
    return classifier


def select_features(df_train, target, feature_importance_threshold=0.01):
    """Pick the columns whose XGBoost importance reaches a threshold.

    Args:
        df_train: DataFrame holding the candidate feature columns and the target.
        target: Label of the target column; every other column is a candidate.
        feature_importance_threshold: Threshold handed to ``SelectFromModel``.

    Returns:
        The labels of the selected columns, as a slice of the candidate
        columns' index.
    """
    candidates = df_train.drop(columns=target)
    labels = df_train[target]

    # SelectFromModel fits the wrapped classifier and applies the threshold
    # to its importance scores.
    selector = SelectFromModel(XGBClassifier(), threshold=feature_importance_threshold)
    selector.fit(candidates, labels)

    kept_positions = selector.get_support(indices=True)
    return candidates.columns[kept_positions]


def evaluate_model(model, X, y):
    """Print and return the accuracy of ``model`` on ``X`` against the labels ``y``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature data passed to ``model.predict``.
        y: True labels compared with the predictions.

    Returns:
        The accuracy score.
    """
    accuracy = accuracy_score(y, model.predict(X))
    print("Accuracy:", accuracy)
    return accuracy


def save_predictions(df, selected_features, model, output_file='test.csv'):
    """Predict on ``df`` and write the predictions plus the used features to CSV.

    Args:
        df: DataFrame containing at least the ``selected_features`` columns.
        selected_features: Column labels fed to the model.
        model: Fitted estimator exposing ``predict``.
        output_file: Destination path of the CSV file.

    The written file has 'loan_status' (the predictions) as its first column,
    followed by the selected feature columns, and no index column.
    """
    features = df[selected_features]
    predictions = model.predict(features)

    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    column_order = ['loan_status', *features.columns]
    output = features.assign(loan_status=predictions)[column_order]

    output.to_csv(output_file, index=False)


# Define the target variable
target = 'loan_status'

# Select features based on importance scores
selected_features = select_features(df_train, target)
display(selected_features)

# Train XGBoost classifier on the selected features only
model = train_xgboost_classifier(df_train, selected_features, target)

# Evaluate the model on validation set
evaluate_model(model, df_val[selected_features], df_val[target])

# Save predictions on test set
save_predictions(df_test, selected_features, model)