In [36]:
import warnings

# Suppress the Deprecation Warnings.
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Load in the necessary libraries.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [37]:
# Load in the dataset.
df_train = pd.read_csv('Resources/train.csv')
df_val = pd.read_csv('Resources/valid.csv')

In [None]:
# Get a general overview of the dataset.
print(df_train.head(), end='\n\n')
print(df_train.shape)

- In data cleaning several tasks are needed to be done.

  1. **Handle missing values.**
  2. **Smoothen noisy data.**
  3. **Identify and remove outliers.**
  4. **Correct inconsistencies.**
  5. **Resolve redundancies caused by data integration.**


### Handling Missing Values


In [None]:
# Get the number of missing data points per column.
pd.set_option('display.max_rows', None)

missing_values_count = df_train.isnull().sum()
print(missing_values_count)

pd.set_option('display.max_rows', 10)

In [38]:
# Drop the columns where more than 50% of the data is missing.
df_train.dropna(axis='columns', inplace=True, thresh=len(df_train)/2)
# df_val.dropna(axis='columns', inplace=True, thresh=len(df_val)/2)

print(df_train.shape)

(517788, 87)


In [None]:
# Find out the categorical columns with missing data.
categorical_nan_columns = [col for col in df_train.columns if df_train[col].dtype == 'object' and df_train[col].isnull().any()]
print(categorical_nan_columns)

In [None]:
# Certain categorical columns can be automatically filled.
df_train.fillna(value={'emp_title': 'unemployed'}, inplace=True)
df_train.fillna(value={'emp_length': '< 1 year'}, inplace=True)
df_train.fillna(value={'title': 'not provided'}, inplace=True)
df_train.fillna(value={'zip_code': 'unknown'}, inplace=True)


df_val.fillna(value={'emp_title': 'unemployed'}, inplace=True)
df_val.fillna(value={'emp_length': '< 1 year'}, inplace=True)
df_val.fillna(value={'title': 'not provided'}, inplace=True)
df_val.fillna(value={'zip_code': 'unknown'}, inplace=True)

In [None]:
# Convert 'last_pymnt_d' to datetime format.
df_train['last_pymnt_d'] = pd.to_datetime(df_train['last_pymnt_d'], format='%b-%Y', errors='coerce')

# Extract year and month.
df_train['last_pymnt_year'] = df_train['last_pymnt_d'].dt.year
df_train['last_pymnt_month'] = df_train['last_pymnt_d'].dt.month

# Fill NaN values.
df_train['last_pymnt_year'].fillna(9999, inplace=True)
df_train['last_pymnt_month'].fillna(99, inplace=True)

# Convert year and month to int as they might be float due to NaNs.
df_train['last_pymnt_year'] = df_train['last_pymnt_year'].astype(int)
df_train['last_pymnt_month'] = df_train['last_pymnt_month'].astype(int)

# Drop the 'last_pymnt_d' column.
df_train.drop('last_pymnt_d', axis=1, inplace=True)


# Convert 'last_pymnt_d' to datetime format.
df_val['last_pymnt_d'] = pd.to_datetime(df_val['last_pymnt_d'], format='%b-%Y', errors='coerce')

# Extract year and month.
df_val['last_pymnt_year'] = df_val['last_pymnt_d'].dt.year
df_val['last_pymnt_month'] = df_val['last_pymnt_d'].dt.month

# Fill NaN values.
df_val['last_pymnt_year'].fillna(9999, inplace=True)
df_val['last_pymnt_month'].fillna(99, inplace=True)

# Convert year and month to int as they might be float due to NaNs.
df_val['last_pymnt_year'] = df_val['last_pymnt_year'].astype(int)
df_val['last_pymnt_month'] = df_val['last_pymnt_month'].astype(int)

# Drop the 'last_pymnt_d' column.
df_val.drop('last_pymnt_d', axis=1, inplace=True)

In [None]:
# Convert 'last_pymnt_d' to datetime format.
df_train['last_credit_pull_d'] = pd.to_datetime(df_train['last_credit_pull_d'], format='%b-%Y', errors='coerce')

# Extract year and month.
df_train['last_credit_pull_year'] = df_train['last_credit_pull_d'].dt.year
df_train['last_credit_pull_month'] = df_train['last_credit_pull_d'].dt.month

# Fill NaN values.
df_train['last_credit_pull_year'].fillna(0000, inplace=True)
df_train['last_credit_pull_month'].fillna(00, inplace=True)

# Convert year and month to int as they might be float due to NaNs.
df_train['last_credit_pull_year'] = df_train['last_credit_pull_year'].astype(int)
df_train['last_credit_pull_month'] = df_train['last_credit_pull_month'].astype(int)

# Drop the 'last_pymnt_d' column.
df_train.drop('last_credit_pull_d', axis=1, inplace=True)


# Convert 'last_pymnt_d' to datetime format.
df_val['last_credit_pull_d'] = pd.to_datetime(df_val['last_credit_pull_d'], format='%b-%Y', errors='coerce')

# Extract year and month.
df_val['last_credit_pull_year'] = df_val['last_credit_pull_d'].dt.year
df_val['last_credit_pull_month'] = df_val['last_credit_pull_d'].dt.month

# Fill NaN values.
df_val['last_credit_pull_year'].fillna(0000, inplace=True)
df_val['last_credit_pull_month'].fillna(00, inplace=True)

# Convert year and month to int as they might be float due to NaNs.
df_val['last_credit_pull_year'] = df_val['last_credit_pull_year'].astype(int)
df_val['last_credit_pull_month'] = df_val['last_credit_pull_month'].astype(int)

# Drop the 'last_pymnt_d' column.
df_val.drop('last_credit_pull_d', axis=1, inplace=True)

In [None]:
# For now drop all other rows with missing values.
df_train.dropna(axis='rows', inplace=True)


df_val.dropna(axis='rows', inplace=True)

## Feature Scaling

- If features are not Gaussian-like, say, has a skewed distribution or has outliers, Normalization - Standardization is not a good choice as it will compress most data to a narrow range.

- However, we can transform the feature into Gaussian like and then use Normalization - Standardization.

- When performing distance or covariance calculation (algorithm like Clustering, PCA and LDA), it is better to use Normalization - Standardization as it will remove the effect of scales on variance and covariance.

- Min-Max scaling has the same drawbacks as Normalization - Standardization, and also new data may not be bounded to [0,1] as they can be out of the original range. Some algorithms, for example some deep learning network prefer input on a 0-1 scale so this is a good choice.


In [None]:
plt.figure(figsize=(8, 6))

sns.kdeplot(df_train['loan_amnt'], color='skyblue', fill=True)

plt.xlabel('Loan Amount')
plt.ylabel('Frequency')

plt.show()

In [39]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# le = LabelEncoder()
# df['term'] = le.fit_transform(df['term'])
# Create a copy of the DataFrame to avoid modifying the original DataFrame
df_scaled = df_train.copy()

# Encode categorical columns using LabelEncoder
label_encoders = {}
for column in df_train.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df_scaled[column] = label_encoders[column].fit_transform(df_train[column])

# Scale numerical columns using MinMaxScaler
scaler = MinMaxScaler()
numerical_columns = df_train.select_dtypes(include=['int', 'float']).columns
df_scaled[numerical_columns] = scaler.fit_transform(df_train[numerical_columns])

# Display the scaled DataFrame
print(df_scaled.head())


# le = LabelEncoder()
# df['term'] = le.fit_transform(df['term'])
# Create a copy of the DataFrame to avoid modifying the original DataFrame
df_scaled_val = df_val.copy()

# Encode categorical columns using LabelEncoder
label_encoders = {}
for column in df_val.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df_scaled_val[column] = label_encoders[column].fit_transform(df_val[column])

# Scale numerical columns using MinMaxScaler
numerical_columns = df_val.select_dtypes(include=['int', 'float']).columns
df_scaled_val[numerical_columns] = scaler.fit_transform(df_val[numerical_columns])

# Display the scaled DataFrame
print(df_scaled_val.head())

   loan_amnt  funded_amnt  funded_amnt_inv  term  int_rate  installment  \
0   0.341772     0.341772          0.35000     0  0.079439     0.251156   
1   0.037975     0.037975          0.05000     0  0.427570     0.038398   
2   0.113924     0.113924          0.11875     0  0.080997     0.087899   
3   0.508861     0.508861          0.51500     0  0.260125     0.397073   
4   0.240506     0.240506          0.25000     0  0.080997     0.178680   

   grade  sub_grade  emp_title  emp_length  ...  pub_rec_bankruptcies  \
0      0          3      96610           0  ...              0.000000   
1      3         16     175168          11  ...              0.083333   
2      0          3     166043           4  ...              0.000000   
3      1          9     175168          11  ...              0.000000   
4      0          3     122003          10  ...              0.000000   

   tax_liens  tot_hi_cred_lim  total_bal_ex_mort  total_bc_limit  \
0   0.022222         0.006794           0.

  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))


   id  member_id  loan_amnt  funded_amnt  funded_amnt_inv  term  int_rate  \
0 NaN        NaN   0.356329     0.356329         0.364375     1  0.477414   
1 NaN        NaN   0.240506     0.240506         0.250000     0  0.322430   
2 NaN        NaN   0.417722     0.417722         0.425000     0  0.313084   
3 NaN        NaN   0.101266     0.101266         0.112500     0  0.179907   
4 NaN        NaN   0.145570     0.145570         0.156250     0  0.299065   

   installment  grade  sub_grade  ...  hardship_last_payment_amount  \
0     0.206440      3         16  ...                           NaN   
1     0.190590      2         11  ...                           NaN   
2     0.329469      2         11  ...                           NaN   
3     0.075939      1          6  ...                           NaN   
4     0.114504      1          8  ...                           NaN   

   disbursement_method  debt_settlement_flag  debt_settlement_flag_date  \
0                    0             

In [40]:
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier

# Define the XGBoost classifier
xgb_classifier = XGBClassifier()

# Perform feature selection with SelectFromModel
selector = SelectFromModel(xgb_classifier, threshold=-np.inf, max_features=10)  # Select top 10 features
selector.fit(df_scaled.drop('loan_status', axis=1), df_scaled['loan_status'])

# Get selected feature indices
selected_feature_indices = selector.get_support(indices=True)

# Get selected feature names
selected_feature_names = df_scaled.drop('loan_status', axis=1).columns[selected_feature_indices]

# Subset the DataFrame with selected features
X_selected = df_scaled[selected_feature_names]

# Now you can proceed with model training using X_selected and df_scaled['loan_status']

In [None]:
# Get the indices of selected features
selected_indices = selector.get_support()

# Get the names of selected features
selected_features = df_scaled.drop('loan_status', axis=1).columns[selected_indices]

# Print the names of selected features
print("Selected Features:")
print(selected_features)

In [41]:
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Define the XGBoost classifier
xgb_classifier = XGBClassifier()

# Perform feature selection with SelectFromModel
selector = SelectFromModel(xgb_classifier, threshold=-np.inf, max_features=10)  # Select top 10 features
selector.fit(df_scaled.drop('loan_status', axis=1), df_scaled['loan_status'])

# Get selected feature indices
selected_feature_indices = selector.get_support(indices=True)

# Get selected feature names
selected_feature_names = df_scaled.drop('loan_status', axis=1).columns[selected_feature_indices]

# Subset the DataFrame with selected features for training
X_selected_train = df_scaled[selected_feature_names]
y_train = df_scaled['loan_status']

# Subset the DataFrame with selected features for validation
X_selected_val = df_scaled_val[selected_feature_names]
y_val = df_scaled_val['loan_status']

# Initialize XGBoost classifier with default hyperparameters
xgb_classifier = XGBClassifier()

# Train the classifier on the selected features using training data
xgb_classifier.fit(X_selected_train, y_train)

# Predict on the validation dataset
y_pred_val = xgb_classifier.predict(X_selected_val)

# Evaluate the classifier on the validation dataset
accuracy_val = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy:", accuracy_val)

# Print classification report for validation dataset
print("Validation Classification Report:")
print(classification_report(y_val, y_pred_val))

Validation Accuracy: 0.9995828408537857
Validation Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     52331
         1.0       1.00      1.00      1.00    120265

    accuracy                           1.00    172596
   macro avg       1.00      1.00      1.00    172596
weighted avg       1.00      1.00      1.00    172596

