In [36]:
librarian::shelf(tidyverse, tidymodels, DataExplorer, GGally, corrplot, plotly, viridis,
pROC, randomForest, factoextra, cluster, ggthemes, ggridges, scales, vcd, kableExtra, DT, 
MASS, Rtsne, tsne, umap, conflicted)

# 1. Introduction

This document presents an intensive exploratory data analysis (EDA) and statistical analysis of a loan default dataset. The goal is to uncover patterns, relationships, and insights that can help predict loan defaults. The analysis includes:

- Data preprocessing and cleaning
- Univariate analysis with advanced visualizations
- Bivariate and multivariate analysis
- Correlation analysis with sophisticated visualizations
- Feature importance and selection
- Statistical modeling and predictive analysis
- Advanced dimensionality reduction and visualization techniques

Let's begin by loading and examining the dataset.

# 2. Data Loading and Initial Exploration

In [37]:
# Resolve the `select()` name clash explicitly in favour of dplyr, so that
# later unqualified `select()` calls use dplyr's version.
conflicted::conflicts_prefer(dplyr::select)
# NOTE(review): presumably registers tidymodels' preferred winners for the
# remaining masked functions — confirm against the tidymodels documentation.
tidymodels_prefer()

[1m[22m[90m[conflicted][39m Removing existing preference.
[1m[22m[90m[conflicted][39m Will prefer [1m[34mdplyr[39m[22m::select over any other package.


In [38]:
# Read the raw loan-default CSV into a tibble; column types are guessed by readr.
loan_data <- readr::read_csv(file = "dataset/Loan_Default.csv")

[1mRows: [22m[34m148670[39m [1mColumns: [22m[34m34[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (21): loan_limit, Gender, approv_in_adv, loan_type, loan_purpose, Credit...
[32mdbl[39m (13): ID, year, loan_amount, rate_of_interest, Interest_rate_spread, Upf...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [39]:
# Report the size of the data (rows and columns) on a single line.
cat(
  "Dimensions of the loan_dataset", nrow(loan_data),
  "rows and ", ncol(loan_data), " columns"
)

Dimensions of the loan_dataset 148670 rows and  34  columns

In [40]:
# Preview the first six rows of the data.
head(loan_data, n = 6L)

[38;5;246m# A tibble: 6 × 34[39m
     ID  year loan_limit Gender            approv_in_adv loan_type loan_purpose
  [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<chr>[39m[23m      [3m[38;5;246m<chr>[39m[23m             [3m[38;5;246m<chr>[39m[23m         [3m[38;5;246m<chr>[39m[23m     [3m[38;5;246m<chr>[39m[23m       
[38;5;250m1[39m [4m2[24m[4m4[24m890  [4m2[24m019 cf         Sex Not Available nopre         type1     p1          
[38;5;250m2[39m [4m2[24m[4m4[24m891  [4m2[24m019 cf         Male              nopre         type2     p1          
[38;5;250m3[39m [4m2[24m[4m4[24m892  [4m2[24m019 cf         Male              pre           type1     p1          
[38;5;250m4[39m [4m2[24m[4m4[24m893  [4m2[24m019 cf         Male              nopre         type1     p4          
[38;5;250m5[39m [4m2[24m[4m4[24m894  [4m2[24m019 cf         Joint             pre           type1     p1          
[38;5;250m6[3

In [None]:
# Summarise every column with skimr and render the result as a table.
# `as_tibble()` replaces the deprecated `as.tibble()` alias.
skimr::skim(loan_data) %>%
  as_tibble() %>%
  kable(caption = "Summary Statistics of the loan data")

In [25]:
# Convert all character columns to factors.
# `mutate(across(where(...)))` replaces the superseded `mutate_if()`.
loan_data <- loan_data %>%
  mutate(across(where(is.character), as.factor))

In [34]:
# Normalise the column names with janitor and drop the `id` column.
# The assignment is written up front with `<-` so the target of the pipeline
# is visible on its first line.
loan_data <- loan_data %>%
  janitor::clean_names() %>%
  select(-id)


## Missing Data Analysis


In [None]:
# Bar chart of missing values, restricted to columns that have any.
# plot_missing() draws the chart itself, so the theme is passed through its
# `ggtheme` / `theme_config` arguments rather than added with `+`, which would
# render a second copy of the plot.
plot_missing(
  loan_data,
  missing_only = TRUE,
  title = "Missing Values Bar Chart",
  ggtheme = theme_minimal(),
  theme_config = list(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(face = "bold")
  )
)

In [None]:
# Find the mode (most frequent non-missing value) of a vector or factor.
#
# Arguments:
#   x - an atomic vector or factor; missing values are ignored.
#
# Returns a length-one value of the same type as `x`. When several values tie
# for the highest count, the one that appears first in `x` wins. When `x` is
# empty or contains only missing values there is no mode, and a single NA of
# the same type is returned so callers always receive a length-one result.
find_mode <- function(x) {
  observed <- x[!is.na(x)]
  if (length(observed) == 0L) {
    # Indexing with NA_integer_ yields one NA that keeps the type (and, for
    # factors, the levels) of `x`.
    return(x[NA_integer_])
  }

  candidates <- unique(observed)
  # match() maps each observation to its position in `candidates`, so
  # tabulate() gives the count of every candidate in first-seen order.
  counts <- tabulate(match(observed, candidates), nbins = length(candidates))
  candidates[which.max(counts)]
}

# Fill missing values column by column: numeric columns receive their median,
# factor columns receive their most frequent level (via find_mode()).
loan_data_imputed <- loan_data %>%
  mutate(
    across(
      where(is.numeric),
      function(col) ifelse(is.na(col), median(col, na.rm = TRUE), col)
    )
  ) %>%
  mutate(
    across(
      where(is.factor),
      function(col) {
        # Work on the labels so the replacement is a plain string, then rebuild
        # the factor from the filled-in labels.
        labels <- as.character(col)
        as.factor(ifelse(is.na(col), as.character(find_mode(col)), labels))
      }
    )
  )

# TRUE here means at least one missing value survived the imputation.
loan_data_imputed %>%
  is.na() %>%
  any()

# 3. Univariate Analysis

## 3.1 Distribution of Numeric Variables

In [None]:
# Names of the numeric columns. They are derived here so that this cell does
# not fail with "object 'numeric_cols' not found" when the notebook is run from
# top to bottom (the next cell defines the same vector again).
numeric_cols <- loan_data %>%
  select(where(is.numeric)) %>%
  names()
numeric_cols

In [None]:
library(gridExtra)
# Names of all numeric columns; used to drive the univariate plots.
# `select(where(...))` replaces the superseded `select_if()`.
numeric_cols <- loan_data %>%
  select(where(is.numeric)) %>%
  names()
# Density plot of one numeric column with a normal reference curve.
#
# Arguments:
#   data     - data frame containing the column.
#   variable - name of a numeric column of `data`, given as a string.
#
# Returns a ggplot object showing the kernel density, a rug of the
# observations and a dashed normal curve built from the column's mean and
# standard deviation. Mean, median and SD are reported in the subtitle;
# skewness and kurtosis (moments package) in the caption. Missing values are
# ignored in every statistic.
advanced_density_plot <- function(data, variable) {
  # Compute the statistics once instead of re-deriving them for every layer.
  values <- data[[variable]]
  avg <- mean(values, na.rm = TRUE)
  spread <- sd(values, na.rm = TRUE)

  # `.data[[variable]]` replaces the deprecated aes_string(); the x label is
  # set explicitly in labs() so the axis always shows the column name.
  ggplot(data, aes(x = .data[[variable]])) +
    geom_density(fill = "#3498db", alpha = 0.7) +
    geom_rug(alpha = 0.1, color = "#e74c3c") +
    stat_function(
      fun = function(x) dnorm(x, mean = avg, sd = spread),
      # `linewidth` is the line-thickness argument; `size` is deprecated for
      # lines in current ggplot2.
      color = "#e74c3c", linetype = "dashed", linewidth = 1
    ) +
    labs(
      title = paste("Distribution of", variable),
      subtitle = paste(
        "Mean =", round(avg, 2),
        "| Median =", round(median(values, na.rm = TRUE), 2),
        "| SD =", round(spread, 2)
      ),
      caption = paste(
        "Skewness =", round(moments::skewness(values, na.rm = TRUE), 2),
        "| Kurtosis =", round(moments::kurtosis(values, na.rm = TRUE), 2)
      ),
      x = variable
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(face = "bold", hjust = 0.5),
      plot.subtitle = element_text(hjust = 0.5),
      plot.caption = element_text(hjust = 1)
    )
}

# Draw density plots for up to the first nine numeric columns, three per row.
if (length(numeric_cols) > 0) {
  plot_list <- lapply(
    head(numeric_cols, 9),
    function(col_name) advanced_density_plot(loan_data, col_name)
  )
  grid.arrange(grobs = plot_list, ncol = 3)
}

In [None]:
if (length(numeric_cols) > 0) {
  # Violin + box plot of one numeric column, annotated with its IQR and the
  # percentage of observations lying outside the 1.5 * IQR bounds.
  #
  # Arguments:
  #   data     - data frame containing the column.
  #   variable - name of a numeric column of `data`, given as a string.
  #
  # Returns a ggplot object. Missing values are excluded from the IQR and from
  # the outlier percentage; the percentage is NA when the column has no
  # observed values.
  advanced_boxplot <- function(data, variable) {
    values <- data[[variable]]
    observed <- values[!is.na(values)]

    # Outlier bounds: 1.5 * IQR beyond the first and third quartiles.
    quartiles <- quantile(observed, c(0.25, 0.75), names = FALSE)
    iqr <- quartiles[2] - quartiles[1]
    lower_bound <- quartiles[1] - 1.5 * iqr
    upper_bound <- quartiles[2] + 1.5 * iqr

    # Count outliers among the observed values only. Subsetting the raw column
    # with a logical vector that contains NA yields one NA element per missing
    # value, which would be counted as an outlier and inflate the percentage.
    n_outliers <- sum(observed < lower_bound | observed > upper_bound)
    outlier_pct <- if (length(observed) > 0) {
      n_outliers / length(observed) * 100
    } else {
      NA_real_
    }

    # `.data[[variable]]` replaces the deprecated aes_string().
    ggplot(data, aes(x = 1, y = .data[[variable]])) +
      geom_violin(fill = "#9b59b6", alpha = 0.6) +
      geom_boxplot(
        width = 0.1, fill = "#3498db", alpha = 0.7,
        outlier.color = "#e74c3c"
      ) +
      labs(
        title = paste("Distribution of", variable),
        subtitle = paste(
          "IQR =", round(iqr, 2),
          "| Outliers =", round(outlier_pct, 2), "%"
        ),
        y = variable, x = ""
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(face = "bold", hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        # The x position is a dummy constant, so its axis carries no information.
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank()
      )
  }

  # Create boxplots for up to the first nine numeric variables, three per row.
  plot_list <- lapply(
    head(numeric_cols, 9),
    function(col) advanced_boxplot(loan_data, col)
  )
  do.call(grid.arrange, c(plot_list, ncol = 3))
}