# EDA

## Date: Nov 7, 2023

---------------

## Table of Contents

## Introduction

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path

## Data Dictionary

In [None]:
# Lift pandas' display truncation limits (rows, columns and cell width) so
# frames such as the data dictionary are shown in full.
for display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [None]:
# pathlib is used to ensure compatibility across operating systems
data_destination = Path('../Data/Lending_club/Lending Club Data Dictionary Approved.csv')

try:
    # Only the read is guarded: it is the step that can hit a missing file
    dict_df = pd.read_csv(data_destination, encoding='ISO-8859-1')
except FileNotFoundError as e:
    # Print the exception itself rather than e.args[1]: args is not guaranteed
    # to hold two entries, and str(e) also includes the offending path.
    print(e)
    print('Check file location')
else:
    # Show the first two columns of the data dictionary
    display(dict_df.iloc[:, 0:2])

## Load the Data

In [None]:
# Relative path to the cleaned parquet data
# NOTE(review): the path has no file extension, so this may be a partitioned
# parquet directory rather than a single file - confirm.
parquet_file_path = Path('../Data/Lending_club/Cleaned')

try:
    # Read the parquet data into the main loans frame
    loans_df = pd.read_parquet(parquet_file_path)
except FileNotFoundError as e:
    # Print the exception itself rather than e.args[1]: args is not guaranteed
    # to hold two entries, and str(e) also includes the offending path.
    print(e)
    print('Check file location')
    # Every later cell needs loans_df; stop here instead of continuing with an
    # undefined (or stale, from a previous run) frame.
    raise

In [None]:
# Preview the first rows of the loaded loans frame
loans_df.head()

In [None]:
# Count how many loans share each distinct issue_d value
loans_df['issue_d'].value_counts()

## Exploratory Data Analysis

In [None]:
# Split the loans into two subsets by their loan_status label:
# fully paid loans, and charged off / defaulted loans
paid_loans = loans_df.loc[loans_df['loan_status'].eq("Fully Paid")]
defaulted_loans = loans_df.loc[loans_df['loan_status'].eq("Charged Off/Default")]

***Loan Status Imbalance***

We can explore the imbalance we have between our failed and successful loans.

In [None]:
# Use seaborn's white background with grid lines
sns.set_style("whitegrid")

# Share of each loan status, as a fraction of all loans
loan_status_counts = loans_df['loan_status'].value_counts(normalize=True)

# One bar per status, with the tick labels tilted for readability
ax = loan_status_counts.plot.bar(color='skyblue', rot=45)
ax.set_title('Proportion of loans by Status')
ax.set_xlabel('Loan Status')
ax.set_ylabel('Proportion')

# Tidy the layout and render the figure
plt.tight_layout()
plt.show()

We can see that we have a large difference between our categories. This will need to be taken into consideration when we start creating the models.

***Loan Amount***

In [None]:
# Univariate analysis: histogram of loan_amnt in 20 bins with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=loans_df, x='loan_amnt', bins=20, kde=True, ax=ax)
ax.set_title('Loan Amount Distribution')
plt.show()

We can see that the majority of loans center around \$10,000, with a right tail extending to the maximum of \$40,000.   
This is due to LC limiting the loan amount to \$40,000.  
More Information can be found here:  
https://www.lendingclub.com/help/personal-loan-faq/how-much-can-i-borrow

In [None]:
# Select only the numeric columns for the correlation matrix
numeric_df = loans_df.select_dtypes(include=[np.number])

# Pairwise correlation between the numeric columns
corr = numeric_df.corr()

# Mask the upper triangle so each pair is shown only once.
# np.triu keeps the diagonal, so the self-correlations are hidden as well.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
plt.figure(figsize=(10, 8))

# Draw the annotated heatmap on a diverging scale fixed to [-1, 1] and centred on 0
sns.heatmap(corr, mask=mask, cmap='coolwarm', vmax=1, vmin=-1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True, fmt=".2f")

# Title the figure so it can be read on its own, like the other plots in this notebook
plt.title('Correlation Matrix of Numeric Features')

plt.show()

***Debt to income vs Loan Status***

In [None]:
# One box of dti values per loan status, side by side
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=loans_df, x='loan_status', y='dti', ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set(
    xlabel='Loan Status',
    ylabel='Debt to Income Ratio',
    title='Debt to Income for Failed and Successful Loans',
)

plt.show()

Looking at the boxplot, we can see that the median DTI is lower for the successful loans, which also have a smaller IQR. Borrowers with a lower DTI ratio are more likely to repay their loans.

In [None]:
# Parse issue_d with the month-abbreviation/year format ('%b-%Y') and store
# the resulting datetimes back on loans_df
loans_df['issue_d'] = pd.to_datetime(loans_df['issue_d'], format='%b-%Y')

# Number of loans per issue date
loans_over_time = loans_df.groupby('issue_d').size()

# Line chart of the loan counts over time
fig, ax = plt.subplots(figsize=(10, 5))
loans_over_time.plot(ax=ax, title='Number of Loans Issued Over Time')
ax.set_xlabel('Issue Date')
ax.set_ylabel('Number of Loans')
plt.show()

In [None]:
# Make sure 'issue_d' is a datetime column (only issue_d is converted here)
loans_df['issue_d'] = pd.to_datetime(loans_df['issue_d'], format='%b-%Y')

# Group once by issue date, then derive both series from the same grouping
loans_by_issue_date = loans_df.groupby('issue_d')
loans_count = loans_by_issue_date.size()                          # loans per issue date
average_interest_rate = loans_by_issue_date['int_rate'].mean()    # mean int_rate per issue date

# Left y-axis (blue): number of loans
fig, ax1 = plt.subplots(figsize=(10, 5))
ax1.plot(loans_count.index, loans_count, color='tab:blue')
ax1.set_xlabel('Issue Date')
ax1.set_ylabel('Number of Loans', color='tab:blue')
ax1.tick_params(axis='y', labelcolor='tab:blue')

# Right y-axis (red): average interest rate, on a twin Axes sharing the x-axis
ax2 = ax1.twinx()
ax2.plot(average_interest_rate.index, average_interest_rate, color='tab:red')
ax2.set_ylabel('Average Interest Rate', color='tab:red')
ax2.tick_params(axis='y', labelcolor='tab:red')

# Title goes on the twin Axes (the current Axes at this point), then render
ax2.set_title('Number of Loans and Average Interest Rate Over Time')
fig.tight_layout()
plt.show()


***Loan Amount and Loan Status Correlation***

In [None]:
# A hexbin is more appropriate than plotting individual points due to the number
# of datapoints. The count in each hexagon is shown on the colorbar at the right.
# Only fully paid loans (paid_loans) are plotted; this is stated in the title
# because an artist label is not displayed without a legend.
plt.hexbin(paid_loans['funded_amnt'], paid_loans['int_rate'], gridsize=20)
plt.colorbar(label='Count in bin')
# The x-axis is funded_amnt, so the axis label and title name it accordingly
plt.xlabel('Funded Amount')
plt.xticks(rotation=45)
plt.ylabel('Interest Rate')
plt.title('Hexbin plot of Interest Rate vs Funded Amount (Fully Paid loans)')
plt.show()

Notice how there isn't much variation between late and "in grace period" loans, but there is between fully paid and defaulted / charged off loans. Charged off / defaulted loans have the highest median interest rate, with fully paid loans having one of the lowest. When considered alongside the hexbin plot, the majority of loans fall between \$5,000 and \$10,000 with an interest rate of approximately 12%, while the defaulted / charged off loans have a much higher interest rate, placing them further from the central grouping of data on the hexbin plot. 

In [None]:
# One box of int_rate values per loan status
sns.boxplot(data=loans_df, x='loan_status', y='int_rate')
plt.xticks(rotation=45)
# The y variable is int_rate, so the title names interest rate (not loan amount)
plt.title('Boxplot of Interest Rate by Loan Status')
plt.xlabel('Loan Status')
plt.ylabel('Interest Rate')
plt.show()

In [None]:
# Horizontal bar chart of how often each loan purpose occurs
ax = loans_df['purpose'].value_counts().plot.barh()
ax.set_title('Purpose of Loans')
ax.set_xlabel('Frequency')
ax.set_ylabel('Purpose')
# Tilt the frequency tick labels on the x-axis
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Remove outliers: keep only rows whose annual_inc is at or below its 95th percentile
percentile_95 = loans_df['annual_inc'].quantile(0.95)
filtered_loans_df = loans_df.loc[loans_df['annual_inc'].le(percentile_95)]

# Hexbin of annual income against loan amount; the colorbar gives the count per hexagon
fig, ax = plt.subplots(figsize=(10, 6))
hexes = ax.hexbin(filtered_loans_df['annual_inc'], filtered_loans_df['loan_amnt'],
                  gridsize=50, cmap='Blues')
fig.colorbar(hexes, ax=ax, label='Count in bin')
ax.set_title('Annual Income vs. Loan Amount (Hexbin)')
ax.set_xlabel('Annual Income')
ax.set_ylabel('Loan Amount')
plt.show()