# LendingClub Credit Risk — EDA
**Dataset:** accepted_2007_to_2018q4 | **Rows:** ~2.26M

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Subset of columns read from the raw export: numeric credit attributes,
# a handful of categoricals, the issue date and the loan status that the
# binary target is derived from.
COLUMNS = [
    'loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti',
    'delinq_2yrs', 'fico_range_low', 'inq_last_6mths', 'open_acc',
    'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'mort_acc',
    'pub_rec_bankruptcies', 'grade', 'home_ownership', 'purpose',
    'emp_length', 'issue_d', 'loan_status'
]

df = pd.read_csv(
    '/kaggle/input/lending-club/accepted_2007_to_2018q4.csv',
    low_memory=False,
    # Callable form: keep a column only if it is listed in COLUMNS.
    usecols=lambda c: c in COLUMNS,
    # Skips file line index 1, i.e. the first line after the header.
    # NOTE(review): presumably a non-data row in this export — confirm,
    # otherwise this silently drops the first loan record.
    skiprows=lambda i: i == 1
)
# Discard rows in which every selected column is empty.
df = df.dropna(how='all').reset_index(drop=True)

# Loan statuses counted as the positive ("default") class; every other
# status, including a missing one, maps to 0.
TARGET_POSITIVE = [
    'Charged Off', 'Default', 'Late (31-120 days)',
    'Does not meet the credit policy. Status:Charged Off'
]
# Vectorised membership test instead of a per-row Python lambda; yields the
# same 0/1 int64 column (NaN statuses are not in the list, hence 0).
df['default'] = df['loan_status'].isin(TARGET_POSITIVE).astype('int64')
# Unparseable dates become NaT rather than raising.
df['issue_date'] = pd.to_datetime(df['issue_d'], format='%b-%Y', errors='coerce')

print(f'Shape: {df.shape}')
print(f'Default rate: {df["default"].mean():.2%}')
df.head()

## 1. Class Distribution

In [None]:
# Bar chart and pie chart of the binary target.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

class_labels = ['No Default', 'Default']
class_colors = ['#3fb950', '#f85149']

# value_counts() orders by frequency, not by class value, so pin the order
# to [0, 1] to guarantee each count lines up with its label and colour
# (and to keep two entries even if one class is absent).
counts = df['default'].value_counts().reindex([0, 1], fill_value=0)
axes[0].bar(class_labels, counts, color=class_colors)
for i, v in enumerate(counts):
    # Annotate each bar with its absolute count and share of all rows.
    axes[0].text(i, v + 5000, f'{v:,}\n({v/len(df):.1%})', ha='center')
axes[0].set_title('Class Distribution')
axes[0].set_ylabel('Count')

axes[1].pie(counts, labels=class_labels, colors=class_colors,
            autopct='%1.1f%%', startangle=90)
axes[1].set_title('Default Split')

plt.tight_layout()
plt.show()

## 2. Missing Values

In [None]:
# Percentage of null entries per column, largest first; columns without
# any nulls are left out of both the chart and the printout.
missing = df.isnull().mean().mul(100).sort_values(ascending=False)
missing = missing.loc[missing > 0]

plt.figure(figsize=(8, 4))
ax = plt.gca()
ax.barh(missing.index, missing.values, color='#58a6ff')
ax.set_xlabel('% Missing')
ax.set_title('Missing Values by Column')
plt.tight_layout()
plt.show()

print(missing.to_string())

## 3. Feature Distributions — Default vs Non-Default

In [None]:
# Numeric features compared between the two target classes.
FEATURES = ['int_rate', 'fico_range_low', 'dti', 'annual_inc', 'revol_util', 'loan_amnt']

fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.flatten()

d0 = df[df['default'] == 0]
d1 = df[df['default'] == 1]

# (rows of the class, colour, legend label), in drawing order.
class_groups = [
    (d0, '#3fb950', 'No Default'),
    (d1, '#f85149', 'Default'),
]

for ax, feat in zip(axes, FEATURES):
    # Values above the overall 99th percentile are clipped to it for plotting.
    cap = df[feat].quantile(0.99)

    # Overlaid density histograms, one per class.
    for subset, colour, label in class_groups:
        ax.hist(subset[feat].clip(upper=cap).dropna(), bins=50, alpha=0.55,
                color=colour, density=True, label=label)

    # Dashed vertical line at each class mean (computed on unclipped values).
    for subset, colour, _ in class_groups:
        ax.axvline(subset[feat].mean(), color=colour, linestyle='--', linewidth=1.5)

    ax.set_title(feat.replace('_', ' ').title())
    ax.legend(fontsize=8)

plt.suptitle('Feature Distributions: Default vs Non-Default', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.show()

## 4. Correlation with Target

In [None]:
# Pearson correlation of each numeric feature with the binary target,
# sorted ascending so the horizontal bars run from most negative to most
# positive.
num_cols = df[[*FEATURES, 'default']].select_dtypes(include=np.number)
target_corr = num_cols.corr()['default']
corr = target_corr.drop('default').sort_values()

# Red for positive correlation with default, green otherwise.
colors = []
for v in corr.values:
    if v > 0:
        colors.append('#f85149')
    else:
        colors.append('#3fb950')

plt.figure(figsize=(8, 5))
plt.barh(corr.index, corr.values, color=colors)
plt.axvline(0, color='gray', linewidth=0.8)
plt.title('Feature Correlation with Default')
plt.xlabel('Pearson Correlation with Default')
plt.tight_layout()
plt.show()

## 5. Categorical Features

In [None]:
# Default rate (in percent) broken down by three categorical features,
# one panel each.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
ax_grade, ax_home, ax_purpose = axes

# Panel 1: grade, in alphabetical order, coloured along a reversed
# red-yellow-green colormap.
grade_dr = df.groupby('grade')['default'].mean().sort_index()
grade_palette = plt.cm.RdYlGn_r(np.linspace(0.1, 0.9, len(grade_dr)))
ax_grade.bar(grade_dr.index, grade_dr.values * 100, color=grade_palette)
ax_grade.set_title('Default Rate by Grade')
ax_grade.set_ylabel('Default Rate (%)')

# Panel 2: home ownership, restricted to the three listed categories and
# sorted by default rate.
home_mask = df['home_ownership'].isin(['RENT','OWN','MORTGAGE'])
home_dr = df[home_mask].groupby('home_ownership')['default'].mean().sort_values()
ax_home.bar(home_dr.index, home_dr.values * 100, color='#58a6ff')
ax_home.set_title('Default Rate by Home Ownership')
ax_home.set_ylabel('Default Rate (%)')

# Panel 3: the six most frequent loan purposes, sorted by default rate.
top_purposes = df['purpose'].value_counts().nlargest(6).index
purpose_mask = df['purpose'].isin(top_purposes)
purp_dr = df[purpose_mask].groupby('purpose')['default'].mean().sort_values()
ax_purpose.barh(purp_dr.index, purp_dr.values * 100, color='#f0a84b')
ax_purpose.set_title('Default Rate by Purpose (Top 6)')
ax_purpose.set_xlabel('Default Rate (%)')

plt.suptitle('Categorical Feature Analysis', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.show()

## 6. Temporal Trends

In [None]:
# Loan volume and default rate per issue year, followed by the default
# rate on either side of a time-based train/test cutoff.
df['year'] = df['issue_date'].dt.year
yearly = df.groupby('year').agg(
    count        = ('default', 'count'),
    default_rate = ('default', 'mean')
).reset_index()
yearly = yearly[yearly['year'].between(2007, 2018)]

fig, axes = plt.subplots(1, 2, figsize=(13, 4))

axes[0].bar(yearly['year'], yearly['count'] / 1000, color='#58a6ff')
axes[0].set_title('Loan Volume by Year')
axes[0].set_ylabel('Loans (thousands)')
axes[0].set_xlabel('Year')

axes[1].plot(yearly['year'], yearly['default_rate'] * 100,
             color='#f85149', linewidth=2.5, marker='o')
axes[1].fill_between(yearly['year'], yearly['default_rate'] * 100, alpha=0.2, color='#f85149')
axes[1].set_title('Default Rate by Year')
axes[1].set_ylabel('Default Rate (%)')
axes[1].set_xlabel('Year')

plt.suptitle('Temporal Trends (2007-2018)', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.show()

cutoff = pd.Timestamp('2016-09-01')
# Derive the printed boundary from `cutoff` itself so the labels cannot
# drift from the actual split date (previously hard-coded as "2016").
cutoff_label = cutoff.strftime('%Y-%m')
# Rows with a missing issue_date compare False on both sides and are
# therefore excluded from both rates.
is_train = df['issue_date'] < cutoff
is_test = df['issue_date'] >= cutoff
print(f'Train default rate (< {cutoff_label}): {df.loc[is_train, "default"].mean():.2%}')
print(f'Test  default rate (>= {cutoff_label}): {df.loc[is_test, "default"].mean():.2%}')
print('Lower test rate = outcome truncation (2017-18 loans not yet resolved)')