In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
import re

import category_encoders as ce

# Set random seeds for reproducibility
np.random.seed(42)

# Set plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print(" Libraries imported successfully")

 Libraries imported successfully


In [2]:
accepted_loan = pd.read_csv('C:/Users/ayan.pathak\Desktop/lending club loan data project/Data/accepted_2007_to_2018Q4.csv/accepted_2007_to_2018Q4.csv',low_memory=False)

print(f"Shape: {accepted_loan.shape}")
print(f"\nSample of data:")

Shape: (2260699, 151)

Sample of data:


In [50]:
print(accepted_loan.head())

   loan_amnt  funded_amnt        term   int_rate  installment grade sub_grade  \
0     3600.0       3600.0   36 months  13.990000   123.029999     C        C4   
1    24700.0      24700.0   36 months  11.990000   820.280029     C        C1   
2    20000.0      20000.0   60 months  10.780000   432.660004     B        B4   
3    35000.0      35000.0   60 months  14.850000   829.900024     C        C5   
4    10400.0      10400.0   60 months  22.450001   289.910004     F        F1   

  emp_length home_ownership  annual_inc  ... pub_rec_bankruptcies tax_liens  \
0  10+ years       MORTGAGE     55000.0  ...                  0.0       0.0   
1  10+ years       MORTGAGE     65000.0  ...                  0.0       0.0   
2  10+ years       MORTGAGE     63000.0  ...                  0.0       0.0   
3  10+ years       MORTGAGE    110000.0  ...                  0.0       0.0   
4    3 years       MORTGAGE    104433.0  ...                  0.0       0.0   

  tot_hi_cred_lim total_bal_ex_mort  t

In [51]:
print(f"Memory usage: {accepted_loan.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

Memory usage: 733.02 MB


In [52]:
# Identify column types
float_cols = accepted_loan.select_dtypes(include=['float64']).columns.tolist()
object_cols = accepted_loan.select_dtypes(include=['object']).columns.tolist()

print(f"Float64 columns: {len(float_cols)}")
print(f"Object columns: {len(object_cols)}")

Float64 columns: 2
Object columns: 0


In [53]:
missing_summary = accepted_loan.isnull().sum()
high_missing = missing_summary[missing_summary > 0.5 * len(accepted_loan)]
print(f"Columns with >50% missing: {len(high_missing)}")
print("\nTop 10 columns with highest missing %:")
for col in high_missing.index[:10]:
    missing_pct = (accepted_loan[col].isnull().sum() / len(accepted_loan)) * 100
    print(f"{col}: {missing_pct:.1f}% missing")

Columns with >50% missing: 0

Top 10 columns with highest missing %:


In [7]:
def optimize_memory(df):
    """Aggressive memory optimization for large datasets"""
    initial_memory = df.memory_usage(deep=True).sum() / 1024**2
    
    # Downcast float64 to float32 (halves memory)
    float_cols = df.select_dtypes(include=['float64']).columns
    for col in float_cols:
        df[col] = pd.to_numeric(df[col], downcast='float')
    
    # Downcast objects to categories where appropriate
    for col in df.select_dtypes(include=['object']).columns:
        num_unique = df[col].nunique()
        if num_unique < 0.1 * len(df):  # Low cardinality
            df[col] = df[col].astype('category')
    
    final_memory = df.memory_usage(deep=True).sum() / 1024**2
    print(f"Memory reduced: {initial_memory:.2f}MB → {final_memory:.2f}MB ({((initial_memory-final_memory)/initial_memory)*100:.1f}%)")
    return df

accepted_loan = optimize_memory(accepted_loan)

Memory reduced: 6414.66MB → 1788.89MB (72.1%)


In [8]:
# Columns to drop immediately
immediate_drops = [
    'member_id',  # 100% missing
    'desc',       # 94.4% missing, text field
    'url',        # Unique URLs, not predictive
]
print("Preserving 'id' column for potential internal tracking")

# Check which exist and drop
existing_drops = [col for col in immediate_drops if col in accepted_loan.columns]
print(f"Dropping {len(existing_drops)} columns: {existing_drops}")
accepted_loan.drop(columns=existing_drops, inplace=True, errors='ignore')
print(f"New shape: {accepted_loan.shape}")

Preserving 'id' column for potential internal tracking
Dropping 3 columns: ['member_id', 'desc', 'url']
New shape: (2260699, 148)


In [9]:
# Identify columns that have direct counterparts in rejected dataset
potential_overlap = []
for col in accepted_loan.columns:
    col_lower = col.lower()
    # Look for feature name patterns that might match rejected loans
    if any(term in col_lower for term in ['loan_amnt', 'annual_inc', 'dti', 'zip', 'addr', 'emp', 'title', 'purpose']):
        potential_overlap.append(col)

print(f"\nPotential columns overlapping with rejected dataset: {len(potential_overlap)}")
print("Sample overlapping columns:", potential_overlap[:10])


Potential columns overlapping with rejected dataset: 11
Sample overlapping columns: ['loan_amnt', 'emp_title', 'emp_length', 'annual_inc', 'purpose', 'title', 'zip_code', 'addr_state', 'dti', 'annual_inc_joint']


In [10]:
date_columns = []
date_samples = {}

for col in accepted_loan.select_dtypes(include=['object', 'category']).columns:
    col_lower = col.lower()
    # Broader pattern matching for dates
    if any(pattern in col_lower for pattern in ['_d', 'date', 'issue', 'earliest', 'last', 'next', 'start', 'end']):
        non_null = accepted_loan[col].dropna()
        if not non_null.empty:
            sample = non_null.iloc[0]
            date_columns.append(col)
            date_samples[col] = sample

print(f"\nFound {len(date_columns)} potential date columns:")
for i, col in enumerate(date_columns[:15]):  # Show first 15
    print(f"{i+1}. {col}: Sample = '{date_samples[col]}'")


Found 11 potential date columns:
1. issue_d: Sample = 'Dec-2015'
2. earliest_cr_line: Sample = 'Aug-2003'
3. last_pymnt_d: Sample = 'Jan-2019'
4. next_pymnt_d: Sample = 'Apr-2019'
5. last_credit_pull_d: Sample = 'Mar-2019'
6. sec_app_earliest_cr_line: Sample = 'Feb-2005'
7. hardship_start_date: Sample = 'Sep-2017'
8. hardship_end_date: Sample = 'Dec-2017'
9. payment_plan_start_date: Sample = 'Oct-2017'
10. debt_settlement_flag_date: Sample = 'Nov-2017'
11. settlement_date: Sample = 'Sep-2017'


In [11]:
# Phase 1: Drop leakage columns
leakage_cols = [
    # Payment & performance
    'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
    'last_fico_range_high', 'last_fico_range_low',
    
    # Hardship
    'hardship_flag', 'hardship_type', 'hardship_reason', 'hardship_status',
    'deferral_term', 'hardship_amount', 'hardship_start_date',
    'hardship_end_date', 'payment_plan_start_date', 'hardship_length',
    'hardship_dpd', 'hardship_loan_status',
    'orig_projected_additional_accrued_interest',
    'hardship_payoff_balance_amount', 'hardship_last_payment_amount',
    
    # Settlement
    'debt_settlement_flag', 'debt_settlement_flag_date', 'settlement_status',
    'settlement_date', 'settlement_amount', 'settlement_percentage', 'settlement_term',
    
    # Recovery
    'recoveries', 'collection_recovery_fee',
    
    # Total payments (reveal outcomes)
    'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee'
]

# Remove only those that exist
existing_leakage = [col for col in leakage_cols if col in accepted_loan.columns]
print(f"Dropping {len(existing_leakage)} leakage columns")
accepted_loan.drop(columns=existing_leakage, inplace=True, errors='ignore')

# Phase 2: Drop very high missing columns (>80% threshold)
missing_pct = accepted_loan.isnull().sum() / len(accepted_loan)
high_missing_cols = missing_pct[missing_pct > 0.80].index.tolist()

# But be CAREFUL - keep some important even if missing
# Remove from high_missing those we want to keep
keep_important = ['mths_since_last_delinq', 'mths_since_last_record']  # Important for credit history
high_missing_cols = [col for col in high_missing_cols if col not in keep_important]

print(f"\nDropping {len(high_missing_cols)} columns with >80% missing")
print("First 10 high-missing columns to drop:", high_missing_cols[:10])
accepted_loan.drop(columns=high_missing_cols, inplace=True, errors='ignore')

# Phase 3: Drop redundant/calculated columns
redundant = [
    'funded_amnt_inv', 'out_prncp', 'out_prncp_inv',
    'pymnt_plan',  # Likely constant
    'collections_12_mths_ex_med'
]
existing_redundant = [col for col in redundant if col in accepted_loan.columns]
print(f"\nDropping {len(existing_redundant)} redundant columns: {existing_redundant}")
accepted_loan.drop(columns=existing_redundant, inplace=True, errors='ignore')

print(f"\nShape after pruning: {accepted_loan.shape}")

Dropping 35 leakage columns

Dropping 16 columns with >80% missing
First 10 high-missing columns to drop: ['annual_inc_joint', 'dti_joint', 'verification_status_joint', 'revol_bal_joint', 'sec_app_fico_range_low', 'sec_app_fico_range_high', 'sec_app_earliest_cr_line', 'sec_app_inq_last_6mths', 'sec_app_mort_acc', 'sec_app_open_acc']

Dropping 5 redundant columns: ['funded_amnt_inv', 'out_prncp', 'out_prncp_inv', 'pymnt_plan', 'collections_12_mths_ex_med']

Shape after pruning: (2260699, 92)


In [12]:
# Create is_joint_app flag BEFORE dropping joint columns
# Check if any joint application field has data
joint_indicator_cols = ['annual_inc_joint', 'dti_joint', 'verification_status_joint']
existing_joint_cols = [col for col in joint_indicator_cols if col in accepted_loan.columns]

if existing_joint_cols:
    # Create flag: 1 if ANY joint field is not null, else 0
    accepted_loan['is_joint_app'] = accepted_loan[existing_joint_cols[0]].notnull().astype(int)
    print(f"Joint applications: {accepted_loan['is_joint_app'].sum():,} ({accepted_loan['is_joint_app'].mean()*100:.1f}%)")
else:
    accepted_loan['is_joint_app'] = 0
    print("No joint application columns found, setting all to individual")

# Handle informative missingness for credit history columns
informative_missing_cols = [
    'mths_since_last_delinq',        # 51.2% missing = no recent delinquency
    'mths_since_last_major_derog',   # 74.3% missing = no major derogatory
    'mths_since_last_record',        # 84.1% missing = no public records
    'mths_since_recent_bc_dlq',      # Likely similar pattern
    'mths_since_recent_inq',         # Missing = no recent inquiries
    'mths_since_recent_revol_delinq' # Missing = no revolving delinquency
]

for col in informative_missing_cols:
    if col in accepted_loan.columns:
        missing_pct = accepted_loan[col].isnull().mean() * 100
        print(f"{col}: {missing_pct:.1f}% missing - imputing with large value")
        # Impute with 999 (or max+1) to indicate "never/very long ago"
        accepted_loan[col] = accepted_loan[col].fillna(999)

print(f"\nShape after informative missing handling: {accepted_loan.shape}")

No joint application columns found, setting all to individual
mths_since_last_delinq: 51.2% missing - imputing with large value
mths_since_last_major_derog: 74.3% missing - imputing with large value
mths_since_last_record: 84.1% missing - imputing with large value
mths_since_recent_bc_dlq: 77.0% missing - imputing with large value
mths_since_recent_inq: 13.1% missing - imputing with large value
mths_since_recent_revol_delinq: 67.3% missing - imputing with large value

Shape after informative missing handling: (2260699, 93)


In [13]:
def convert_mon_year_to_datetime(series):
    "Convert 'MMM-YYYY' format to datetime"
    try:
        return pd.to_datetime(series, format='%b-%Y')
    except:
        # Try other common formats
        return pd.to_datetime(series, errors='coerce')

# Date columns to process (application phase only - NO LEAKAGE)
application_dates = ['issue_d', 'earliest_cr_line']

date_conversions = {}
for date_col in application_dates:
    if date_col in accepted_loan.columns:
        print(f"\nProcessing {date_col}:")
        original_sample = accepted_loan[date_col].dropna().iloc[0] if not accepted_loan[date_col].dropna().empty else 'N/A'
        print(f"  Original format sample: '{original_sample}'")
        
        # Convert to datetime
        accepted_loan[f'{date_col}_dt'] = convert_mon_year_to_datetime(accepted_loan[date_col])
        
        # Create useful features
        if date_col == 'earliest_cr_line':
            # Calculate credit history length in months
            if 'issue_d_dt' in accepted_loan.columns:
                accepted_loan['credit_history_months'] = (
                    (accepted_loan['issue_d_dt'] - accepted_loan[f'{date_col}_dt']) 
                    / np.timedelta64(1, 'D')
                    / 30.4375
                ).round()
                print(f"  Created 'credit_history_months' feature")
        
        # Drop original string column
        accepted_loan.drop(columns=[date_col], inplace=True)
        
        # Report conversion success
        null_pct = accepted_loan[f'{date_col}_dt'].isnull().mean() * 100
        print(f"  Converted, {null_pct:.1f}% null after conversion")
        date_conversions[date_col] = null_pct

# Show date ranges
if 'issue_d_dt' in accepted_loan.columns:
    print(f"\nApplication Date Range:")
    print(f"  Start: {accepted_loan['issue_d_dt'].min()}")
    print(f"  End: {accepted_loan['issue_d_dt'].max()}")
    print(f"  Time span: {(accepted_loan['issue_d_dt'].max() - accepted_loan['issue_d_dt'].min()).days / 365:.1f} years")

print(f"\nShape after date processing: {accepted_loan.shape}")


Processing issue_d:
  Original format sample: 'Dec-2015'
  Converted, 0.0% null after conversion

Processing earliest_cr_line:
  Original format sample: 'Aug-2003'
  Created 'credit_history_months' feature
  Converted, 0.0% null after conversion

Application Date Range:
  Start: 2007-06-01 00:00:00
  End: 2018-12-01 00:00:00
  Time span: 11.5 years

Shape after date processing: (2260699, 94)


In [15]:
# Analyze remaining missing values
missing_analysis = accepted_loan.isnull().sum()
remaining_missing = missing_analysis[missing_analysis > 0]

print(f"Columns with any missing values: {len(remaining_missing)}")
print("\nMissing value summary (sorted by % missing):")
for col in remaining_missing.index:
    missing_pct = (accepted_loan[col].isnull().sum() / len(accepted_loan)) * 100
    dtype = accepted_loan[col].dtype
    if missing_pct > 5:  # Show only >5% missing
        print(f"{col:30} {str(dtype):10} {missing_pct:6.1f}% missing")

# Calculate overall missingness
total_cells = accepted_loan.shape[0] * accepted_loan.shape[1]
missing_cells = accepted_loan.isnull().sum().sum()
overall_missing_pct = (missing_cells / total_cells) * 100
print(f"\nOverall missingness: {overall_missing_pct:.2f}%")
print(f"Missing cells: {missing_cells:,} / {total_cells:,}")

Columns with any missing values: 86

Missing value summary (sorted by % missing):
emp_title                      object        7.4% missing
emp_length                     category      6.5% missing
open_acc_6m                    float32      38.3% missing
open_act_il                    float32      38.3% missing
open_il_12m                    float32      38.3% missing
open_il_24m                    float32      38.3% missing
mths_since_rcnt_il             float32      40.3% missing
total_bal_il                   float32      38.3% missing
il_util                        float32      47.3% missing
open_rv_12m                    float32      38.3% missing
open_rv_24m                    float32      38.3% missing
max_bal_bc                     float32      38.3% missing
all_util                       float32      38.3% missing
inq_fi                         float32      38.3% missing
total_cu_tl                    float32      38.3% missing
inq_last_12m                   float32      38.3

In [18]:
# Drop the unnecessary joint app flag
if 'is_joint_app' in accepted_loan.columns:
    if accepted_loan['is_joint_app'].sum() == 0:
        accepted_loan.drop(columns=['is_joint_app'], inplace=True)
        print("Dropped 'is_joint_app' column.")

In [None]:
# 1. Handle the 38.3% missing pattern columns (likely introduced later)
print("\nHandling features with temporal introduction pattern (38.3% missing):")
temporal_pattern_cols = [
    'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m',
    'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m',
    'max_bal_bc', 'all_util', 'inq_fi', 'total_cu_tl', 'inq_last_12m'
]

# Check which exist
existing_temporal = [col for col in temporal_pattern_cols if col in accepted_loan.columns]
print(f"Found {len(existing_temporal)} temporal pattern columns")

# For these, missing likely means "not collected yet" = pre-2014 loans
# Impute with 0 (assuming no activity/accounts for these features pre-introduction)
for col in existing_temporal:
    if accepted_loan[col].dtype in ['float32', 'float64', 'int32', 'int64']:
        accepted_loan[col] = accepted_loan[col].fillna(0)
        print(f"  {col}: Imputed {accepted_loan[col].isnull().mean()*100:.1f}% missing with 0")


1. Handling features with temporal introduction pattern (38.3% missing):
Found 13 temporal pattern columns
  open_acc_6m: Imputed 0.0% missing with 0
  open_act_il: Imputed 0.0% missing with 0
  open_il_12m: Imputed 0.0% missing with 0
  open_il_24m: Imputed 0.0% missing with 0
  total_bal_il: Imputed 0.0% missing with 0
  il_util: Imputed 0.0% missing with 0
  open_rv_12m: Imputed 0.0% missing with 0
  open_rv_24m: Imputed 0.0% missing with 0
  max_bal_bc: Imputed 0.0% missing with 0
  all_util: Imputed 0.0% missing with 0
  inq_fi: Imputed 0.0% missing with 0
  total_cu_tl: Imputed 0.0% missing with 0
  inq_last_12m: Imputed 0.0% missing with 0


In [None]:
if 'mths_since_rcnt_il' in accepted_loan.columns:
    # Missing means "no recent installment loan" - impute with large value
    accepted_loan['mths_since_rcnt_il'] = accepted_loan['mths_since_rcnt_il'].fillna(999)
    print(f"\nmths_since_rcnt_il: Imputed with 999 (no recent installment loan)")


2. mths_since_rcnt_il: Imputed with 999 (no recent installment loan)


In [21]:
# 3. Handle emp_title and emp_length (text/category columns)
print("\nHandling employment information:")
if 'emp_title' in accepted_loan.columns:
    accepted_loan['emp_title'] = accepted_loan['emp_title'].fillna('Not Provided')
    print(f"  emp_title: Imputed missing as 'Not Provided'")

if 'emp_length' in accepted_loan.columns:
    # Check if it's category or object
    if accepted_loan['emp_length'].dtype.name == 'category':
        accepted_loan['emp_length'] = accepted_loan['emp_length'].cat.add_categories(['Not Provided'])
    accepted_loan['emp_length'] = accepted_loan['emp_length'].fillna('Not Provided')
    print(f"  emp_length: Imputed missing as 'Not Provided'")


Handling employment information:
  emp_title: Imputed missing as 'Not Provided'
  emp_length: Imputed missing as 'Not Provided'


In [22]:
print("\nHandling remaining low missingness columns:")
low_missing_cols = ['mo_sin_old_il_acct', 'num_tl_120dpd_2m']
for col in low_missing_cols:
    if col in accepted_loan.columns:
        if accepted_loan[col].dtype in ['float32', 'float64', 'int32', 'int64']:
            # Median imputation
            median_val = accepted_loan[col].median()
            accepted_loan[col] = accepted_loan[col].fillna(median_val)
            print(f"  {col}: Imputed {accepted_loan[col].isnull().mean()*100:.1f}% missing with median {median_val:.2f}")

print(f"\nShape after strategic imputation: {accepted_loan.shape}")


Handling remaining low missingness columns:
  mo_sin_old_il_acct: Imputed 0.0% missing with median 130.00
  num_tl_120dpd_2m: Imputed 0.0% missing with median 0.00

Shape after strategic imputation: (2260699, 93)


In [23]:
# Check if any missing values remain
remaining_missing = accepted_loan.isnull().sum().sum()
if remaining_missing == 0:
    print("No missing values remaining!")
else:
    print(f"{remaining_missing} missing values still exist")
    missing_cols = accepted_loan.isnull().sum()
    missing_cols = missing_cols[missing_cols > 0]
    print("\nColumns with remaining missing values:")
    for col, count in missing_cols.items():
        print(f"  {col}: {count} missing ({count/len(accepted_loan)*100:.2f}%)")

2123950 missing values still exist

Columns with remaining missing values:
  loan_amnt: 31 missing (0.00%)
  funded_amnt: 31 missing (0.00%)
  term: 31 missing (0.00%)
  int_rate: 31 missing (0.00%)
  installment: 31 missing (0.00%)
  grade: 31 missing (0.00%)
  sub_grade: 31 missing (0.00%)
  home_ownership: 31 missing (0.00%)
  annual_inc: 35 missing (0.00%)
  verification_status: 31 missing (0.00%)
  loan_status: 31 missing (0.00%)
  purpose: 31 missing (0.00%)
  title: 23357 missing (1.03%)
  zip_code: 32 missing (0.00%)
  addr_state: 31 missing (0.00%)
  dti: 1742 missing (0.08%)
  delinq_2yrs: 60 missing (0.00%)
  fico_range_low: 31 missing (0.00%)
  fico_range_high: 31 missing (0.00%)
  inq_last_6mths: 61 missing (0.00%)
  open_acc: 60 missing (0.00%)
  pub_rec: 60 missing (0.00%)
  revol_bal: 31 missing (0.00%)
  revol_util: 1833 missing (0.08%)
  total_acc: 60 missing (0.00%)
  initial_list_status: 31 missing (0.00%)
  policy_code: 31 missing (0.00%)
  application_type: 31 mis

In [24]:
# Analyze categorical columns for high cardinality
categorical_cols = accepted_loan.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Found {len(categorical_cols)} categorical columns:")
for col in categorical_cols:
    unique_count = accepted_loan[col].nunique()
    col_type = accepted_loan[col].dtype
    if unique_count < 20:  # Show distribution for low-cardinality
        print(f"\n{col} ({col_type}, {unique_count} unique):")
        value_counts = accepted_loan[col].value_counts(normalize=True).head(5)
        for val, pct in value_counts.items():
            print(f"  {val}: {pct*100:.1f}%")
    else:
        print(f"{col}: {col_type}, {unique_count} unique values")

Found 16 categorical columns:
id: object, 2260699 unique values

term (category, 2 unique):
   36 months: 71.2%
   60 months: 28.8%

grade (category, 7 unique):
  B: 29.4%
  C: 28.8%
  A: 19.2%
  D: 14.4%
  E: 6.0%
sub_grade: category, 35 unique values
emp_title: object, 512695 unique values

emp_length (category, 12 unique):
  10+ years: 33.1%
  2 years: 9.0%
  < 1 year: 8.4%
  3 years: 8.0%
  1 year: 6.6%

home_ownership (category, 6 unique):
  MORTGAGE: 49.2%
  RENT: 39.6%
  OWN: 11.2%
  ANY: 0.0%
  OTHER: 0.0%

verification_status (category, 3 unique):
  Source Verified: 39.2%
  Not Verified: 32.9%
  Verified: 27.9%

loan_status (category, 9 unique):
  Fully Paid: 47.6%
  Current: 38.9%
  Charged Off: 11.9%
  Late (31-120 days): 0.9%
  In Grace Period: 0.4%

purpose (category, 14 unique):
  debt_consolidation: 56.5%
  credit_card: 22.9%
  home_improvement: 6.7%
  other: 6.2%
  major_purchase: 2.2%
title: category, 63154 unique values
zip_code: category, 956 unique values
addr_state

In [26]:
# Check for constant columns
constant_cols = []
for col in accepted_loan.columns:
    if accepted_loan[col].nunique() == 1:
        constant_cols.append(col)

if constant_cols:
    print(f"Found {len(constant_cols)} constant columns: {constant_cols}")
else:
    print("No constant columns found")

# Drop constant column
if 'policy_code' in accepted_loan.columns:
    accepted_loan.drop(columns=['policy_code'], inplace=True)
    print(f"Dropped constant column: policy_code")

Found 1 constant columns: ['policy_code']
Dropped constant column: policy_code


In [27]:
high_cardinality_drops = ['emp_title', 'title', 'id', 'zip_code']  # zip_code has 956 unique, too many
existing_high_card = [col for col in high_cardinality_drops if col in accepted_loan.columns]
accepted_loan.drop(columns=existing_high_card, inplace=True, errors='ignore')
print(f"Dropped high-cardinality columns: {existing_high_card}")

Dropped high-cardinality columns: ['emp_title', 'title', 'id', 'zip_code']


In [29]:
# Handle the 3.1% missing pattern (likely same records)
print("\nIdentifying records with multiple missing values...")
# Count missing per row
missing_per_row = accepted_loan.isnull().sum(axis=1)
high_missing_rows = missing_per_row[missing_per_row > 5]  # Rows missing >5 values
print(f"Rows missing >5 values: {len(high_missing_rows)} ({len(high_missing_rows)/len(accepted_loan)*100:.2f}%)")
print(f"Max missing per row: {missing_per_row.max()}")


Identifying records with multiple missing values...
Rows missing >5 values: 70375 (3.11%)
Max missing per row: 65


In [30]:
# Group columns by type for appropriate imputation
float_cols = accepted_loan.select_dtypes(include=['float32', 'float64']).columns.tolist()
cat_cols = accepted_loan.select_dtypes(include=['object', 'category']).columns.tolist()

# For float columns: median imputation
for col in float_cols:
    if accepted_loan[col].isnull().any():
        missing_pct = accepted_loan[col].isnull().mean() * 100
        if missing_pct < 5:  # Only for low missing
            median_val = accepted_loan[col].median()
            accepted_loan[col] = accepted_loan[col].fillna(median_val)
            print(f"  {col}: {missing_pct:.2f}% → median {median_val:.2f}")

# For categorical columns: mode imputation
for col in cat_cols:
    if accepted_loan[col].isnull().any():
        missing_pct = accepted_loan[col].isnull().mean() * 100
        if missing_pct < 5:
            mode_val = accepted_loan[col].mode()[0] if not accepted_loan[col].mode().empty else 'Unknown'
            accepted_loan[col] = accepted_loan[col].fillna(mode_val)
            print(f"  {col}: {missing_pct:.2f}% → mode '{mode_val}'")

  loan_amnt: 0.00% → median 12900.00
  funded_amnt: 0.00% → median 12875.00
  int_rate: 0.00% → median 12.62
  installment: 0.00% → median 377.99
  annual_inc: 0.00% → median 65000.00
  dti: 0.08% → median 17.84
  delinq_2yrs: 0.00% → median 0.00
  fico_range_low: 0.00% → median 690.00
  fico_range_high: 0.00% → median 694.00
  inq_last_6mths: 0.00% → median 0.00
  open_acc: 0.00% → median 11.00
  pub_rec: 0.00% → median 0.00
  revol_bal: 0.00% → median 11324.00
  revol_util: 0.08% → median 50.30
  total_acc: 0.00% → median 22.00
  acc_now_delinq: 0.00% → median 0.00
  tot_coll_amt: 3.11% → median 0.00
  tot_cur_bal: 3.11% → median 79240.00
  total_rev_hi_lim: 3.11% → median 25400.00
  acc_open_past_24mths: 2.21% → median 4.00
  avg_cur_bal: 3.11% → median 7335.00
  bc_open_to_buy: 3.32% → median 5442.00
  bc_util: 3.37% → median 60.20
  chargeoff_within_12_mths: 0.01% → median 0.00
  delinq_amnt: 0.00% → median 0.00
  mo_sin_old_rev_tl_op: 3.11% → median 164.00
  mo_sin_rcnt_rev_tl_op

In [32]:
# 5. Check remaining missing
remaining_missing = accepted_loan.isnull().sum().sum()
print(f"\nRemaining missing values after imputation: {remaining_missing}")
if remaining_missing > 0:
    # For any remaining missing, drop those rows (very few)
    initial_shape = accepted_loan.shape
    accepted_loan = accepted_loan.dropna()
    print(f"Dropped {initial_shape[0] - accepted_loan.shape[0]} rows with missing values")

print(f"\nFinal shape: {accepted_loan.shape}")
print(f"Total missing values: {accepted_loan.isnull().sum().sum()}")


Remaining missing values after imputation: 91
Dropped 60 rows with missing values

Final shape: (2260639, 88)
Total missing values: 0


In [37]:
cat_cols = accepted_loan.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Categorical columns to encode: {len(cat_cols)}")

Categorical columns to encode: 12


In [38]:
encoding_plan = {}

for col in cat_cols:
    unique_count = accepted_loan[col].nunique()
    value_dist = accepted_loan[col].value_counts(normalize=True)
    
    print(f"\n{col}:")
    print(f"  Unique values: {unique_count}")
    print(f"  Top 3 values:")
    for val, pct in value_dist.head(3).items():
        print(f"    '{val}': {pct*100:.1f}%")
    
    # Determine encoding strategy
    if unique_count == 2:
        encoding_plan[col] = 'binary'
        print(f"  Strategy: Binary encoding (0/1)")
    elif unique_count <= 10:
        encoding_plan[col] = 'one-hot'
        print(f"  Strategy: One-hot encoding")
    elif unique_count <= 50:
        encoding_plan[col] = 'target_encode'
        print(f"  Strategy: Target encoding (with regularization)")
    else:
        encoding_plan[col] = 'frequency'
        print(f"  Strategy: Frequency encoding")


term:
  Unique values: 2
  Top 3 values:
    ' 36 months': 71.2%
    ' 60 months': 28.8%
  Strategy: Binary encoding (0/1)

grade:
  Unique values: 7
  Top 3 values:
    'B': 29.4%
    'C': 28.8%
    'A': 19.2%
  Strategy: One-hot encoding

sub_grade:
  Unique values: 35
  Top 3 values:
    'C1': 6.5%
    'B5': 6.2%
    'B4': 6.2%
  Strategy: Target encoding (with regularization)

emp_length:
  Unique values: 12
  Top 3 values:
    '10+ years': 33.1%
    '2 years': 9.0%
    '< 1 year': 8.4%
  Strategy: Target encoding (with regularization)

home_ownership:
  Unique values: 6
  Top 3 values:
    'MORTGAGE': 49.2%
    'RENT': 39.6%
    'OWN': 11.2%
  Strategy: One-hot encoding

verification_status:
  Unique values: 3
  Top 3 values:
    'Source Verified': 39.2%
    'Not Verified': 32.9%
    'Verified': 27.9%
  Strategy: One-hot encoding

loan_status:
  Unique values: 9
  Top 3 values:
    'Fully Paid': 47.6%
    'Current': 38.9%
    'Charged Off': 11.9%
  Strategy: One-hot encoding

pur

In [39]:
print("\nDatetime columns:")
dt_cols = [col for col in accepted_loan.columns if '_dt' in col]
for col in dt_cols:
    print(f"  {col}: Keep as datetime for temporal features")


Datetime columns:
  issue_d_dt: Keep as datetime for temporal features
  earliest_cr_line_dt: Keep as datetime for temporal features


In [40]:
exclude_from_modeling = ['issue_d_dt', 'earliest_cr_line_dt', 'loan_status']  # loan_status for phase 2

model_features = [col for col in accepted_loan.columns if col not in exclude_from_modeling]
print(f"Total features for modeling: {len(model_features)}")
print(f"Features to encode: {[col for col in model_features if col in cat_cols]}")

Total features for modeling: 85
Features to encode: ['term', 'grade', 'sub_grade', 'emp_length', 'home_ownership', 'verification_status', 'purpose', 'addr_state', 'initial_list_status', 'application_type', 'disbursement_method']


In [43]:
# Save the preprocessed data for encoding phase
accepted_loan.to_parquet('accepted_loan_preprocessed.parquet', index=False, engine='fastparquet')
print("Saved to 'accepted_loan_preprocessed.parquet'")

Saved to 'accepted_loan_preprocessed.parquet'


In [45]:
# 1. For tree-based models (Isolation Forest): Can handle some categorical encoding
print("1. Preparing data for tree-based models...")

# Create a copy for tree-based models
tree_data = accepted_loan.copy()

# Binary encode 2-value columns
binary_cols = [col for col in encoding_plan.keys() if encoding_plan[col] == 'binary']
for col in binary_cols:
    if col in tree_data.columns:
        le = LabelEncoder()
        tree_data[col] = le.fit_transform(tree_data[col].astype(str))
        print(f"  Binary encoded: {col} → {le.classes_}")

# Ordinal encode emp_length (has natural order)
if 'emp_length' in tree_data.columns:
    emp_length_order = ['< 1 year', '1 year', '2 years', '3 years', '4 years', 
                        '5 years', '6 years', '7 years', '8 years', '9 years', '10+ years']
    # Map to ordered categories
    tree_data['emp_length'] = pd.Categorical(tree_data['emp_length'], 
                                             categories=emp_length_order, 
                                             ordered=True)
    tree_data['emp_length_encoded'] = tree_data['emp_length'].cat.codes
    tree_data.drop(columns=['emp_length'], inplace=True)
    print(f"  Ordinal encoded: emp_length")

print(f"Tree-data shape: {tree_data.shape}")

1. Preparing data for tree-based models...
  Binary encoded: term → [' 36 months' ' 60 months']
  Binary encoded: initial_list_status → ['f' 'w']
  Binary encoded: application_type → ['Individual' 'Joint App']
  Binary encoded: disbursement_method → ['Cash' 'DirectPay']
  Ordinal encoded: emp_length
Tree-data shape: (2260639, 88)


In [47]:
# For distance-based models (LOF, SVM): Need proper encoding
print("\nPreparing data for distance-based models...")

# Select features for distance-based models
distance_features = model_features.copy()
# Remove high-cardinality categoricals for distance models
high_card_remove = ['sub_grade', 'addr_state']  # Too many categories
distance_features = [f for f in distance_features if f not in high_card_remove]

# Create dataset with limited categoricals
distance_data = accepted_loan[distance_features].copy()

# One-hot encode low-cardinality categoricals
low_card_cats = [col for col in distance_features 
                 if col in encoding_plan.keys() 
                 and encoding_plan[col] == 'one-hot'
                 and col not in ['sub_grade', 'addr_state']]

print(f"One-hot encoding {len(low_card_cats)} columns: {low_card_cats}")

# We'll do one-hot encoding separately in modeling phase to avoid memory issues
print("Note: One-hot encoding will be done during model training to avoid memory blow-up")

# Save both versions
tree_data.to_parquet('accepted_loan_tree_ready.parquet', index=False, engine='fastparquet')
distance_data.to_parquet('accepted_loan_distance_ready.parquet', index=False, engine='fastparquet')

print(f"\nSaved tree-ready data: {tree_data.shape}")
print(f"Saved distance-ready data: {distance_data.shape}")
print("\nPREPROCESSING COMPLETE!")


Preparing data for distance-based models...
One-hot encoding 3 columns: ['grade', 'home_ownership', 'verification_status']
Note: One-hot encoding will be done during model training to avoid memory blow-up

Saved tree-ready data: (2260639, 88)
Saved distance-ready data: (2260639, 83)

PREPROCESSING COMPLETE!
