# Feature Engineering
**Team**: Renaissance Technologies  
**Purpose**: Create strategic features for loan default prediction model

In [25]:
# 1. Load Dataset
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('main_dataset.csv')

print(f"Dataset loaded: {len(df):,} rows, {len(df.columns)} columns")
print(f"\nAvailable columns:")
for i, col in enumerate(df.columns, 1):
    print(f"  {i:2d}. {col}")

Dataset loaded: 89,999 rows, 64 columns

Available columns:
   1. customer_id
   2. application_id
   3. application_hour
   4. application_day_of_week
   5. account_open_year
   6. preferred_contact
   7. referral_code
   8. account_status_code
   9. random_noise_1
  10. num_login_sessions
  11. num_customer_service_calls
  12. has_mobile_app
  13. paperless_billing
  14. default
  15. age
  16. annual_income
  17. employment_length
  18. employment_type
  19. education
  20. marital_status
  21. num_dependents
  22. employment_length_missing
  23. credit_score
  24. num_credit_accounts
  25. oldest_credit_line_age
  26. oldest_account_age_months
  27. total_credit_limit
  28. num_delinquencies_2yrs
  29. num_inquiries_6mo
  30. recent_inquiry_count
  31. num_public_records
  32. num_collections
  33. account_diversity_index
  34. loan_type
  35. loan_amount
  36. loan_term
  37. interest_rate
  38. loan_purpose
  39. loan_to_value_ratio
  40. origination_channel
  41. loan_officer_id

In [26]:
# 2. Financial Health Indicators
print("Creating Financial Health Features...")
print("-" * 60)

new_features = []

# Income to loan ratio
if 'annual_income' in df.columns and 'loan_amount' in df.columns:
    df['income_to_loan_ratio'] = df['annual_income'] / (df['loan_amount'] + 1)
    new_features.append('income_to_loan_ratio')
    print("income_to_loan_ratio done")

# Debt to income percentage
if 'debt_to_income_ratio' in df.columns:
    df['debt_to_income_pct'] = df['debt_to_income_ratio'] * 100
    new_features.append('debt_to_income_pct')
    print("debt_to_income_pct done")

# Payment burden and available income
if 'existing_monthly_debt' in df.columns and 'monthly_income' in df.columns:
    df['payment_burden'] = df['existing_monthly_debt'] / (df['monthly_income'] + 1)
    df['available_income'] = df['monthly_income'] - df['existing_monthly_debt']
    new_features.extend(['payment_burden', 'available_income'])
    print("payment_burden done")
    print("available_income done")

# Income stability (2+ years employment)
if 'employment_length' in df.columns:
    df['income_stability'] = (df['employment_length'] > 2).astype(int)
    new_features.append('income_stability')
    print("income_stability done")

print(f"\n {len(new_features)} financial health features created")

Creating Financial Health Features...
------------------------------------------------------------
income_to_loan_ratio done
debt_to_income_pct done
payment_burden done
available_income done
income_stability done

 5 financial health features created


In [27]:
# 3. Credit Behavior Features
print("\nCreating Credit Behavior Features...")
print("-" * 60)

credit_features = []

# Credit score category
if 'credit_score' in df.columns:
    df['credit_score_category'] = pd.cut(df['credit_score'], 
                                          bins=[0, 580, 670, 740, 850],
                                          labels=['poor', 'fair', 'good', 'excellent'])
    credit_features.append('credit_score_category')
    print("credit_score_category done")

# High utilization flag - If you have a $10,000 credit card and owe $8,000, that's 80% utilization
credit_util_cols = [col for col in df.columns if 'credit' in col.lower() and 'utilization' in col.lower()]
if credit_util_cols:
    credit_util_col = credit_util_cols[0]
    df['high_utilization'] = (df[credit_util_col] > 0.7).astype(int)
    credit_features.append('high_utilization')
    print(f"high_utilization (using {credit_util_col}) done")
else:
    print("Skipped high_utilization (column not found)")

# Recent delinquency flag - Delinquency = Late payment (30+ days overdue)

delinq_cols = [col for col in df.columns if 'delinq' in col.lower()]
if delinq_cols:
    delinq_col = delinq_cols[0]
    df['recent_delinquency'] = (df[delinq_col] > 0).astype(int)
    credit_features.append('recent_delinquency')
    print(f"recent_delinquency (using {delinq_col}) done")
else:
    print("Skipped recent_delinquency (column not found)")

# Credit age in years - How long have you had credit accounts?
credit_age_cols = [col for col in df.columns if 'oldest' in col.lower() or 'credit_history' in col.lower()]
if credit_age_cols:
    credit_age_col = credit_age_cols[0]
    if 'month' in credit_age_col.lower():
        df['credit_age_years'] = df[credit_age_col] / 12
    else:
        df['credit_age_years'] = df[credit_age_col]
    credit_features.append('credit_age_years')
    print(f"credit_age_years (using {credit_age_col}) done")
else:
    print("Skipped credit_age_years (column not found)")

print(f"\nCreated {len(credit_features)} credit behavior features")


Creating Credit Behavior Features...
------------------------------------------------------------
credit_score_category done
high_utilization (using credit_utilization) done
recent_delinquency (using num_delinquencies_2yrs) done
credit_age_years (using oldest_credit_line_age) done

Created 4 credit behavior features


In [28]:
# 4. Loan Characteristics Features
print("\nCreating Loan Characteristics Features...")
print("-" * 60)

loan_features = []

# High interest flag - Flag anyone paying above the middle rate as high interest
if 'interest_rate' in df.columns:
    median_rate = df['interest_rate'].median()
    df['high_interest_flag'] = (df['interest_rate'] > median_rate).astype(int)
    loan_features.append('high_interest_flag')
    print(f"high_interest_flag (median rate: {median_rate:.2f}%)")

# Loan to income ratio
if 'loan_amount' in df.columns and 'annual_income' in df.columns:
    df['loan_to_income'] = df['loan_amount'] / (df['annual_income'] + 1)
    loan_features.append('loan_to_income')
    print("loan_to_income")

# Loan type indicators
if 'loan_type' in df.columns:
    unique_types = df['loan_type'].unique()
    print(f"  Available loan types: {', '.join(map(str, unique_types))}")
    
    df['is_mortgage'] = (df['loan_type'] == 'mortgage').astype(int)
    df['is_credit_card'] = (df['loan_type'] == 'credit_card').astype(int)
    loan_features.extend(['is_mortgage', 'is_credit_card'])
    print("is_mortgage")
    print("is_credit_card")

print(f"\nCreated {len(loan_features)} loan characteristic features")


Creating Loan Characteristics Features...
------------------------------------------------------------
high_interest_flag (median rate: 11.29%)
loan_to_income
  Available loan types: personal, mortgage, credit_card
is_mortgage
is_credit_card

Created 4 loan characteristic features


In [29]:
# 5. Risk Aggregate Score
print("\nCreating Risk Aggregate Score...")
print("-" * 60)

risk_components = []
risk_component_names = []

# Check each risk component
if 'high_utilization' in df.columns:
    risk_components.append(df['high_utilization'])
    risk_component_names.append('high_utilization')
    print("Including high_utilization")

if 'recent_delinquency' in df.columns:
    risk_components.append(df['recent_delinquency'])
    risk_component_names.append('recent_delinquency')
    print("Including recent_delinquency")

if 'debt_to_income_ratio' in df.columns:
    high_dti = (df['debt_to_income_ratio'] > 0.43).astype(int)
    risk_components.append(high_dti)
    risk_component_names.append('high_dti (>43%)')
    print("Including high_dti")

if 'credit_score' in df.columns:
    low_credit = (df['credit_score'] < 640).astype(int)
    risk_components.append(low_credit)
    risk_component_names.append('low_credit_score (<640)')
    print("Including low_credit_score")

# Create aggregate score
if len(risk_components) > 0:
    df['total_risk_score'] = sum(risk_components)
    print(f"\n total_risk_score (sum of {len(risk_components)} components)")
    print(f"  Components: {', '.join(risk_component_names)}")
    print(f"  Score range: 0 to {len(risk_components)}")
else:
    print("\n Could not create total_risk_score (no components available)")


Creating Risk Aggregate Score...
------------------------------------------------------------
Including high_utilization
Including recent_delinquency
Including high_dti
Including low_credit_score

 total_risk_score (sum of 4 components)
  Components: high_utilization, recent_delinquency, high_dti (>43%), low_credit_score (<640)
  Score range: 0 to 4


In [30]:
# Intermediate Feature Summary (before target encoding)
print("\n" + "="*70)
print("BASIC FEATURES CREATED")
print("="*70)

original_df = pd.read_csv('main_dataset.csv')
original_cols = original_df.shape[1]
current_cols = df.shape[1]
features_so_far = current_cols - original_cols

print(f"\nOriginal features:  {original_cols}")
print(f"New features:       {features_so_far}")
print(f"Current total:      {current_cols}")

# List basic features created
basic_features = [col for col in df.columns if col not in original_df.columns]
print(f"\nBasic Features Created ({len(basic_features)}):")
for i, feat in enumerate(basic_features, 1):
    dtype = df[feat].dtype
    print(f"  {i:2d}. {feat:30s} ({dtype})")

print(f"\n✓ Proceeding to advanced encoding...")


BASIC FEATURES CREATED

Original features:  64
New features:       14
Current total:      78

Basic Features Created (14):
   1. income_to_loan_ratio           (float64)
   2. debt_to_income_pct             (float64)
   3. payment_burden                 (float64)
   4. available_income               (float64)
   5. income_stability               (int32)
   6. credit_score_category          (category)
   7. high_utilization               (int32)
   8. recent_delinquency             (int32)
   9. credit_age_years               (float64)
  10. high_interest_flag             (int32)
  11. loan_to_income                 (float64)
  12. is_mortgage                    (int32)
  13. is_credit_card                 (int32)
  14. total_risk_score               (int32)

✓ Proceeding to advanced encoding...

Original features:  64
New features:       14
Current total:      78

Basic Features Created (14):
   1. income_to_loan_ratio           (float64)
   2. debt_to_income_pct             (float64)

## 6. Advanced Categorical Encoding (Target Encoding)

In [36]:
# Target encoding for high-cardinality categorical features
from sklearn.model_selection import StratifiedKFold

# Find categorical columns with moderate cardinality
target_encode_candidates = []
for col in df.select_dtypes(include=['object']).columns:
    n_unique = df[col].nunique()
    if 5 < n_unique <= 50 and col not in ['customer_id', 'application_id']:
        target_encode_candidates.append(col)

if len(target_encode_candidates) > 0 and 'default' in df.columns:
    print(f"Found {len(target_encode_candidates)} candidates: {', '.join(target_encode_candidates)}")
    
    # Use 5-fold cross-validation to avoid target leakage
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    for col in target_encode_candidates:
        # Initialize encoded column
        df[f'{col}_target_enc'] = 0.0
        global_mean = df['default'].mean()
        
        # Encode using CV to prevent leakage
        for train_idx, val_idx in kfold.split(df, df['default']):
            # Calculate mean target per category on training fold
            means = df.iloc[train_idx].groupby(col)['default'].mean()
            
            # Apply to validation fold with smoothing
            df.loc[val_idx, f'{col}_target_enc'] = df.loc[val_idx, col].map(means).fillna(global_mean)
        
        print(f"{col} -> {col}_target_enc")
    
    print(f"\nCreated {len(target_encode_candidates)} target-encoded features")
else:
    print("No suitable candidates for target encoding or target variable not found")

Found 3 candidates: loan_purpose, marketing_campaign, state
loan_purpose -> loan_purpose_target_enc
loan_purpose -> loan_purpose_target_enc
marketing_campaign -> marketing_campaign_target_enc
marketing_campaign -> marketing_campaign_target_enc
state -> state_target_enc

Created 3 target-encoded features
state -> state_target_enc

Created 3 target-encoded features


In [37]:
# 7. Final Summary & Export
original_df = pd.read_csv('main_dataset.csv')
original_cols = original_df.shape[1]
new_cols = df.shape[1]
created_features = new_cols - original_cols

print(f"\nOriginal features:  {original_cols}")
print(f"New features:       {created_features}")
print(f"Total features:     {new_cols}")

# List all new features by category
new_feature_names = [col for col in df.columns if col not in original_df.columns]

# Categorize features
financial = [f for f in new_feature_names if any(x in f.lower() for x in ['income', 'debt', 'payment', 'stability'])]
credit = [f for f in new_feature_names if any(x in f.lower() for x in ['credit', 'delinq', 'utilization'])]
loan = [f for f in new_feature_names if any(x in f.lower() for x in ['loan', 'interest', 'mortgage'])]
risk = [f for f in new_feature_names if 'risk' in f.lower()]
encoded = [f for f in new_feature_names if 'target_enc' in f]

print(f"\nFeature Breakdown by Category:")
print(f"  Financial Health:     {len(financial)} features")
print(f"  Credit Behavior:      {len(credit)} features")
print(f"  Loan Characteristics: {len(loan)} features")
print(f"  Risk Scores:          {len(risk)} features")
print(f"  Target Encoded:       {len(encoded)} features")

print(f"\nAll New Features ({len(new_feature_names)}):")
for i, feat in enumerate(new_feature_names, 1):
    dtype = df[feat].dtype
    category = "Financial" if feat in financial else "Credit" if feat in credit else "Loan" if feat in loan else "Risk" if feat in risk else "Encoded"
    print(f"  {i:2d}. {feat:35s} ({dtype}) - {category}")

# Check for missing values
print(f"\n Missing Value Check:")
missing_in_new = df[new_feature_names].isnull().sum()
if missing_in_new.sum() == 0:
    print("No missing values in new features")
else:
    for feat in new_feature_names:
        missing = df[feat].isnull().sum()
        if missing > 0:
            pct = (missing / len(df) * 100)
            print(f"{feat}: {missing:,} ({pct:.2f}%)")

# Save final dataset
df.to_csv('dataset_with_features.csv', index=False)
print(f"\n Dataset saved to 'dataset_with_features.csv'")
print(f"   Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")




Original features:  64
New features:       17
Total features:     81

Feature Breakdown by Category:
  Financial Health:     6 features
  Credit Behavior:      5 features
  Loan Characteristics: 5 features
  Risk Scores:          1 features
  Target Encoded:       3 features

All New Features (17):
   1. income_to_loan_ratio                (float64) - Financial
   2. debt_to_income_pct                  (float64) - Financial
   3. payment_burden                      (float64) - Financial
   4. available_income                    (float64) - Financial
   5. income_stability                    (int32) - Financial
   6. credit_score_category               (category) - Credit
   7. high_utilization                    (int32) - Credit
   8. recent_delinquency                  (int32) - Credit
   9. credit_age_years                    (float64) - Credit
  10. high_interest_flag                  (int32) - Loan
  11. loan_to_income                      (float64) - Financial
  12. is_mortgage  