# 🏦 Home Credit Default Risk
## Notebook 2: EDA & Preprocessing
**Nama:** [Faisal Soultan Muhammad]

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Setup: select the matplotlib style sheet and the seaborn colour palette,
# then print a marker line to confirm the cell ran.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
print("‚úÖ Notebook 2: EDA Started")

‚úÖ Notebook 2: EDA Started


## 1. Load Saved Data

In [2]:
import pickle

# Read back the two artifacts written by Notebook 1, in the order listed.
# NOTE: pickle.load can execute arbitrary code, so only open files you
# produced yourself.
_loaded = []
for pkl_path in ('app_train.pkl', 'metadata.pkl'):
    with open(pkl_path, 'rb') as f:
        _loaded.append(pickle.load(f))
app_train, metadata = _loaded

# Short confirmation of what was loaded.
print("üìä DATA LOADED:")
loaded_shape = app_train.shape
print(f"‚Ä¢ Shape: {loaded_shape}")
target_counts = metadata['target_counts']
print(f"‚Ä¢ Target distribution: {target_counts}")

üìä DATA LOADED:
‚Ä¢ Shape: (307511, 122)
‚Ä¢ Target distribution: {0: 282686, 1: 24825}


## 2. EDA Function

In [7]:
def fast_eda(df, target_col='TARGET', missing_threshold=50):
    """Print a quick EDA summary of ``df`` and return the key findings.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise. It is only read, never modified.
    target_col : str, default 'TARGET'
        Column whose mean is reported as the default rate. The target step is
        skipped when the column is not present.
    missing_threshold : float, default 50
        Percentage of missing values above which (strictly greater than) a
        column is listed in ``high_missing``.

    Returns
    -------
    dict
        ``'target_mean'`` (only when ``target_col`` exists),
        ``'high_missing'`` (column names above the threshold),
        ``'num_cols'`` (numeric column names, including the target if numeric)
        and ``'cat_cols'`` (object-dtype column names).
    """
    print("üîç EDA ANALYSIS")
    print("="*60)

    results = {}

    # 1. Target analysis
    if target_col in df.columns:
        target_mean = df[target_col].mean()
        print(f"1. Target mean: {target_mean:.3f} ({target_mean*100:.1f}% default)")
        results['target_mean'] = target_mean

    # 2. Missing values: percentage of nulls per column. Columns above the
    #    threshold are only reported here; the caller decides whether to drop.
    missing_pct = df.isnull().mean() * 100
    high_missing = missing_pct[missing_pct > missing_threshold].index.tolist()

    print(f"\n2. Missing values:")
    print(f"   ‚Ä¢ Columns >{missing_threshold}% missing: {len(high_missing)}")
    if high_missing:
        if len(high_missing) > 3:
            # Long lists are truncated to the first three names.
            print(f"   ‚Ä¢ Will drop: {high_missing[:3]}...")
        else:
            print(f"   ‚Ä¢ Will drop: {high_missing}")

    # 3. Column types
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = df.select_dtypes(include=['object']).columns.tolist()

    print(f"\n3. Data types:")
    print(f"   ‚Ä¢ Numerical: {len(num_cols)} columns")
    print(f"   ‚Ä¢ Categorical: {len(cat_cols)} columns")

    results.update({
        'high_missing': high_missing,
        'num_cols': num_cols,
        'cat_cols': cat_cols
    })

    return results

# Run the EDA summary on the data loaded from Notebook 1. The returned dict is
# kept because the cleaning cell reads eda_results['high_missing'].
eda_results = fast_eda(app_train)

üîç EDA ANALYSIS
1. Target mean: 0.081 (8.1% default)

2. Missing values:
   ‚Ä¢ Columns >50% missing: 41
   ‚Ä¢ Will drop: ['OWN_CAR_AGE', 'EXT_SOURCE_1', 'APARTMENTS_AVG']...

3. Data types:
   ‚Ä¢ Numerical: 106 columns
   ‚Ä¢ Categorical: 16 columns


## 3. Data Cleaning 

In [8]:
print("\n" + "="*60)
print("üßπ DATA CLEANING")
print("="*60)

# Work on a copy so app_train stays intact and this cell can be re-run safely.
df = app_train.copy()

# 1. Drop the columns that fast_eda reported as having >50% missing values.
cols_to_drop = eda_results['high_missing']
if cols_to_drop:
    df = df.drop(columns=cols_to_drop)
    print(f"‚úÖ Dropped {len(cols_to_drop)} columns with >50% missing")

# 2. Fill missing numerical values with the column median (TARGET is excluded).
# NOTE(review): medians and modes are computed on the full dataset, i.e. before
# the train/test split in section 7, so test rows influence the fill values.
num_cols = [col for col in df.select_dtypes(include=[np.number]).columns if col != 'TARGET']
df[num_cols] = df[num_cols].fillna(df[num_cols].median())
print(f"‚úÖ Filled missing numerical values with median")

# 3. Fill missing categorical values with the most frequent value.
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    # mode() scans the whole column, so compute it once and reuse the result.
    col_modes = df[col].mode()
    # An all-missing column has no mode; fall back to a placeholder label.
    fill_value = col_modes.iloc[0] if not col_modes.empty else 'Unknown'
    df[col] = df[col].fillna(fill_value)
print(f"‚úÖ Filled missing categorical values with mode")

# Check remaining missing
remaining_missing = df.isnull().sum().sum()
print(f"‚úÖ Remaining missing values: {remaining_missing}")


üßπ DATA CLEANING
‚úÖ Dropped 41 columns with >50% missing
‚úÖ Filled missing numerical values with median
‚úÖ Filled missing categorical values with mode
‚úÖ Remaining missing values: 0


## 4. Categorical Encoding - SIMPLE

In [9]:
print("\n" + "="*60)
print("üî§ CATEGORICAL ENCODING")
print("="*60)

# Guard against running this cell twice on the same df. After the first run the
# columns hold integer codes; encoding them again would turn the codes into the
# strings '0', '1', '10', '11', ... which sort lexicographically, silently
# permuting the codes and replacing the fitted encoders with useless ones.
already_encoded = [col for col in cat_cols if pd.api.types.is_numeric_dtype(df[col])]
if already_encoded:
    raise RuntimeError(
        f"Columns already label-encoded (e.g. {already_encoded[:3]}). "
        "Re-run the 'Data Cleaning' cell to rebuild df before encoding again."
    )

# Label encoding for all categorical columns (fastest approach).
# NOTE(review): integer codes imply an ordering that nominal categories do not
# have; this matters for the correlation ranking and scaling done later.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    # Keep the fitted encoder so the integer codes can be mapped back to labels.
    label_encoders[col] = le
    print(f"‚Ä¢ Encoded: {col} ({df[col].nunique()} unique values)")

print(f"\n‚úÖ Encoded {len(cat_cols)} categorical columns")


üî§ CATEGORICAL ENCODING
‚Ä¢ Encoded: NAME_CONTRACT_TYPE (2 unique values)
‚Ä¢ Encoded: CODE_GENDER (3 unique values)
‚Ä¢ Encoded: FLAG_OWN_CAR (2 unique values)
‚Ä¢ Encoded: FLAG_OWN_REALTY (2 unique values)
‚Ä¢ Encoded: NAME_TYPE_SUITE (7 unique values)
‚Ä¢ Encoded: NAME_INCOME_TYPE (8 unique values)
‚Ä¢ Encoded: NAME_EDUCATION_TYPE (5 unique values)
‚Ä¢ Encoded: NAME_FAMILY_STATUS (6 unique values)
‚Ä¢ Encoded: NAME_HOUSING_TYPE (6 unique values)
‚Ä¢ Encoded: OCCUPATION_TYPE (18 unique values)
‚Ä¢ Encoded: WEEKDAY_APPR_PROCESS_START (7 unique values)
‚Ä¢ Encoded: ORGANIZATION_TYPE (58 unique values)
‚Ä¢ Encoded: EMERGENCYSTATE_MODE (2 unique values)

‚úÖ Encoded 13 categorical columns


## 5. Handle Class Imbalance

In [6]:
print("\n" + "="*60)
print("‚öñÔ∏è HANDLING CLASS IMBALANCE")
print("="*60)

# Class weights are computed here instead of resampling the data.
from sklearn.utils.class_weight import compute_class_weight

y = df['TARGET']
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
# Position i of class_weights is stored under key i for the two classes.
weight_dict = {label: class_weights[label] for label in (0, 1)}

print(f"Class weights computed:")
for label, description in ((0, "Non-Default"), (1, "Default")):
    print(f"‚Ä¢ Class {label} ({description}): weight = {weight_dict[label]:.2f}")


‚öñÔ∏è HANDLING CLASS IMBALANCE
Class weights computed:
‚Ä¢ Class 0 (Non-Default): weight = 0.54
‚Ä¢ Class 1 (Default): weight = 6.19


## 6. Feature Selection

In [10]:
print("\n" + "="*60)
print("üéØ QUICK FEATURE SELECTION")
print("="*60)

N_TOP_FEATURES = 30   # number of features kept for modelling
N_SHOWN = 10          # number of features listed in the printed summary

# Absolute Pearson correlation of every numeric feature with the target.
# NOTE(review): this ranking uses the full dataset, i.e. it runs before the
# train/test split in section 7, so test rows influence which features are kept.
target = df['TARGET']
numeric_cols = [col for col in df.select_dtypes(include=[np.number]).columns if col != 'TARGET']

correlations = {}
for col in numeric_cols:
    corr = df[col].corr(target)
    # A constant column yields NaN; NaN keys make the sort order undefined,
    # so such columns are left out of the ranking.
    if pd.notna(corr):
        correlations[col] = abs(corr)

# Top features by absolute correlation (ties keep the original column order).
top_features = sorted(correlations.items(), key=lambda x: x[1], reverse=True)[:N_TOP_FEATURES]
top_feature_names = [f[0] for f in top_features]

print(f"Selected top {len(top_features)} features by correlation with target:")
for i, (feat, corr) in enumerate(top_features[:N_SHOWN], 1):
    print(f"{i:2}. {feat:30} | correlation: {corr:.4f}")
n_not_shown = len(top_features) - N_SHOWN
if n_not_shown > 0:
    print(f"   ... and {n_not_shown} more")

# Create final dataset with selected features
X = df[top_feature_names]
y = df['TARGET']

print(f"\n‚úÖ Final dataset shape: X={X.shape}, y={y.shape}")


üéØ QUICK FEATURE SELECTION
Selected top 30 features by correlation with target:
 1. EXT_SOURCE_2                   | correlation: 0.1603
 2. EXT_SOURCE_3                   | correlation: 0.1559
 3. DAYS_BIRTH                     | correlation: 0.0782
 4. REGION_RATING_CLIENT_W_CITY    | correlation: 0.0609
 5. REGION_RATING_CLIENT           | correlation: 0.0589
 6. DAYS_LAST_PHONE_CHANGE         | correlation: 0.0552
 7. NAME_EDUCATION_TYPE            | correlation: 0.0547
 8. CODE_GENDER                    | correlation: 0.0547
 9. DAYS_ID_PUBLISH                | correlation: 0.0515
10. REG_CITY_NOT_WORK_CITY         | correlation: 0.0510
   ... and 20 more

‚úÖ Final dataset shape: X=(307511, 30), y=(307511,)


## 7. Train/Test Split

In [11]:
print("\n" + "="*60)
print("üìä TRAIN/TEST SPLIT")
print("="*60)

# Stratified hold-out split: 20% test, fixed seed for reproducibility.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_train, n_features = X_train.shape
n_test = X_test.shape[0]
print(f"Training set: {n_train:,} samples")
print(f"Testing set: {n_test:,} samples")
print(f"Feature count: {n_features}")

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("‚úÖ Features scaled with StandardScaler")


üìä TRAIN/TEST SPLIT
Training set: 246,008 samples
Testing set: 61,503 samples
Feature count: 30
‚úÖ Features scaled with StandardScaler


## 8. Save Processed Data

In [12]:
print("\n" + "="*60)
print("üíæ SAVING PROCESSED DATA")
print("="*60)

# Bundle everything the modelling notebook needs into one pickle.
processed_data = {
    'X_train': X_train_scaled,
    'X_test': X_test_scaled,
    'y_train': y_train,
    'y_test': y_test,
    'feature_names': top_feature_names,
    'class_weights': weight_dict,
    'scaler': scaler,
    # Persist the fitted encoders next to the scaler; without them the
    # integer codes of the categorical features cannot be reproduced or
    # decoded outside this notebook. This is an extra key: existing keys
    # are unchanged.
    'label_encoders': label_encoders
}

with open('processed_data.pkl', 'wb') as f:
    pickle.dump(processed_data, f)

print("‚úÖ Saved: processed_data.pkl")

# Save the selected features and their absolute correlation with the target.
feature_info = pd.DataFrame({
    'feature': top_feature_names,
    'correlation_with_target': [correlations[f] for f in top_feature_names]
})
feature_info.to_csv('top_features.csv', index=False)
print("‚úÖ Saved: top_features.csv")


üíæ SAVING PROCESSED DATA
‚úÖ Saved: processed_data.pkl
‚úÖ Saved: top_features.csv


## 🎯 READY FOR MODELLING!

**Notebook 2 selesai**

**Lanjut ke Notebook 3:** `03_model_training.ipynb`

In [14]:
# Closing banner followed by a checklist of what this notebook produced.
rule = "=" * 60
closing_lines = (
    "\n" + rule,
    "‚úÖ NOTEBOOK 2 COMPLETED!",
    rule,
    "\n‚úÖ Data cleaned and preprocessed",
    "‚úÖ Categorical variables encoded",
    "‚úÖ Feature selection completed",
    "‚úÖ Train/test split created",
    "\n‚û°Ô∏è  NEXT: Create '03_model_training.ipynb'",
)
for closing_line in closing_lines:
    print(closing_line)


‚úÖ NOTEBOOK 2 COMPLETED!

‚úÖ Data cleaned and preprocessed
‚úÖ Categorical variables encoded
‚úÖ Feature selection completed
‚úÖ Train/test split created

‚û°Ô∏è  NEXT: Create '03_model_training.ipynb'
