In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import re

# Read the raw loan records from disk. low_memory=False is passed so pandas
# does not infer column dtypes chunk-by-chunk (which can yield mixed types).
df = pd.read_csv(filepath_or_buffer="loan_data.csv", low_memory=False)

In [None]:
# Keep only the columns used by the rest of the pipeline; everything else is
# discarded. A missing column raises KeyError here.
df = df.loc[:, [
    'loan_amnt',
    'term',
    'int_rate',
    'emp_length',
    'home_ownership',
    'annual_inc',
    'purpose',
    'issue_d',
    'loan_status',
    'dti',
    'revol_util',
    'open_acc',
    'pub_rec',
    'total_acc',
]]


In [None]:
# Encode the binary target label.
# Only loans with a final outcome are kept: "Fully Paid" -> 0, "Charged Off" -> 1.
# The explicit .copy() makes the filtered frame independent of the frame it was
# sliced from, so the column assignment below does not trigger pandas'
# SettingWithCopyWarning.
df = df[df["loan_status"].isin(["Fully Paid", "Charged Off"])].copy()
df["target"] = df["loan_status"].map({"Fully Paid": 0, "Charged Off": 1})
# The raw status string is no longer needed once the numeric label exists.
df = df.drop(columns="loan_status")

In [None]:
# Clean and convert columns.
# int_rate / revol_util are accessed through .str, so they are presumably
# percent strings such as "13.5%" (TODO confirm against the CSV). The "%" is
# removed as a literal character (regex=False) before parsing as float.
df["int_rate"] = df["int_rate"].str.replace('%', '', regex=False).astype(float)
df["revol_util"] = df["revol_util"].str.replace('%', '', regex=False).astype(float)
# Pull the first run of digits out of term. The pattern is a raw string so that
# "\d" is not treated as an (invalid) Python string escape, and expand=False
# makes str.extract return a Series rather than a one-column DataFrame.
# astype(int) still raises if any row has no digits (NaN), same as before.
df["term"] = df["term"].str.extract(r'(\d+)', expand=False).astype(int)

In [None]:

# Parse issue_d as a datetime, derive issue_year / issue_month from it, and
# then remove the original date column.
df = (
    df.assign(issue_d=pd.to_datetime(df["issue_d"]))
      .assign(
          issue_year=lambda frame: frame["issue_d"].dt.year,
          issue_month=lambda frame: frame["issue_d"].dt.month,
      )
      .drop(columns="issue_d")
)

In [None]:
# Clean emp_length
def clean_emp_length(val):
    """Convert a raw employment-length value to a number of years.

    Parameters
    ----------
    val : str, number, or missing
        Raw value, e.g. "< 1 year", "3 years", "10+ years".

    Returns
    -------
    int, float, or the original non-string value
        0 for "< 1 year", 10 for "10+", the first integer found in any other
        string, and NaN for missing values or strings with no digits.
        Non-string, non-null values are returned unchanged.
    """
    if pd.isnull(val):
        return np.nan
    if not isinstance(val, str):
        # Already converted (e.g. the cell was re-run on a numeric column):
        # pass the value through instead of failing on the substring checks.
        return val
    if "< 1 year" in val:
        return 0
    if "10+" in val:
        return 10
    # Strings without any digits are treated as missing rather than raising
    # AttributeError on a failed regex match.
    match = re.search(r'\d+', val)
    return int(match.group()) if match else np.nan

# Run every emp_length entry through clean_emp_length, element by element.
df["emp_length"] = df["emp_length"].map(clean_emp_length)

In [None]:
# Missing values: fill gaps in the numeric columns with the column median.
# The label column "target" is excluded so it is not passed through the imputer
# and keeps its integer 0/1 encoding instead of being cast to float.
# NOTE(review): the imputer is fit on the whole frame; if a train/test split
# happens later, consider fitting on the training portion only.
imputer = SimpleImputer(strategy='median')
num_cols = (
    df.select_dtypes(include=['float64', 'int64'])
      .columns
      .drop("target", errors="ignore")
)
df[num_cols] = imputer.fit_transform(df[num_cols])

In [None]:
# 5. Feature engineering
# loan_to_income: loan amount divided by (annual income + 1); the +1 keeps the
# denominator non-zero when income is 0.
# installment_rate: loan amount multiplied by the interest rate as a fraction.
df = df.assign(
    loan_to_income=df["loan_amnt"].div(df["annual_inc"].add(1)),
    installment_rate=df["loan_amnt"].mul(df["int_rate"].div(100)),
)



In [None]:
# 6. Categorical encoding
# One-hot encode purpose and home_ownership; drop_first=True omits the first
# level of each so the indicator columns are not perfectly collinear.
df = pd.get_dummies(
    data=df,
    columns=["purpose", "home_ownership"],
    drop_first=True,
)



In [None]:
# 7. Scale features
# Standardize every non-target column and split the result into X (features)
# and y (label).
features = df.drop(columns="target")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)
# fit_transform returns a bare array, so the row index must be re-attached.
# Reusing features.index keeps X aligned with y, which still carries the index
# left over from the earlier row filtering; without it X would get a fresh
# 0..n-1 index and label-based operations between X and y would misalign.
X = pd.DataFrame(X_scaled, columns=features.columns, index=features.index)
y = df["target"]