In [None]:
# Import third-party packages and project helpers
import pandas as pd 
import numpy as np

from credit_risk.utils.paths import samples_dir

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer


In [3]:
# Read the training and validation samples from the project's samples directory.
train_df, val_df = (
    pd.read_parquet(samples_dir / "train.parquet"),
    pd.read_parquet(samples_dir / "val.parquet"),
)

# Show (rows, columns) for each split as a quick sanity check.
(train_df.shape, val_df.shape)

((941716, 31), (201796, 31))

In [4]:
# Name of the label column; every other column is kept as a feature.
target_col = "target"

# Separate the features (X_*) from the label (y_*) for each split.
X_train, y_train = train_df.drop(columns=[target_col]), train_df[target_col]
X_val, y_val = val_df.drop(columns=[target_col]), val_df[target_col]


In [24]:
# Parse earliest_cr_line into datetimes; values that cannot be parsed become NaT
# (errors="coerce") instead of raising.
# NOTE(review): the saved output of this cell looks like a pandas warning —
# presumably no single date format could be inferred. Consider passing an explicit
# format= once the raw string layout has been confirmed.
X_train = X_train.assign(
    earliest_cr_line=pd.to_datetime(X_train["earliest_cr_line"], errors="coerce")
)
X_val = X_val.assign(
    earliest_cr_line=pd.to_datetime(X_val["earliest_cr_line"], errors="coerce")
)


  X_train["earliest_cr_line"] = pd.to_datetime(
  X_val["earliest_cr_line"] = pd.to_datetime(


In [25]:
# Derive calendar features from the two date columns:
#   issue_d          -> issue_year, issue_month
#   earliest_cr_line -> earliest_cr_year
# Only the year of the earliest credit line is extracted here; no credit-history
# length (e.g. issue_year - earliest_cr_year) is computed.
# Both columns must already be datetime-typed for the .dt accessor to work.
X_train = X_train.assign(
    issue_year=X_train["issue_d"].dt.year,
    issue_month=X_train["issue_d"].dt.month,
    earliest_cr_year=X_train["earliest_cr_line"].dt.year,
)
X_val = X_val.assign(
    issue_year=X_val["issue_d"].dt.year,
    issue_month=X_val["issue_d"].dt.month,
    earliest_cr_year=X_val["earliest_cr_line"].dt.year,
)


In [26]:
# Remove the raw date columns from both splits.
# Not re-runnable as written: a second execution raises KeyError because the
# columns are already gone.
X_train = X_train.drop(["issue_d", "earliest_cr_line"], axis="columns")
X_val = X_val.drop(["issue_d", "earliest_cr_line"], axis="columns")


In [27]:
# Display the current feature columns.
# NOTE(review): the saved output below already lists 'fico_avg' and has no
# 'zip_code', although the cells that do this appear further down — the notebook
# was evidently executed out of document order; confirm with Restart & Run All.
X_train.columns

Index(['addr_state', 'annual_inc', 'application_type', 'dti', 'home_ownership',
       'initial_list_status', 'installment', 'int_rate', 'loan_amnt',
       'mort_acc', 'open_acc', 'pub_rec', 'pub_rec_bankruptcies', 'purpose',
       'revol_bal', 'revol_util', 'sub_grade', 'term', 'total_acc',
       'verification_status', 'emp_length_num', 'emp_length_missing',
       'revol_util_missing', 'mort_acc_missing', 'fico_avg', 'issue_year',
       'issue_month', 'earliest_cr_year'],
      dtype='object')

In [7]:
# Discard zip_code from both splits.
# Not re-runnable as written: a second execution raises KeyError because the
# column is already gone.
X_train = X_train.drop(["zip_code"], axis="columns")
X_val = X_val.drop(["zip_code"], axis="columns")


In [8]:
# Collapse the FICO range into its midpoint (fico_avg) and drop the two bounds.
# fico_avg is appended as the last column before the bounds are removed.
X_train = X_train.assign(
    fico_avg=(X_train["fico_range_low"] + X_train["fico_range_high"]) / 2
).drop(columns=["fico_range_low", "fico_range_high"])

X_val = X_val.assign(
    fico_avg=(X_val["fico_range_low"] + X_val["fico_range_high"]) / 2
).drop(columns=["fico_range_low", "fico_range_high"])


In [None]:

# Display (rows, columns) of the training features as a sanity check.
X_train.shape

(941716, 27)

In [11]:
# Display the current feature columns.
# NOTE(review): the saved output below still lists 'issue_d' and
# 'earliest_cr_line', which the date cells earlier in the document drop — this
# output predates those cells; confirm with Restart & Run All.
X_train.columns

Index(['addr_state', 'annual_inc', 'application_type', 'dti',
       'earliest_cr_line', 'home_ownership', 'initial_list_status',
       'installment', 'int_rate', 'issue_d', 'loan_amnt', 'mort_acc',
       'open_acc', 'pub_rec', 'pub_rec_bankruptcies', 'purpose', 'revol_bal',
       'revol_util', 'sub_grade', 'term', 'total_acc', 'verification_status',
       'emp_length_num', 'emp_length_missing', 'revol_util_missing',
       'mort_acc_missing', 'fico_avg'],
      dtype='object')

In [31]:
# Columns routed through the numeric pipeline.
# Order matters: it determines the column order of the transformed output.
num_features = [
    "annual_inc", "dti", "installment", "int_rate", "loan_amnt",
    "revol_bal", "revol_util", "total_acc", "open_acc", "mort_acc",
    "emp_length_num", "fico_avg",
    # features derived from the date columns
    "issue_year", "issue_month", "earliest_cr_year",
]


In [13]:
# Columns routed through the "binary" pipeline.
# NOTE(review): pub_rec and pub_rec_bankruptcies sound like counts rather than
# 0/1 flags — confirm they belong in this group and not in num_features.
binary_features = [
    # missing-value indicator columns
    "emp_length_missing", "revol_util_missing", "mort_acc_missing",
    # public-record columns
    "pub_rec", "pub_rec_bankruptcies",
]


In [15]:
# Columns routed through the categorical (one-hot) pipeline.
# Order matters: it determines the order of the encoded column groups.
cat_features = [
    "addr_state", "application_type", "home_ownership", "initial_list_status",
    "purpose", "sub_grade", "term", "verification_status",
]


In [17]:
# Numeric columns: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)


In [18]:
# "Binary" columns: fill missing values with the most frequent value; the values
# are otherwise passed through unchanged (no scaling step).
bin_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
    ]
)


In [19]:
# Date-derived columns: fill missing values with the most frequent value, then
# standardise.
# NOTE(review): in the cells visible here this pipeline is never passed to the
# ColumnTransformer — the date-derived columns are listed in num_features
# instead. Either wire it in or remove it.
date_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("scaler", StandardScaler()),
    ]
)


In [20]:
# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode into a dense array. Categories not seen during fit are ignored
# at transform time (handle_unknown="ignore") rather than raising.
cat_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)


In [32]:
# Route each column group through its own pipeline. Output columns follow the
# order of the transformers below; any column not named in one of the three
# lists is discarded (remainder="drop").
preprocessor = ColumnTransformer(
    [
        ("num", num_pipeline, num_features),
        ("bin", bin_pipeline, binary_features),
        ("cat", cat_pipeline, cat_features),
    ],
    remainder="drop",
)


In [33]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformer to the validation split, so that imputation values, scaling
# statistics and one-hot categories are learned from training data alone.
X_train_fe = preprocessor.fit_transform(X_train)
X_val_fe   = preprocessor.transform(X_val)

