## Logistic Regression Feature Engineering

In [231]:
import os
import polars as pl

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder,
)
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

In [232]:
# Load the raw loan records into a polars DataFrame.
raw_csv_path = "../data/raw/loans.csv"
df = pl.read_csv(raw_csv_path)

In [233]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
str,i64,i64,i64,i64,i64,i64,f64,i64,f64,str,str,str,str,str,str,str,i64
"""I38PQUQS96""",56,85994,50587,520,80,4,15.23,36,0.44,"""Bachelor's""","""Full-time""","""Divorced""","""Yes""","""Yes""","""Other""","""Yes""",0
"""HPSK72WA7R""",69,50432,124440,458,15,1,4.81,60,0.68,"""Master's""","""Full-time""","""Married""","""No""","""No""","""Other""","""Yes""",0
"""C1OZ6DPJ8Y""",46,84208,129188,451,26,3,21.17,24,0.31,"""Master's""","""Unemployed""","""Divorced""","""Yes""","""Yes""","""Auto""","""No""",1
"""V2KKSFM3UN""",32,31713,44799,743,0,3,7.07,24,0.23,"""High School""","""Full-time""","""Married""","""No""","""No""","""Business""","""No""",0
"""EY08JDHTZP""",60,20437,9139,633,8,4,6.51,48,0.73,"""Bachelor's""","""Unemployed""","""Divorced""","""No""","""Yes""","""Auto""","""No""",0


In [234]:
# Class balance of the target column.
df.get_column("Default").value_counts()

Default,count
i64,u32
0,225694
1,29653


In [235]:
# Show every column name with its polars dtype.
df.schema

Schema([('LoanID', String),
        ('Age', Int64),
        ('Income', Int64),
        ('LoanAmount', Int64),
        ('CreditScore', Int64),
        ('MonthsEmployed', Int64),
        ('NumCreditLines', Int64),
        ('InterestRate', Float64),
        ('LoanTerm', Int64),
        ('DTIRatio', Float64),
        ('Education', String),
        ('EmploymentType', String),
        ('MaritalStatus', String),
        ('HasMortgage', String),
        ('HasDependents', String),
        ('LoanPurpose', String),
        ('HasCoSigner', String),
        ('Default', Int64)])

In [236]:
# Distinct loan terms that occur in the data.
df.get_column("LoanTerm").unique().to_list()

[12, 24, 36, 48, 60]

In [237]:
# Remove the LoanID column from df. drop_in_place mutates df and returns the
# removed column, which is what this cell displays as its output.
# NOTE(review): this cell is presumably not idempotent - once LoanID is gone a
# second run has nothing to drop; confirm, and consider `df = df.drop("LoanID")`.
df.drop_in_place("LoanID")

LoanID
str
"""I38PQUQS96"""
"""HPSK72WA7R"""
"""C1OZ6DPJ8Y"""
"""V2KKSFM3UN"""
"""EY08JDHTZP"""
…
"""8C6S86ESGC"""
"""98R4KDHNND"""
"""XQK1UUUNGP"""
"""JAO28CPL4H"""


### Numerical features

###### Normalize DTIRatio as the value range is already between 0 and 1.
###### Treat LoanTerm as a discrete feature and ordinal-encode it (it only takes the values 12, 24, 36, 48 and 60)
###### All other numerical features require standard scaling

In [238]:
# Numerical columns that are rescaled in the cells below.
num_feats = ["Age", "Income", "LoanAmount", "CreditScore"]
num_feats += ["MonthsEmployed", "NumCreditLines", "InterestRate", "DTIRatio"]

#### Standard scaling

In [239]:
# Standard-scale every numerical feature except DTIRatio.
scale_feats = [feat for feat in num_feats if feat != "DTIRatio"]

# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split later in the notebook - consider fitting on the training rows only.
df_num = df.select(scale_feats)
num_arr = df_num.to_numpy()

scaler = StandardScaler()
scaled_arr = scaler.fit_transform(num_arr)

# Store the scaled values as Float32 and overwrite the original columns.
df_scaled = pl.DataFrame(scaled_arr, schema=scale_feats).cast(pl.Float32)
df = df.with_columns(df_scaled.get_columns())

#### Normalization

In [240]:
# Min-max normalize DTIRatio into the [0, 1] range.
norm_feats = [x for x in num_feats if x == "DTIRatio"]

df_dtir = df[norm_feats]
dtir_arr = df_dtir.to_numpy()

normalizer = MinMaxScaler(feature_range=(0, 1))
# Fit the MinMaxScaler created above (not the StandardScaler held in `scaler`),
# so DTIRatio is really min-max normalized and `scaler` keeps the statistics
# it learned from the standard-scaled features.
normed_arr = normalizer.fit_transform(dtir_arr)

# Store the normalized values as Float32 and overwrite the original column.
df_normed = pl.DataFrame(normed_arr, norm_feats).cast(pl.Float32)
df[df_normed.columns] = df_normed

#### Ordinal encoding

In [241]:
# Map each distinct LoanTerm value to an integer category code.
encoder = OrdinalEncoder()
encoded_arr = encoder.fit_transform(df.select("LoanTerm"))

# Replace the original column with the codes, stored compactly as Int8.
loan_term_codes = pl.Series("LoanTerm", encoded_arr[:, 0]).cast(pl.Int8)
df = df.with_columns(loan_term_codes)

### Categorical features

In [242]:
# Nominal string columns that are expanded into one indicator column per level.
cat_feats = ["Education", "EmploymentType", "MaritalStatus", "LoanPurpose"]

# set_output makes the encoder hand back a polars DataFrame with named columns.
encoder = OneHotEncoder(
    handle_unknown="ignore", sparse_output=False, dtype=int
).set_output(transform="polars")
df_cat_enc = encoder.fit_transform(df.select(cat_feats)).cast(pl.Int8)

# Append the indicator columns, then remove the original string columns.
df = df.with_columns(df_cat_enc.get_columns()).drop(cat_feats)

### Binary features

In [243]:
# Yes/No columns are turned into 1/0 flags stored as Int8.
bin_feats = ["HasMortgage", "HasDependents", "HasCoSigner"]
bin_map = {"Yes": 1, "No": 0}

df = df.with_columns(
    [pl.col(name).replace(bin_map).cast(pl.Int8).alias(name) for name in bin_feats]
)

In [None]:
# Persist the engineered features so later steps can reload them.
tran_dir = "../data/transformed"

# makedirs(exist_ok=True) creates any missing parent directories and is safe
# to re-run, unlike a separate exists() check followed by mkdir().
os.makedirs(tran_dir, exist_ok=True)

# Move the target column to the last position before writing.
df = df.select(pl.exclude("Default"), pl.col("Default"))
df.write_csv(f"{tran_dir}/feats_engineered.csv")

### Training model

#### Train/test split

In [None]:
# Split the engineered frame into the feature matrix and the target vector.
# df_X / df_y are derived from df here so the cell runs on a fresh kernel.
df_X = df.drop("Default")
df_y = df["Default"]

X = df_X.to_numpy()
y = df_y.to_numpy()  # 1-D array of 0/1 labels

In [None]:
# Hold out 20% of the rows for testing. The target is imbalanced (about 11.6%
# defaults), so stratify on y to keep the same class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42, stratify=y
)

#### Handle imbalances

###### SMOTE should only be applied to the training set, NOT the test set.

In [None]:
# Oversample the training split only, with a fixed seed for reproducibility.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_bal, y_train_bal = resampled

#### Fit model

In [None]:
# Fit a logistic regression with default settings on the balanced training
# data; fit() returns the estimator itself, so the call can be chained.
model = LogisticRegression().fit(X_train_bal, y_train_bal)