## Logistic Regression Feature Engineering

In [37]:
import polars as pl

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

In [38]:
# Load the loans CSV; the path is relative to the notebook's working directory.
df = pl.read_csv("../data/loans.csv")

In [39]:
# Preview the first rows to sanity-check how the columns were parsed.
df.head()

LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
str,i64,i64,i64,i64,i64,i64,f64,i64,f64,str,str,str,str,str,str,str,i64
"""I38PQUQS96""",56,85994,50587,520,80,4,15.23,36,0.44,"""Bachelor's""","""Full-time""","""Divorced""","""Yes""","""Yes""","""Other""","""Yes""",0
"""HPSK72WA7R""",69,50432,124440,458,15,1,4.81,60,0.68,"""Master's""","""Full-time""","""Married""","""No""","""No""","""Other""","""Yes""",0
"""C1OZ6DPJ8Y""",46,84208,129188,451,26,3,21.17,24,0.31,"""Master's""","""Unemployed""","""Divorced""","""Yes""","""Yes""","""Auto""","""No""",1
"""V2KKSFM3UN""",32,31713,44799,743,0,3,7.07,24,0.23,"""High School""","""Full-time""","""Married""","""No""","""No""","""Business""","""No""",0
"""EY08JDHTZP""",60,20437,9139,633,8,4,6.51,48,0.73,"""Bachelor's""","""Unemployed""","""Divorced""","""No""","""Yes""","""Auto""","""No""",0


In [40]:
# Class balance of the target: the recorded output shows far fewer 1s than 0s,
# which is why the model section oversamples with SMOTE.
df["Default"].value_counts()

Default,count
i64,u32
0,225694
1,29653


In [41]:
# Inspect the inferred dtypes to decide how each column should be encoded.
df.schema

Schema([('LoanID', String),
        ('Age', Int64),
        ('Income', Int64),
        ('LoanAmount', Int64),
        ('CreditScore', Int64),
        ('MonthsEmployed', Int64),
        ('NumCreditLines', Int64),
        ('InterestRate', Float64),
        ('LoanTerm', Int64),
        ('DTIRatio', Float64),
        ('Education', String),
        ('EmploymentType', String),
        ('MaritalStatus', String),
        ('HasMortgage', String),
        ('HasDependents', String),
        ('LoanPurpose', String),
        ('HasCoSigner', String),
        ('Default', Int64)])

In [42]:
# LoanID is a row identifier, not a predictor. Rebind `df` without it rather than
# mutating in place: `strict=False` keeps the cell safe to re-run once the column
# is already gone, and the cell no longer echoes the identifier column as output.
df = df.drop("LoanID", strict=False)


### Numerical features

###### Min-max normalize DTIRatio, since its values already lie between 0 and 1.
###### Classify LoanTerm as a categorical feature and one-hot encode it (as it is a discrete variable)
###### All other numerical features require standard scaling

In [43]:
# Continuous numeric predictors. LoanTerm is intentionally absent: the notes above
# classify it as a categorical feature.
num_feats = ["Age", "Income", "LoanAmount", "CreditScore"]
num_feats += ["MonthsEmployed", "NumCreditLines", "InterestRate", "DTIRatio"]

#### Standard scaling

In [44]:
# Every numeric feature except DTIRatio is standardized.
scale_feats = [feat for feat in num_feats if feat != "DTIRatio"]
scaler = StandardScaler()

# scikit-learn works on a NumPy matrix here; the result is wrapped back into a
# polars frame with the original column names and downcast to Float32.
df_num = df.select(scale_feats)
num_arr = df_num.to_numpy()
scaled_arr = scaler.fit_transform(num_arr)
df_scaled = pl.DataFrame(scaled_arr, schema=scale_feats).cast(pl.Float32)

#### Normalization

In [45]:
# DTIRatio is the only feature that is min-max normalized.
norm_feats = [x for x in num_feats if x == "DTIRatio"]

df_dtir = df[norm_feats]
dtir_arr = df_dtir.to_numpy()

# Fit the MinMaxScaler itself. Calling the StandardScaler (`scaler`) here would
# standardize DTIRatio instead of normalizing it and would also overwrite the
# statistics that `scaler` learned from the other numeric features.
normalizer = MinMaxScaler(feature_range=(0, 1))
normed_arr = normalizer.fit_transform(dtir_arr)
df_normed = pl.DataFrame(normed_arr, norm_feats).cast(pl.Float32)

### Categorical features

In [46]:
# LoanTerm is stored as an integer but is treated as a discrete category (see the
# feature notes above), so it is one-hot encoded with the string categoricals
# instead of being silently left out of the feature matrix.
cat_feats = ["Education", "EmploymentType", "MaritalStatus", "LoanPurpose", "LoanTerm"]

encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=int)
encoder.set_output(transform="polars")
# Cast to String first so the encoder receives one uniform dtype across columns.
df_cat_enc = encoder.fit_transform(
    df.select(pl.col(cat_feats).cast(pl.String))
).cast(pl.Int8)

### Binary features

In [47]:
bin_feats = ["HasMortgage", "HasDependents", "HasCoSigner"]
bin_map = {"Yes": 1, "No": 0}

# Encode all Yes/No flags in one pass instead of rebuilding the frame per column.
df_bin_enc = df.select(
    [pl.col(feat).replace(bin_map).alias(feat).cast(pl.Int8) for feat in bin_feats]
)

In [54]:
# Assemble the feature matrix column-wise from the individually encoded frames.
feature_frames = [df_scaled, df_normed, df_cat_enc, df_bin_enc]
df_x = pl.concat(feature_frames, how="horizontal")

# Target vector, downcast to Int8.
df_y = df.get_column("Default").cast(pl.Int8)

### Model

In [50]:
# Column groups routed to each transformer. They are derived from the feature
# lists defined earlier in the notebook so the two stay in sync.
numerical_feat = [feat for feat in num_feats if feat != "DTIRatio"]
dtir_feat = ["DTIRatio"]
# LoanTerm gets its own encoder (it is an integer column), so keep it out of the
# string categoricals even if `cat_feats` already lists it.
categorical_feat = [feat for feat in cat_feats if feat != "LoanTerm"]
term_feat = ["LoanTerm"]
binary_feat = list(bin_feats)

# One transformer per group, mirroring the manual feature engineering above.
numerical_transformer = StandardScaler()
dtir_transformer = MinMaxScaler(feature_range=(0, 1))
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
term_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
# drop="if_binary" keeps a single 0/1 column per Yes/No feature.
binary_transformer = OneHotEncoder(drop="if_binary", sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_feat),
        ("dtir", dtir_transformer, dtir_feat),
        ("cat", categorical_transformer, categorical_feat),
        ("term", term_transformer, term_feat),
        ("bin", binary_transformer, binary_feat),
    ],
    remainder="passthrough",
)

In [None]:
# Step order: preprocess the features, oversample with SMOTE, then fit the classifier.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("classifier", LogisticRegression()),
]
pipeline = ImbPipeline(steps=pipeline_steps)

In [None]:
# Split the loaded frame into predictors and target. `df` is the polars frame read
# at the top of the notebook (no `df_loans` exists), and polars' `drop` takes the
# column name positionally rather than via a pandas-style `columns=` keyword.
X = df.drop("Default")
# A NumPy target keeps the splitter and the resampler on well-trodden input types.
y = df["Default"].to_numpy()

In [None]:
# Preview the predictor frame before splitting.
X.head()

In [None]:
# Hold out 20% of the rows (fixed seed), then fit the full pipeline on the rest.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split
pipeline.fit(X_train, y_train)