## Logistic Regression Feature Engineering

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

In [2]:
# Load the raw loan records; the path is relative to the notebook's location.
df_loans = pd.read_csv(filepath_or_buffer="../data/loans.csv")

In [3]:
# Preview the first five rows to sanity-check the columns and their raw values.
df_loans.head(n=5)

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


In [4]:
# Class balance of the target: how many loans defaulted (1) versus not (0).
default_counts = df_loans["Default"].value_counts()
default_counts

Default
0    225694
1     29653
Name: count, dtype: int64

In [5]:
# LoanID looks like a per-row identifier rather than a feature, so remove it.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run once the column is already gone; `axis` is unnecessary
# when `columns=` is given.
df_loans = df_loans.drop(columns=["LoanID"], errors="ignore")

### Numerical features

###### Min-max normalize DTIRatio, since its values already lie between 0 and 1.
###### Treat LoanTerm as a categorical feature and one-hot encode it, because it takes only a few discrete values.
###### All other numerical features are standard-scaled.

#### Standard scaling

In [6]:
# Numeric columns that are mean-imputed and then standardized.
numerical_feat = ["Age", "Income", "LoanAmount", "CreditScore"]
numerical_feat += ["MonthsEmployed", "NumCreditLines", "InterestRate"]

# Impute first so the scaler never sees missing values.
numerical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

#### Normalization

In [7]:
# DTIRatio gets its own branch: mean imputation followed by min-max scaling.
dtir_feat = ["DTIRatio"]

dtir_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("minmax", MinMaxScaler()),
    ]
)

### Categorical features

In [8]:
# LoanTerm is numeric in the raw data but takes only a few discrete values, so
# it is one-hot encoded alongside the string categoricals instead of reaching
# the model unscaled through the ColumnTransformer's remainder="passthrough".
categorical_feat = [
    "Education",
    "EmploymentType",
    "MaritalStatus",
    "LoanPurpose",
    "LoanTerm",
]

# Fill gaps with the most frequent category, then one-hot encode.
# handle_unknown="ignore" encodes categories unseen during fit as all zeros
# rather than raising at transform time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

### Binary features

In [9]:
binary_columns = ["HasMortgage", "HasDependents", "HasCoSigner"]

# Encode Yes/No flags as 1/0 with an explicit per-column map. DataFrame.replace
# on object columns followed by astype(int) relies on pandas' silent
# downcasting, which is deprecated and emits a FutureWarning.
# The identity entries (1 -> 1, 0 -> 0) keep the cell safe to re-run after the
# columns have already been converted. Any other value maps to NaN, which makes
# astype(int) raise instead of silently producing a wrong flag.
yes_no_to_int = {"Yes": 1, "No": 0, 1: 1, 0: 0}
df_loans[binary_columns] = (
    df_loans[binary_columns]
    .apply(lambda flag_col: flag_col.map(yes_no_to_int))
    .astype(int)
)

  df_loans[binary_columns].replace({"Yes": 1, "No": 0}).astype(int)


### Model

In [10]:
# Route each feature group to its own preprocessing branch. Columns that appear
# in none of the lists (e.g. the 0/1 binary flags) are forwarded unchanged
# because of remainder="passthrough".
feature_branches = [
    ("num", numerical_transformer, numerical_feat),
    ("dtir", dtir_transformer, dtir_feat),
    ("cat", categorical_transformer, categorical_feat),
]

preprocessor = ColumnTransformer(
    transformers=feature_branches,
    remainder="passthrough",
)

In [11]:
# End-to-end model: preprocess -> oversample the minority class -> classify.
# The imblearn Pipeline is used because a sampler step such as SMOTE is only
# supported there, and it is applied during fit rather than at predict time.
model_steps = [
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("classifier", LogisticRegression()),
]

pipeline = ImbPipeline(steps=model_steps)

In [12]:
# Separate the target column from the feature matrix.
y = df_loans["Default"]
X = df_loans.drop(columns=["Default"])

In [13]:
# Confirm the feature matrix no longer contains the target or LoanID.
X.head(n=5)

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
0,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,1,1,Other,1
1,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,0,0,Other,1
2,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,1,1,Auto,0
3,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,0,0,Business,0
4,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,0,1,Auto,0


In [14]:
# Hold out 20% of the rows for evaluation. The target is imbalanced (roughly
# 12% defaults in the value counts above), so stratify on y to keep the same
# default rate in both the training and the test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the preprocessing, SMOTE and classifier steps on the training data only.
pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).

