## Random Forest Feature Engineering

In [48]:
import pandas as pd
import numpy as np
from scipy.stats import skew

In [49]:
df_src = pd.read_csv("loans.csv")

In [50]:
cols_map = {}

for col in df_src.columns:
    cols_map[col] = col.lower().replace("/", "_")

df_src.rename(columns=cols_map, inplace=True)

In [51]:
print(df_src['risk_flag'].value_counts(normalize=True))
print("----------------------")
print("----------------------")
print(df_src.info())

risk_flag
0    0.877
1    0.123
Name: proportion, dtype: float64
----------------------
----------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252000 entries, 0 to 251999
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   id                 252000 non-null  int64 
 1   income             252000 non-null  int64 
 2   age                252000 non-null  int64 
 3   experience         252000 non-null  int64 
 4   married_single     252000 non-null  object
 5   house_ownership    252000 non-null  object
 6   car_ownership      252000 non-null  object
 7   profession         252000 non-null  object
 8   city               252000 non-null  object
 9   state              252000 non-null  object
 10  current_job_yrs    252000 non-null  int64 
 11  current_house_yrs  252000 non-null  int64 
 12  risk_flag          252000 non-null  int64 
dtypes: int64(7), object(6)
memory usage: 25.0+ MB
None


In [52]:
type_map = {
    "income": "int8", 
    "age": "int8", 
    "experience": "int8",
    "married_single": "string",
    "house_ownership": "string",
    "car_ownership": "string",
    "profession": "string",
    "city": "string",
    "state": "string",
    "current_job_yrs": "int8",
    "current_house_yrs": "int8",
    "risk_flag": "int8"
}

for col, dtype in type_map.items():
    df_src[col] = df_src[col].astype(dtype)

df_src.dtypes

id                            int64
income                         int8
age                            int8
experience                     int8
married_single       string[python]
house_ownership      string[python]
car_ownership        string[python]
profession           string[python]
city                 string[python]
state                string[python]
current_job_yrs                int8
current_house_yrs              int8
risk_flag                      int8
dtype: object

In [53]:
df_src.drop(labels=["id"], inplace=True, axis=1)

In [54]:
# Remove duplicate rows
df_src.drop_duplicates(inplace=True)

if df_src.duplicated().any() == np.False_:
    # log duplicates to an output table
    print("Duplicates have been removed")

Duplicates have been removed


In [55]:
# Log-transform skewed features
# This should be logged to an output
skew_val = skew(df_src["income"])
print(skew_val)

-0.015994971955553557


In [56]:
# Assess skew values
for col in ['age', 'experience', 'current_job_yrs', 'current_house_yrs']:
    print(f"{col} skew: {skew(df_src[col].dropna()):.2f}")

age skew: 0.01
experience skew: 0.01
current_job_yrs skew: 0.26
current_house_yrs skew: -0.01


In [57]:
df_eng = df_src

In [58]:
# Feature engineering
df_eng["job_stability"] = df_eng["current_job_yrs"] / df_eng["experience"]
df_eng["moved_recently"] = (df_eng["current_house_yrs"] < 1).astype(int)
df_eng['age_bucket'] = pd.cut(df_eng['age'], bins=[18, 25, 35, 45, 60, 100], labels=['18-24', '25-34', '35-44', '45-59', '60+'])

# When using Random Forest scaling is not required
# When using Linear models scaling is recommended
# This should be handled when users are given the opportunity to select between models in the frontend

In [59]:
# RF handles integers well, for features where there are two options, use label encoding.
# For features that use more than 2, use one-hot
df_enc = df_eng

In [60]:
# Label Encoding
# We can automate this - check if field is categorical and there are two options
for col, enc in {"married_single": {'single': 0, 'married': 1}, "car_ownership": {'no': 0, 'yes': 1}}.items():
    df_enc[col] = df_enc[col].map(enc).astype("int8")

In [61]:
# Check cardinality
# < 10 use one-hot
# 10-50 use one-hot or frequency (can be a feature on the front end)
# > 50 use frequency or target encoding (can be a feature on the front end)
for col in ["city", "state", "house_ownership", "profession", "age_bucket"]:
    unique_vals = df_eng[col].nunique()
    print(f"{col}: {unique_vals} unique categories")

city: 317 unique categories
state: 29 unique categories
house_ownership: 3 unique categories
profession: 51 unique categories
age_bucket: 5 unique categories


In [62]:
# One-hot Encoding for features with low cardinality
for col in ["age_bucket", "house_ownership"]:
    df_enc = pd.get_dummies(df_enc, columns=[col], drop_first=True, dtype=int)

In [63]:
# Frequency encoding for high cardinality
for col in ["city", "state", "profession"]:
    freq = df_enc[col].value_counts(normalize=True)
    df_enc[f'{col}_freq'] = df_enc[col].map(freq)
    df_enc.drop(columns=[col], inplace=True)

In [64]:
cols_map = {}

for col in df_enc.columns:
    cols_map[col] = col.lower()

df_enc.rename(columns=cols_map, inplace=True)

In [65]:
df_enc.head()

Unnamed: 0,income,age,experience,married_single,car_ownership,current_job_yrs,current_house_yrs,risk_flag,job_stability,moved_recently,age_bucket_25-34,age_bucket_35-44,age_bucket_45-59,age_bucket_60+,house_ownership_owned,house_ownership_rented,city_freq,state_freq,profession_freq
0,26,23,3,0,0,3,13,0,1.0,0,0,0,0,0,0,1,0.003195,0.056911,0.019981
1,-12,40,10,0,0,9,13,0,0.9,0,0,1,0,0,0,1,0.002917,0.100486,0.02019
2,7,66,4,1,0,4,10,0,1.0,0,0,0,0,1,0,1,0.002894,0.022968,0.01975
3,67,41,2,0,1,2,12,1,1.0,0,0,1,0,0,0,1,0.002987,0.019403,0.02019
4,-89,47,11,0,0,3,14,1,0.272727,0,0,0,1,0,0,1,0.003427,0.064969,0.019935
