In [73]:
# Fix imports when running from notebooks/ folder
import sys
from pathlib import Path

project_root = Path.cwd().parent 
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print(f"Added to path: {project_root}")

%load_ext autoreload
%autoreload 2

Added to path: /Users/elshaday/DEV/10Academy/credit-risk-probability-week4
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [74]:
from src import DataManager, WoeTransformer
from scripts.constants import (
    WOE_CANDIDATE_COLS,
    TARGET_COL,
    PROCESSED_FEATURES_WITH_PROXY_VAR_DATA_FILE_NAME,
    READY_TO_MODEL_DATA_FILE_NAME,
    Columns,
)
from tabulate import tabulate

In [75]:
# Read the processed feature table (including the proxy target) through the
# project's DataManager.
dm = DataManager()
processed_df = dm.load_csv(
    file_name=PROCESSED_FEATURES_WITH_PROXY_VAR_DATA_FILE_NAME,
    load_clean=True,
)

Loading ../data/processed/processed_features_with_proxy.csv...
Sucessfully loaded ../data/processed/processed_features_with_proxy.csv!


In [76]:
# Separate data frame with desired cols
# CustomerId is kept aside so it can be re-attached after the WoE transformation.
customer_id_series = processed_df[Columns.CustomerId.value]

# Explicit copy: makes `woe_df` independent of `processed_df`, so any later
# in-place edit of it (e.g. inside WoeTransformer — not visible from here) can
# neither touch the loaded frame nor raise SettingWithCopyWarning.
woe_df = processed_df[WOE_CANDIDATE_COLS + [TARGET_COL]].copy()

# Quick Checks on the DF
# No null target
assert woe_df[TARGET_COL].isna().sum() == 0, f"{TARGET_COL} contains null values"

# Target is binary
assert set(woe_df[TARGET_COL].unique()) <= {0, 1}, (
    f"{TARGET_COL} must be binary (0/1), found: {sorted(woe_df[TARGET_COL].unique())}"
)

# No ID leakage
assert Columns.CustomerId.value not in woe_df.columns, (
    f"{Columns.CustomerId.value} must not be among the WoE candidate columns"
)

woe_df.info()

print(tabulate(woe_df.head(), headers="keys", tablefmt="grid"))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3742 entries, 0 to 3741
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   TotalTransactionAmount      3742 non-null   float64
 1   AverageTransactionAmount    3742 non-null   float64
 2   TransactionCount            3742 non-null   float64
 3   TransactionAmountSTD        3742 non-null   float64
 4   AverageTransactionHour      3742 non-null   float64
 5   MostCommonTransactionDay    3742 non-null   int64  
 6   MostCommonTransactionMonth  3742 non-null   int64  
 7   ActiveYearsCount            3742 non-null   float64
 8   MostCommonProductCategory   3742 non-null   object 
 9   UniqueProductCategoryCount  3742 non-null   int64  
 10  MostCommonChannel           3742 non-null   object 
 11  is_high_risk                3742 non-null   int64  
dtypes: float64(6), int64(4), object(2)
memory usage: 350.9+ KB
+----+-------------------------

In [77]:
# Build the transformer from the candidate features plus the target column.
# NOTE(review): WoeTransformer is a project class; whether it copies or mutates
# `woe_df` is not visible from this notebook — confirm in src.
woe_transformer = WoeTransformer(woe_df)

In [78]:
# Fit the transformer and bin the candidate features, then preview the first
# five binned rows as a grid table.
transformed_df = woe_transformer.fit_transform()

print(
    tabulate(
        transformed_df.head(5),
        tablefmt="grid",
        headers="keys",
    )
)

+----+--------------------------+----------------------------+--------------------+------------------------+--------------------------+----------------------------+------------------------------+--------------------+-----------------------------+------------------------------+---------------------+----------------+
|    | TotalTransactionAmount   | AverageTransactionAmount   | TransactionCount   | TransactionAmountSTD   | AverageTransactionHour   | MostCommonTransactionDay   | MostCommonTransactionMonth   | ActiveYearsCount   | MostCommonProductCategory   | UniqueProductCategoryCount   | MostCommonChannel   |   is_high_risk |
|  0 | bin_0                    | bin_0                      | bin_0              | bin_0                  | bin_8                    | bin_6                      | bin_1                        | bin_0              | airtime                     | bin_0                        | ChannelId_2         |              1 |
+----+--------------------------+----------------

In [79]:
# Information Value per feature; printed in index order rather than in the
# order the table is returned in.
iv_df = woe_transformer.get_iv_table()
print(iv_df.sort_index(axis=0))

                       feature        iv
0       TotalTransactionAmount  3.545801
1     AverageTransactionAmount  0.679145
2             TransactionCount  4.341496
3         TransactionAmountSTD  1.734441
4       AverageTransactionHour  0.812690
5     MostCommonTransactionDay  0.265189
6   MostCommonTransactionMonth  0.162331
7             ActiveYearsCount  0.000000
8    MostCommonProductCategory  0.097455
9   UniqueProductCategoryCount  2.699505
10           MostCommonChannel  0.283153


#### IV Range Table

| IV Range   | Interpretation       |
| ---------- | -------------------- |
| < 0.02     | Not predictive       |
| 0.02 – 0.1 | Weak                 |
| 0.1 – 0.3  | Medium               |
| 0.3 – 0.5  | Strong               |
| > 0.5      | Suspicious / Leakage |

TransactionCount, TotalTransactionAmount, UniqueProductCategoryCount, and TransactionAmountSTD all have very high IV values, well above the 0.5 "Suspicious / Leakage" threshold. This is expected here: the target was created from RFM, and these features are strongly correlated with RFM.

In [80]:
# Final Features
# Keep every feature whose IV is strictly above the lower edge of the "Medium"
# band in the IV range table. No upper bound is applied, so features that fall
# in the "Suspicious / Leakage" band (> 0.5) are retained as well.
MIN_IV = 0.1

# Name-based, vectorised filter: does not depend on the column positions of the
# IV table and preserves the row order of `iv_df`.
final_features = iv_df.loc[iv_df["iv"] > MIN_IV, "feature"].tolist()

print(final_features)

['TransactionCount', 'TotalTransactionAmount', 'UniqueProductCategoryCount', 'TransactionAmountSTD', 'AverageTransactionHour', 'AverageTransactionAmount', 'MostCommonChannel', 'MostCommonTransactionDay', 'MostCommonTransactionMonth']


In [82]:
# Replace bins with WoE values
# NOTE(review): this rebinds `woe_df` (previously the raw candidate frame) to the
# WoE-encoded frame. Re-running the WoeTransformer construction cell after this
# one, without re-running the column-selection cell first, would feed encoded
# data back into the transformer.
woe_df = woe_transformer.transform_to_woe()

# CustomerId is re-attached positionally (`.values` discards the index), so the
# row counts must match. Assumes transform_to_woe() preserves row order — TODO confirm.
assert len(woe_df) == len(customer_id_series), (
    f"Row count mismatch: {len(woe_df)} WoE rows vs {len(customer_id_series)} customer ids"
)

# Use the Columns enum for the column name so it cannot drift from the name
# used in the selection below.
woe_df = woe_df.assign(**{Columns.CustomerId.value: customer_id_series.values})
print(woe_df.isna().sum())

# Feature selection
model_df = woe_df[[Columns.CustomerId.value] + final_features + [TARGET_COL]]

# Fail loudly instead of persisting a modelling table that contains gaps.
assert not model_df.isna().any().any(), "model_df contains null values"

# Final Data Ready for Modeling
print(tabulate(model_df.head(), headers="keys", tablefmt="grid"))

dm.save_to_csv(df=model_df, file_name=READY_TO_MODEL_DATA_FILE_NAME)

TotalTransactionAmount        0
AverageTransactionAmount      0
TransactionCount              0
TransactionAmountSTD          0
AverageTransactionHour        0
MostCommonTransactionDay      0
MostCommonTransactionMonth    0
ActiveYearsCount              0
MostCommonProductCategory     0
UniqueProductCategoryCount    0
MostCommonChannel             0
is_high_risk                  0
CustomerId                    0
dtype: int64
+----+-----------------+--------------------+--------------------------+------------------------------+------------------------+--------------------------+----------------------------+---------------------+----------------------------+------------------------------+----------------+
|    | CustomerId      |   TransactionCount |   TotalTransactionAmount |   UniqueProductCategoryCount |   TransactionAmountSTD |   AverageTransactionHour |   AverageTransactionAmount |   MostCommonChannel |   MostCommonTransactionDay |   MostCommonTransactionMonth |   is_high_risk |
|  