In [15]:
import pandas as pd
import os

# Change the working directory to the project root
os.chdir('/Users/dustinhayes/Desktop/GitHub/stable-credit-risk-modeling')

from pathlib import Path
from DataProcessing.pipeline import Pipeline

import polars as pl
from numpy import sqrt

import global_config
from global_config import data_store, CORE_COLUMNS, FEATURE_REPORT_PATH



In [21]:
feature_mapping_path = FEATURE_REPORT_PATH / Path("FeatureMapping.csv")
num_report_path = FEATURE_REPORT_PATH / Path("NumericalReport.csv")
cat_report_path = FEATURE_REPORT_PATH / Path("CategoricalReport.csv")

feature_mapping_df = pd.read_csv(feature_mapping_path)
num_report = pd.read_csv(num_report_path)
cat_report = pd.read_csv(cat_report_path)

base_df = pl.read_parquet(data_store['df_base'])

initial_feature_list = feature_mapping_df["Feature"].to_list()


17
6


There are many features in this problem. Let's try and filter down to a lower number based on the following criterion:

- Features with a high number of nulls should be removed
- Features with unsuitable cardinality should be removed
- Features that bad metrics according to feature report eg bad chi2, low information val, low variance

In [3]:
def retrieve_feature(feature):
    depth = feature_mapping_df.loc[feature_mapping_df["Feature"] == feature, "Depth"].iloc[0]
    source = feature_mapping_df.loc[feature_mapping_df["Feature"] == feature, "Source"].iloc[0]
    read_function = Pipeline.read_files if "*" in source else Pipeline.read_file
    df = read_function(source, col_set=[feature, 'case_id']).pipe(Pipeline.apply_pipeline, base_df=base_df, depth=depth)
    if not [x for x in df.columns if x not in global_config.CORE_COLUMNS]:
        print(f"Skipping {feature} due to NA ratio.") # TODO: Make more robust by checking NA ratio against feature report
        return None # Dropped by read_function due to NA ratio
    df = df.to_pandas()
    return df

In [12]:
print(num_report.columns)
print(cat_report.columns)

Index(['Unnamed: 0', 'PctNA', 'mean', 'mean_no_default', 'mean_default',
       'median', 'median_no_default', 'median_default', 'min', 'max',
       'variance', 'variance_no_default', 'variance_default', 'Skewedness',
       'Kurtosis'],
      dtype='object')
Index(['Unnamed: 0', 'NumUnique', 'PctNA', 'IV', 'Significant', 'P-val',
       'Chi2'],
      dtype='object')


In [23]:
def remove_due_to_nullness(features, threshold):
    cols = ['Unnamed: 0', 'PctNA'] # TODO: Fix Unnamed: 0 - that is the col that holds feature names
    na_report = pd.concat([num_report[cols], cat_report[cols]])
    to_remove = na_report.loc[na_report['PctNA'] > threshold, 'Unnamed: 0'].to_list()
    print(f"Removing columns: {to_remove}")
    print(f"Removed {len(to_remove)} features.")
    filtered_features = [x for x in features if x not in to_remove]
    return filtered_features

def remove_due_to_signal_to_noise(features, threshold):
    """SN_ratio = abs(mean_1 - mean_2)/sqr(var_1 + var_2)"""
    cols = ['Unnamed: 0', 'mean_default', 'mean_no_default', 'variance_default', 'variance_no_default']
    sn_df = num_report[cols]
    sn_df["Signal_to_Noise"] = (
        abs(sn_df['mean_default'] - sn_df['mean_no_default'])/(
            sqrt(sn_df['variance_default'] + sn_df['variance_no_default']) + 10e-9
        )
    )
    print(sn_df["Signal_to_Noise"])

print(len(num_report))
print(len(cat_report))
remove_due_to_signal_to_noise(1,1)

17
6
0     2.378173e-01
1     2.166494e-01
2     2.522292e-01
3     2.498141e-01
4     2.159801e-01
5     2.216460e-01
6     2.482547e-01
7     1.333047e-01
8     1.124712e-01
9     2.216460e-01
10    1.780736e-01
11    2.209667e-01
12    4.000853e-02
13    4.436361e-02
14    1.079109e-01
15    9.225718e-02
16    1.000000e+08
Name: Signal_to_Noise, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sn_df["Signal_to_Noise"] = (
