In [35]:
import pandas as pd
import polars as pl
import os

from glob import glob

from utils.selection_tools import manual_balancing

## How-To for Manual Balancing with Random Sampling

In [2]:
# Relative path to the train_base parquet file that gets balanced below
base_file_path = '../data/raw/parquet_files/train/train_base.parquet'

In [3]:
# Randomly sample the base table and return a pandas frame with balanced `target` classes.
# NOTE(review): manual_balancing lives in utils.selection_tools; `random_state` is
# presumably the sampling seed that makes the draw repeatable — confirm there.
base_df = manual_balancing(base_file_path=base_file_path, random_state=28)

In [4]:
# Check the balancing: both `target` classes should report the same row count
base_df['target'].value_counts()

target
1    47994
0    47994
Name: count, dtype: int64

In [47]:
len(base_df)

95988

### Depth 1

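Now that the classes are perfectly balanced, we can merge the other files onto the base table. The merge is easier in polars, using a join with `how='outer_coalesce'`, so we first convert the pandas frame to a polars one.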

In [48]:
# Convert the balanced pandas frame to polars for the joins that follow.
# NOTE: despite the `_pdf` suffix, `base_pdf` is the polars DataFrame; `base_df` stays the pandas one.
base_pdf = pl.from_pandas(base_df)

In [51]:
data_dir = '../data/raw/parquet_files/train/'
file_group = 'train_debitcard_1'    # Depth 1

# glob returns matches in arbitrary order; sorting keeps the row order of the
# stacked frame the same from run to run.
file_paths = sorted(glob(os.path.join(data_dir, file_group) + '*'))
if not file_paths:
    # Fail here with a clear message instead of at the join with an empty frame
    raise FileNotFoundError(f'No files matching {file_group}* found in {data_dir}')

# The sampled case_ids do not change between files, so look them up once.
base_case_ids = base_pdf['case_id']

# Read each part, keep only rows for the sampled case_ids, and stack all parts
# with a single concat (concatenating inside the loop re-copies the accumulated
# rows on every iteration).
dc_df = pl.concat(
    [
        pl.read_parquet(path).filter(pl.col('case_id').is_in(base_case_ids))
        for path in file_paths
    ],
    how='vertical',    # We are adding rows
)

In [52]:
dc_df.shape

(8448, 6)

In [53]:
# Attach the debit card columns to the balanced base table, keeping a single `case_id` column.
# dc_df was already filtered to the sampled case_ids, so the right side adds no new case_ids.
# NOTE(review): newer polars releases deprecate how='outer_coalesce' in favour of
# how='full', coalesce=True — confirm against the installed version before upgrading.
df = base_pdf.join(dc_df, on='case_id', how='outer_coalesce')

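Certain rows will be populated with missing values because their `case_id` is not associated with any debit card data. Conversely, a `case_id` with several debit card records appears once per record, so the merged frame has more rows than `base_df` (see the shape checks below) and the classes are balanced per `case_id`, not necessarily per row.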

In [54]:
df.head()

case_id,date_decision,MONTH,WEEK_NUM,target,last180dayaveragebalance_704A,last180dayturnover_1134A,last30dayturnover_651A,num_group1,openingdate_857D
i64,str,i64,i64,i64,f64,f64,f64,i64,str
4,"""2019-01-04""",201901,0,1,,,,,
7,"""2019-01-03""",201901,0,0,,,,,
72,"""2019-01-03""",201901,0,0,,,,,
78,"""2019-01-03""",201901,0,0,,,,,
101,"""2019-01-03""",201901,0,1,,,,,


In [56]:
# Distinct case_ids after the join; compare with the row count of base_df
df['case_id'].n_unique()

95988

In [57]:
# The row count can exceed the number of distinct case_ids: the join emits one row
# per matching debit card record, so a base row is repeated for each of its records
df.shape

(98259, 10)

In [58]:
base_df.shape

(95988, 5)

### Depth 2

In [60]:
file_group = 'train_credit_bureau_a_2'    # Depth 2

# glob returns matches in arbitrary order; sorting keeps the row order of the
# stacked frame the same from run to run.
file_paths = sorted(glob(os.path.join(data_dir, file_group) + '*'))
if not file_paths:
    # Fail here with a clear message instead of at the join with an empty frame
    raise FileNotFoundError(f'No files matching {file_group}* found in {data_dir}')

# The sampled case_ids do not change between files, so look them up once.
base_case_ids = base_pdf['case_id']

# Read each part, keep only rows for the sampled case_ids, and stack all parts
# with a single concat. This group is large, and concatenating inside the loop
# would re-copy the accumulated rows on every iteration. Each unfiltered part
# is only referenced inside the comprehension, so no per-file temporary is left
# bound afterwards and there is nothing to `del`.
cb_df = pl.concat(
    [
        pl.read_parquet(path).filter(pl.col('case_id').is_in(base_case_ids))
        for path in file_paths
    ],
    how='vertical',    # We are adding rows
)

In [61]:
cb_df.shape

(11084656, 19)

In [62]:
# Attach the credit bureau columns to the balanced base table, keeping a single `case_id` column.
# cb_df was already filtered to the sampled case_ids, so the right side adds no new case_ids;
# each base row is repeated once per matching credit bureau record.
# NOTE(review): newer polars releases deprecate how='outer_coalesce' in favour of
# how='full', coalesce=True — confirm against the installed version before upgrading.
new_df = base_pdf.join(cb_df, on='case_id', how='outer_coalesce')

In [63]:
new_df.shape    # Added rows of case_ids with no credit reports

(11093601, 23)

In [64]:
new_df['case_id'].n_unique()    # Still the same number of case_ids

95988