In [33]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import polars as pl
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns
import os
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, learning_curve, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

from utils.merge_tools import merge_n_case_ids

## Data Preprocessing

In [5]:
# Root folder of the local competition data. Set the CREDIT_RISK_DATA_ROOT
# environment variable to run the notebook on another machine; the default
# keeps the original location.
DATA_ROOT = os.environ.get('CREDIT_RISK_DATA_ROOT', 'D:/Kaggle_Credit_Risk_Predictions')

# Directory holding the aggregated '*grouped_*.parquet' files
data_dir = f'{DATA_ROOT}/new_aggs/new_aggs'
# Base table the case_ids are sampled from
base_file = f'{DATA_ROOT}/parquet_files/train/train_base.parquet'

In [13]:
def merge_n_case_ids(
    n_ids: int = 1000,
    data_dir: str = 'D:/Kaggle_Credit_Risk_Predictions/new_aggs/new_aggs',
    path_to_base: str = 'D:/Kaggle_Credit_Risk_Predictions/parquet_files/train/train_base.parquet',
    use_0: bool = True,
    as_pandas: bool = True,
    random_state: int = 28
) -> "pl.DataFrame | pd.DataFrame":
    '''
    Sample case_ids from the base file and merge onto them the matching rows
    of every aggregated parquet file found in data_dir.

    Parameters
    ----------
    n_ids : Number of case_ids to sample (without replacement) from the base file (int)
    data_dir : Path to processed parquet files directory (str)
    path_to_base : Path to base file (str)
    use_0 : If True merge the '*grouped_0.parquet' files, otherwise the
        '*grouped_rest.parquet' files (bool)
    as_pandas : Read, merge and return with pandas if True, with Polars otherwise (bool)
    random_state : Seed used for the case_id sampling (int)

    Returns
    -------
    The sampled base rows joined on 'case_id' with the columns of every
    matching file; a pandas DataFrame if as_pandas is True, else a Polars one.
    '''
    # Read the base dataframe and sample case_ids
    if as_pandas:
        base_df = pd.read_parquet(path_to_base)
        case_ids = base_df['case_id'].sample(n=n_ids, replace=False, random_state=random_state).tolist()
    else:
        base_df = pl.read_parquet(path_to_base)
        case_ids = base_df['case_id'].sample(n=n_ids, replace=False, seed=random_state).to_list()

    # Collect the files to merge. glob returns paths in arbitrary order, so
    # sort them to keep the column order of the result reproducible.
    file_pattern = '*grouped_0.parquet' if use_0 else '*grouped_rest.parquet'
    file_paths = sorted(glob(data_dir + '/' + file_pattern))

    if as_pandas:
        # Start from the sampled base rows and merge each file's sampled rows
        df = base_df[base_df['case_id'].isin(case_ids)]
        for path in file_paths:
            temp = pd.read_parquet(path)
            temp = temp[temp['case_id'].isin(case_ids)]
            df = pd.merge(df, temp, on='case_id', how='outer')
        return df

    df = base_df.filter(pl.col('case_id').is_in(case_ids))
    for path in file_paths:
        temp = pl.read_parquet(path)
        temp = temp.filter(pl.col('case_id').is_in(case_ids))
        # temp only holds sampled case_ids and every sampled case_id is already
        # in df, so a left join keeps the same rows as an outer join would.
        # NOTE(review): newer Polars releases do not coalesce the key column on
        # outer/full joins, which leaves a duplicate 'case_id_right' column on
        # the second iteration - verify against the installed version.
        df = df.join(temp, on='case_id', how='left')

    return df

In [14]:
# Merge the '*grouped_0.parquet' aggregates for a seeded sample of case_ids.
# The paths come from the shared data_dir / base_file constants so both merge
# calls in this notebook are guaranteed to read from the same location.
df = merge_n_case_ids(
    n_ids=1000,
    data_dir=data_dir,
    path_to_base=base_file,
    use_0=True,
    as_pandas=True,
    random_state=28
)

In [15]:
# Merge the '*grouped_rest.parquet' aggregates. n_ids and random_state match
# the call that built `df`, so the same case_ids are sampled from the base file.
df_rest = merge_n_case_ids(
    n_ids=1000,
    data_dir=data_dir,
    path_to_base=base_file,
    use_0=False,
    as_pandas=True,
    random_state=28
)

In [17]:
# Attach the 'rest' aggregates to the 'grouped_0' frame; columns present in
# both keep their name on the left and get an '_r' suffix on the right.
df = df.merge(df_rest, on='case_id', how='left', suffixes=("", "_r"))

In [18]:
# Preview the first rows of the merged frame
df.head()

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,actualdpd_943P_min,annuity_853A_min,byoccupationinc_3656910L_min,childnum_21L_min,credacc_actualbalance_314A_min,...,employername_160M_binary_7_r,employername_160M_binary_8_r,employername_160M_binary_9_r,employername_160M_binary_10_r,employername_160M_binary_11_r,employername_160M_binary_12_r,employername_160M_binary_13_r,employername_160M_binary_14_r,employername_160M_binary_15_r,employername_160M_binary_16_r
0,4927,2019-02-15,201902,6,1,,,,,,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
1,7449,2019-03-11,201903,9,0,,,,,,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,7786,2019-03-12,201903,10,0,0.0,1952.8,,,,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0
3,8275,2019-03-16,201903,10,0,,,,,,...,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0
4,10950,2019-04-09,201904,14,0,,,,,,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0


In [19]:
# Number of columns per dtype in the merged frame
df.dtypes.value_counts()

float64    2495
int8         99
object       30
uint32        8
int64         7
int32         2
Name: count, dtype: int64

In [21]:
# (rows, columns) of the merged frame
df.shape

(1000, 2641)

In [22]:
# Remove every column that holds nothing but missing values
df = df.dropna(axis='columns', how='all')

In [23]:
# (rows, columns) after the all-NaN columns were dropped
df.shape

(1000, 2524)

In [24]:
# Parse the decision date once; its day / month / year become the features
decision_ts = pd.to_datetime(df['date_decision'])

# Columns that are dropped after the date parts are extracted: the calendar
# columns, the join key and the '_r' duplicates from the second merge
redundant_cols = [
    'MONTH', 'MONTH_r', 'WEEK_NUM', 'WEEK_NUM_r',
    'date_decision', 'date_decision_r', 'case_id',
    'target_r',
]

df = df.assign(
    dec_day=decision_ts.dt.day,
    dec_month=decision_ts.dt.month,
    dec_year=decision_ts.dt.year,
).drop(columns=redundant_cols)

In [25]:
# Object-dtype columns (per the original note these are boolean columns that
# contain missing values - TODO confirm)
object_cols = df.select_dtypes(include=['O']).columns

# Drop the ones whose name contains 'min', which are treated as redundant here
na_bool_cols = [col for col in object_cols if 'min' in col]
df = df.drop(columns=na_bool_cols)

In [26]:
# Columns with at least one missing value, in their original order
has_na = df.isna().any(axis=0)
na_cols = has_na[has_na].index.to_list()
isna_cols = [f'{col}_isna' for col in na_cols]

# Append one boolean '<column>_isna' indicator per such column
df = pd.concat([df, df[na_cols].isna().add_suffix('_isna')], axis=1)

In [27]:
# Drop columns below threshold: keep a column only if at least a tenth of
# the rows (integer division) hold a non-missing value
min_non_null = len(df) // 10
df = df.dropna(axis='columns', thresh=min_non_null)

## Train Test Split

In [28]:
# Separate the label from the features (y stays a one-column DataFrame)
y = df[['target']]
X = df.drop(columns=['target'])

# Seeded 80/20 split, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=28
)

# Free memory held by the unsplit frames
del df
del X, y

In [29]:
# Number of training feature columns per dtype
X_train.dtypes.value_counts()

bool       2354
float64    1595
int8         99
object        8
uint32        8
int32         5
Name: count, dtype: int64

In [30]:
# Impute both splits with statistics learned from the training split only,
# so nothing from the test split leaks into the fill values.
num_cols = X_train.select_dtypes(exclude=['O', 'bool']).columns
cat_cols = X_train.select_dtypes(include=['O', 'bool']).columns

# Numeric columns: training median (computed once, on the numeric columns only)
train_medians = X_train[num_cols].median()
X_train[num_cols] = X_train[num_cols].fillna(train_medians)
X_test[num_cols] = X_test[num_cols].fillna(train_medians)

# Object / bool columns: training mode. DataFrame.mode() returns a frame with
# one row per tied mode; passing that frame to fillna aligns on row labels and
# would only fill rows labelled 0, 1, ... Take the first row so the value is
# matched by column name instead.
train_mode_rows = X_train[cat_cols].mode()
if not train_mode_rows.empty:
    train_modes = train_mode_rows.iloc[0]
    X_train[cat_cols] = X_train[cat_cols].fillna(train_modes)
    X_test[cat_cols] = X_test[cat_cols].fillna(train_modes)

In [31]:
# Columns that still contain missing values in either split. Checking the
# test split too matters: a column can be complete in X_train yet have NaN
# in X_test.
still_na = X_train.isna().any() | X_test.isna().any()
remain_na_cols = X_train.columns[still_na].to_list()

for col in remain_na_cols:
    train_modes = X_train[col].mode()
    if train_modes.empty:
        # Entirely missing in the training split: there is no value to impute with
        continue
    # Compute the fill value once, from the training split, and reuse it for both
    fill_value = train_modes.iloc[0]
    X_train[col] = X_train[col].fillna(fill_value)
    X_test[col] = X_test[col].fillna(fill_value)

## Handle imbalanced data

In [34]:
# Resample the training split with SMOTE (fixed seed); the test split is not
# passed in and therefore stays untouched.
sm = SMOTE(random_state=28)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)