## Model Building - Random Forest

In [35]:
import polars as pl
import pandas as pd
import lightgbm as lgb
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns
import os
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

## Extract 10000 case_id's for num_group = 0.

In [1]:
def merge_n_case_ids(
    n_ids: int = 5000,
    data_dir: str = '/kaggle/input/cr-newagg/new_aggs',
    path_to_base: str = '/kaggle/input/datasets-cr/train_base.parquet',
    use_0: bool = True,
    as_pandas: bool = True,
    random_state: int = 28
) -> pl.DataFrame | pd.DataFrame:
    '''
    Function to merge selected case_id from parquet files, returns subset case_id from the merged dataset.

    Parameters
    ----------
    n_ids : Number of case_ids to sample initially and return (int)
    data_dir : Path to processed parquet files directory (str)
    path_to_base : Path to base file (str)
    use_0 : Use num_group1 == 0 (bool)
    as_pandas : Return as pandas DataFrame
    random_seed : Random seed (int)
    '''
    # Read the base dataframe and sample case_ids
    if as_pandas:
        base_df = pd.read_parquet(path_to_base)
        case_ids = base_df['case_id'].sample(n=n_ids, replace=False, random_state=random_state).tolist()
    else:
        base_df = pl.read_parquet(path_to_base)
        case_ids = base_df['case_id'].sample(n=n_ids, replace=False, seed=random_state).to_list()

    # Define the file pattern for fetching files
    file_pattern = '*grouped_0.parquet' if use_0 else '*grouped_rest.parquet'
    file_paths = glob(data_dir + '/' + file_pattern)

    # Initialize the merged DataFrame filtering the base DataFrame
    if as_pandas:
        df = base_df[base_df['case_id'].isin(case_ids)]
    else:
        df = base_df.filter(pl.col('case_id').is_in(case_ids))

    # Merge DataFrames with only the selected case_ids
    for path in file_paths:
        if as_pandas:
            temp = pd.read_parquet(path)
            temp = temp[temp['case_id'].isin(case_ids)]
            df = pd.merge(df, temp, on='case_id', how='outer')
        else:
            temp = pl.read_parquet(path)
            temp = temp.filter(pl.col('case_id').is_in(case_ids))
            df = df.join(temp, on='case_id', how='outer')

    # Convert to pandas if required and using Polars
    if as_pandas and isinstance(df, pl.DataFrame):
        df = df.to_pandas()

    return df

In [2]:
df = merge_n_case_ids(
    n_ids=5000,
    data_dir='/kaggle/input/cr-newagg/new_aggs',
    path_to_base='/kaggle/input/datasets-cr/train_base.parquet',
    use_0=True,
    as_pandas=True,
    random_state=28  
)

In [7]:
df

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,amount_4527230A_min,amount_4527230A_max,amount_4527230A_mean,amount_4527230A_median,amount_4527230A_sum,...,last30dayturnover_651A_sum,openingdate_857D_min,openingdate_857D_max,openingdate_857D_distinct,openingdate_857D_min_year,openingdate_857D_min_month,openingdate_857D_min_day,openingdate_857D_max_year,openingdate_857D_max_month,openingdate_857D_max_day
0,956,2019-01-12,201901,1,0,,,,,,...,,,,,,,,,,
1,1004,2019-01-13,201901,1,0,,,,,,...,,,,,,,,,,
2,1043,2019-01-14,201901,1,0,,,,,,...,,,,,,,,,,
3,1135,2019-01-14,201901,1,0,,,,,,...,,,,,,,,,,
4,1152,2019-01-14,201901,1,0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,2702200,2020-09-29,202009,91,0,,,,,,...,,,,,,,,,,
4996,2702748,2020-10-01,202010,91,0,,,,,,...,0.0,2015-02-25,2015-02-25,1.0,2015.0,2.0,25.0,2015.0,2.0,25.0
4997,2702781,2020-10-02,202010,91,0,,,,,,...,,,,,,,,,,
4998,2703191,2020-10-05,202010,91,0,,,,,,...,,,,,,,,,,


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Columns: 1424 entries, case_id to openingdate_857D_max_day
dtypes: float64(1230), int32(2), int64(16), int8(99), object(69), uint32(8)
memory usage: 50.8+ MB


In [4]:
print(df.columns)

Index(['case_id', 'date_decision', 'MONTH', 'WEEK_NUM', 'target',
       'amount_4527230A_min', 'amount_4527230A_max', 'amount_4527230A_mean',
       'amount_4527230A_median', 'amount_4527230A_sum',
       ...
       'last30dayturnover_651A_sum', 'openingdate_857D_min',
       'openingdate_857D_max', 'openingdate_857D_distinct',
       'openingdate_857D_min_year', 'openingdate_857D_min_month',
       'openingdate_857D_min_day', 'openingdate_857D_max_year',
       'openingdate_857D_max_month', 'openingdate_857D_max_day'],
      dtype='object', length=1424)


In [20]:
# Get columns with data type 'object'
object_cols = df.select_dtypes(include=['object']).columns.tolist()

# Print the list of object columns
print("Object type columns:", object_cols)

Object type columns: ['date_decision', 'recorddate_4527225D_min', 'recorddate_4527225D_max', 'dateofcredend_289D_min', 'dateofcredend_353D_min', 'dateofcredstart_181D_min', 'dateofcredstart_739D_min', 'dateofrealrepmt_138D_min', 'lastupdate_1112D_min', 'lastupdate_388D_min', 'numberofoverdueinstlmaxdat_148D_min', 'numberofoverdueinstlmaxdat_641D_min', 'overdueamountmax2date_1002D_min', 'overdueamountmax2date_1142D_min', 'refreshdate_3813885D_min', 'dateofcredend_289D_max', 'dateofcredend_353D_max', 'dateofcredstart_181D_max', 'dateofcredstart_739D_max', 'dateofrealrepmt_138D_max', 'lastupdate_1112D_max', 'lastupdate_388D_max', 'numberofoverdueinstlmaxdat_148D_max', 'numberofoverdueinstlmaxdat_641D_max', 'overdueamountmax2date_1002D_max', 'overdueamountmax2date_1142D_max', 'refreshdate_3813885D_max', 'contractenddate_991D_min', 'openingdate_313D_min', 'contractenddate_991D_max', 'openingdate_313D_max', 'pmts_date_1107D_min', 'pmts_date_1107D_max', 'approvaldate_319D_min', 'creationdate_

In [22]:
object_cols = ['recorddate_4527225D_min', 'recorddate_4527225D_max', 'dateofcredend_289D_min', 'dateofcredend_353D_min', 'dateofcredstart_181D_min', 'dateofcredstart_739D_min', 
               'dateofrealrepmt_138D_min', 'lastupdate_1112D_min', 
               'lastupdate_388D_min', 'numberofoverdueinstlmaxdat_148D_min', 'numberofoverdueinstlmaxdat_641D_min', 'overdueamountmax2date_1002D_min', 'overdueamountmax2date_1142D_min', 'refreshdate_3813885D_min', 
               'dateofcredend_289D_max', 'dateofcredend_353D_max', 'dateofcredstart_181D_max', 'dateofcredstart_739D_max', 'dateofrealrepmt_138D_max', 'lastupdate_1112D_max', 'lastupdate_388D_max', 
               'numberofoverdueinstlmaxdat_148D_max', 'numberofoverdueinstlmaxdat_641D_max', 'overdueamountmax2date_1002D_max', 'overdueamountmax2date_1142D_max', 'refreshdate_3813885D_max', 
               'contractenddate_991D_min', 'openingdate_313D_min', 
               'contractenddate_991D_max', 'openingdate_313D_max', 'pmts_date_1107D_min', 'pmts_date_1107D_max', 'approvaldate_319D_min', 'creationdate_885D_min', 
               'dateactivated_425D_min', 'dtlastpmt_581D_min', 'dtlastpmtallstes_3545839D_min', 'employedfrom_700D_min', 'firstnonzeroinstldate_307D_min', 'approvaldate_319D_max', 
               'creationdate_885D_max', 'dateactivated_425D_max', 'dtlastpmt_581D_max', 'dtlastpmtallstes_3545839D_max', 'employedfrom_700D_max', 'firstnonzeroinstldate_307D_max', 
               'empls_employedfrom_796D_min', 'empls_employedfrom_796D_max', 'processingdate_168D_min', 'processingdate_168D_max', 'contractdate_551D_min', 'contractmaturitydate_151D_min', 
               'lastupdate_260D_min', 'contractdate_551D_max', 'contractmaturitydate_151D_max', 'lastupdate_260D_max', 'birth_259D_min', 'birthdate_87D_min', 'empl_employedfrom_271D_min', 
               'birth_259D_max', 'birthdate_87D_max', 'empl_employedfrom_271D_max', 'deductiondate_4917603D_min', 'deductiondate_4917603D_max', 'openingdate_857D_min', 'openingdate_857D_max']

# Drop the object columns from the DataFrame
df.drop(columns=object_cols, inplace=True)

# Print the new DataFrame information to confirm columns are deleted
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Columns: 1358 entries, case_id to openingdate_857D_max_day
dtypes: float64(1230), int32(2), int64(18), int8(99), object(1), uint32(8)
memory usage: 48.3+ MB
None


In [23]:
print(df.columns.tolist())

['case_id', 'date_decision', 'MONTH', 'WEEK_NUM', 'target', 'amount_4527230A_min', 'amount_4527230A_max', 'amount_4527230A_mean', 'amount_4527230A_median', 'amount_4527230A_sum', 'recorddate_4527225D_distinct', 'recorddate_4527225D_min_year', 'recorddate_4527225D_min_month', 'recorddate_4527225D_min_day', 'recorddate_4527225D_max_year', 'recorddate_4527225D_max_month', 'recorddate_4527225D_max_day', 'name_4527232M_freq', 'name_4527232M_binary_0', 'name_4527232M_binary_1', 'name_4527232M_binary_2', 'name_4527232M_binary_3', 'name_4527232M_binary_4', 'name_4527232M_binary_5', 'name_4527232M_binary_6', 'name_4527232M_binary_7', 'name_4527232M_binary_8', 'name_4527232M_binary_9', 'name_4527232M_binary_10', 'name_4527232M_binary_11', 'name_4527232M_binary_12', 'name_4527232M_binary_13', 'name_4527232M_binary_14', 'name_4527232M_binary_15', 'name_4527232M_binary_16', 'annualeffectiverate_199L_min', 'annualeffectiverate_63L_min', 'contractsum_5085717L_min', 'credlmt_230A_min', 'credlmt_935A_m

In [25]:
# Convert 'date_decision' from object (string) to 'datetime64[ns]'
df['date_decision'] = pd.to_datetime(df['date_decision'])

# Extract day, month, and year into separate columns with specific names
df['date_decision_year'] = df['date_decision'].dt.year
df['date_decision_month'] = df['date_decision'].dt.month
df['date_decision_day'] = df['date_decision'].dt.day

# Display the DataFrame to see the changes
print(df)

      case_id date_decision   MONTH  WEEK_NUM  target  amount_4527230A_min  \
0         956    2019-01-12  201901         1       0                  NaN   
1        1004    2019-01-13  201901         1       0                  NaN   
2        1043    2019-01-14  201901         1       0                  NaN   
3        1135    2019-01-14  201901         1       0                  NaN   
4        1152    2019-01-14  201901         1       0                  NaN   
...       ...           ...     ...       ...     ...                  ...   
4995  2702200    2020-09-29  202009        91       0                  NaN   
4996  2702748    2020-10-01  202010        91       0                  NaN   
4997  2702781    2020-10-02  202010        91       0                  NaN   
4998  2703191    2020-10-05  202010        91       0                  NaN   
4999  2703348    2020-10-05  202010        91       0                  NaN   

      amount_4527230A_max  amount_4527230A_mean  amount_4527230

In [27]:
df.drop(columns='date_decision', inplace=True)


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Columns: 1360 entries, case_id to date_decision_day
dtypes: float64(1230), int32(5), int64(18), int8(99), uint32(8)
memory usage: 48.3 MB


## Model Building

In [30]:
# Count the occurrences of each unique value in 'target'
value_counts = df['target'].value_counts().reset_index()
value_counts.columns = ['target', 'count']

# Calculate the total number of rows in the DataFrame
total_count = df.shape[0]  # This returns the number of rows

# Calculate the proportion of each 'target' value
value_counts['proportion'] = value_counts['count'] / total_count

# Print the result
print(value_counts)

   target  count  proportion
0       0   4834      0.9668
1       1    166      0.0332


The ratio of fraud to non-fraud observations is very low. Need to use techniques such as oversampling the minority class, undersampling the majority class, or using synthetic data generation methods like SMOTE (Synthetic Minority Over-sampling Technique) to balance the classes before model training.

In [33]:
# Separate features and target
X = df.drop('target', axis=1)
y = df['target'] 

# You can check the content of X and y to confirm
print(X.head()) 
print(y.head())

   case_id   MONTH  WEEK_NUM  amount_4527230A_min  amount_4527230A_max  \
0      956  201901         1                  NaN                  NaN   
1     1004  201901         1                  NaN                  NaN   
2     1043  201901         1                  NaN                  NaN   
3     1135  201901         1                  NaN                  NaN   
4     1152  201901         1                  NaN                  NaN   

   amount_4527230A_mean  amount_4527230A_median  amount_4527230A_sum  \
0                   NaN                     NaN                  NaN   
1                   NaN                     NaN                  NaN   
2                   NaN                     NaN                  NaN   
3                   NaN                     NaN                  NaN   
4                   NaN                     NaN                  NaN   

   recorddate_4527225D_distinct  recorddate_4527225D_min_year  ...  \
0                           NaN                     

In [36]:
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

ValueError: Input X contains NaN.
SMOTE does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Original data count
original_counts = y_pd.value_counts()

# Resampled data count
resampled_counts = pd.Series(y_res).value_counts()

# Plotting the results
fig, ax = plt.subplots(1, 2, figsize=(16, 7), sharey=True)

ax[0].bar(original_counts.index.astype(str), original_counts.values, color=['blue', 'red'])
ax[0].set_title('Original Data Distribution')
ax[0].set_ylabel('Number of Instances')
ax[0].set_xlabel('Target Class')

ax[1].bar(resampled_counts.index.astype(str), resampled_counts.values, color=['blue', 'red'])
ax[1].set_title('SMOTE Data Distribution')
ax[1].set_xlabel('Target Class')

plt.suptitle('Class Distribution Before and After SMOTE')
plt.tight_layout()
plt.show()

In [7]:
case_ids_train, case_ids_val = train_test_split(case_ids, train_size=0.8, random_state=28)
X_train, y_train = case_ids_to_df(df, case_ids_train)
X_val, y_val = case_ids_to_df(df, case_ids_val)

assert X_train.shape[0] + X_val.shape[0] == df.shape[0]