### Data Preprocessing

This section deals with the categorical features and tries out several transformations on the previous application 1 file (`train_applprev_1_grouped_0.parquet`), which is grouped by `num_group1=0`.


**One-hot encoding:** It is used when the categorical feature is not ordinal (i.e., the categories do not have any order) and when the number of categorical values is low enough to handle the increase in feature space.

**Frequency encoding:** Frequency encoding replaces each category with the frequency or count of that category in the dataset. This method groups the categories by their frequencies.

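**Label encoding:** Label encoding replaces each category with an integer code, so the feature stays a single column. It is effective when dealing with a high number of categories where one-hot encoding might lead to memory issues; note that the integer codes do not reflect any meaningful order between categories. (The case where the frequency of categories is important for the prediction is what frequency encoding, described above, addresses.)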

In [9]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
import polars as pl

In [10]:
# Load the applprev_1 parquet file as a pandas DataFrame; df1 is the frame that the
# preprocessing cells below modify.
df1 = pd.read_parquet("new_aggs/new_aggs/train_applprev_1_grouped_0.parquet")

In [14]:
# Load the same Parquet file that df1 was read from into a Polars DataFrame.
# In the cells visible here df2 is only used for the preview that follows.
df2 = pl.read_parquet("new_aggs/new_aggs/train_applprev_1_grouped_0.parquet")

In [15]:
# Preview the first rows of the Polars frame together with the dtype of every column.
df2.head()

case_id,actualdpd_943P,annuity_853A,approvaldate_319D,byoccupationinc_3656910L,cancelreason_3545846M,childnum_21L,creationdate_885D,credacc_actualbalance_314A,credacc_credlmt_575A,credacc_maxhisbal_375A,credacc_minhisbal_90A,credacc_status_367L,credacc_transactions_402L,credamount_590A,credtype_587L,currdebt_94A,dateactivated_425D,district_544M,downpmt_134A,dtlastpmt_581D,dtlastpmtallstes_3545839D,education_1138M,employedfrom_700D,familystate_726L,firstnonzeroinstldate_307D,inittransactioncode_279L,isbidproduct_390L,isdebitcard_527L,mainoccupationinc_437A,maxdpdtolerance_577P,num_group1,outstandingdebt_522A,pmtnum_8L,postype_4733339M,profession_152M,rejectreason_755M,rejectreasonclient_4145042M,revolvingaccount_394A,status_219L,tenor_203L
i64,f64,f64,str,f64,str,f64,str,f64,f64,f64,f64,str,f64,f64,str,f64,str,str,f64,str,str,str,str,str,str,str,bool,bool,f64,f64,i64,f64,f64,str,str,str,str,f64,str,f64
2,0.0,640.2,,,"""a55475b1""",0.0,"""2013-04-03""",,0.0,,,,,10000.0,"""CAL""",,,"""P136_108_173""",0.0,,,"""P97_36_170""","""2010-02-15""","""SINGLE""","""2013-05-04""","""CASH""",False,,8200.0,,0,,24.0,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""D""",24.0
2,0.0,1682.4,,,"""a55475b1""",0.0,"""2013-04-03""",,0.0,,,,,16000.0,"""CAL""",,,"""P136_108_173""",0.0,,,"""P97_36_170""","""2010-02-15""","""SINGLE""","""2013-05-04""","""CASH""",False,,8200.0,,1,,12.0,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""D""",12.0
3,0.0,6140.0,,,"""P94_109_143""",,"""2019-01-07""",,0.0,,,,,59999.8,"""CAL""",,,"""P131_33_167""",0.0,,,"""P97_36_170""","""2018-05-15""","""MARRIED""","""2019-02-07""","""CASH""",False,,11000.0,,0,,12.0,"""a55475b1""","""a55475b1""","""P94_109_143""","""a55475b1""",,"""D""",12.0
4,0.0,2556.6,,,"""P24_27_36""",,"""2019-01-08""",,0.0,,,,,40000.0,"""CAL""",,,"""P194_82_174""",0.0,,,"""a55475b1""",,,"""2019-02-08""","""CASH""",False,,16000.0,,0,,24.0,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""T""",24.0
5,0.0,,,,"""P85_114_140""",,"""2019-01-16""",,,,,,,,,,,"""P54_133_26""",,,,"""a55475b1""",,,,,False,,62000.0,,0,,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""T""",


In [4]:
# (rows, columns) of the pandas frame.
df1.shape

(6525979, 41)

### Convert the date columns to the datetime dtype, then split each one into separate year, month and day columns

In [5]:
# Columns that hold dates as text and must be parsed into datetimes.
date_columns = [
    'approvaldate_319D',
    'creationdate_885D',
    'dateactivated_425D',
    'dtlastpmt_581D',
    'dtlastpmtallstes_3545839D',
    'employedfrom_700D',
    'firstnonzeroinstldate_307D',
]

# Parse every listed column with pd.to_datetime, one column at a time.
df1[date_columns] = df1[date_columns].apply(pd.to_datetime)

# Show the resulting dtype of each date column to confirm the conversion.
print(df1.dtypes[date_columns])

approvaldate_319D             datetime64[ns]
creationdate_885D             datetime64[ns]
dateactivated_425D            datetime64[ns]
dtlastpmt_581D                datetime64[ns]
dtlastpmtallstes_3545839D     datetime64[ns]
employedfrom_700D             datetime64[ns]
firstnonzeroinstldate_307D    datetime64[ns]
dtype: object


In [6]:
# Date columns that get decomposed into numeric parts.
date_features = [
    'approvaldate_319D',
    'creationdate_885D',
    'dateactivated_425D',
    'dtlastpmt_581D',
    'dtlastpmtallstes_3545839D',
    'employedfrom_700D',
    'firstnonzeroinstldate_307D',
]

# For each date column add "<name>_year", "<name>_month" and "<name>_day" columns.
for date_col in date_features:
    parsed = pd.to_datetime(df1[date_col])
    df1[date_col] = parsed
    for part in ('year', 'month', 'day'):
        df1[date_col + '_' + part] = getattr(parsed.dt, part)

# Remove every datetime-typed column now that its parts live in their own columns.
datetime_cols = df1.select_dtypes(include=['datetime']).columns
df1.drop(columns=datetime_cols, inplace=True)


### Identify Categorical Columns

In [7]:
# Columns stored with the "object" dtype are treated as the categorical features.
categorical_cols = list(df1.select_dtypes(include='object').columns)
categorical_cols

['cancelreason_3545846M',
 'credacc_status_367L',
 'credtype_587L',
 'district_544M',
 'education_1138M',
 'familystate_726L',
 'inittransactioncode_279L',
 'isbidproduct_390L',
 'isdebitcard_527L',
 'postype_4733339M',
 'profession_152M',
 'rejectreason_755M',
 'rejectreasonclient_4145042M',
 'status_219L']

### One-Hot Encoding for Low Cardinality Features

In [8]:
# One-hot encode the low-cardinality categorical columns of df1 and swap them in
# for the original columns. df1 is rebound to the widened frame.
low_card_features = [
    'credacc_status_367L', 'credtype_587L', 'isbidproduct_390L',
    'isdebitcard_527L', 'status_219L', 'education_1138M',
    'familystate_726L', 'inittransactioncode_279L', 'postype_4733339M'
]
onehot_encoder = OneHotEncoder(sparse_output=False)

# Fit and transform; with sparse_output=False the result is a dense array.
onehot_encoded_data = onehot_encoder.fit_transform(df1[low_card_features])

# Wrap the encoded array in a DataFrame. The array carries no row labels, so the
# frame is given df1's own index explicitly: pd.concat(axis=1) aligns rows on index
# labels, and a fresh 0..n-1 index would misalign rows (and introduce NaN rows)
# whenever df1's index is not the default range.
onehot_encoded_df = pd.DataFrame(
    onehot_encoded_data,
    columns=onehot_encoder.get_feature_names_out(low_card_features),
    index=df1.index,
)

# Update df1 by dropping the original columns and concatenating the new one-hot encoded columns
df1 = pd.concat([df1.drop(columns=low_card_features), onehot_encoded_df], axis=1)


### Frequency Encoding for Medium Cardinality Features

In [None]:
# Frequency encoding: replace each category by the number of rows it appears in.
for freq_col in ('district_544M', 'profession_152M'):
    df1[freq_col] = df1[freq_col].map(df1[freq_col].value_counts())

### Label Encoding for High Cardinality Features

In [None]:
# Label-encode each high-cardinality column with its own LabelEncoder.
# A single shared encoder would be refit on every column, so only the last column's
# category-to-code mapping would survive. Keeping one fitted encoder per column in
# `label_encoders` allows the same codes to be applied to other data or inverted later.
high_card_features = ['cancelreason_3545846M', 'rejectreason_755M', 'rejectreasonclient_4145042M']
label_encoders = {}
for column in high_card_features:
    label_encoder = LabelEncoder()
    df1[column] = label_encoder.fit_transform(df1[column])
    label_encoders[column] = label_encoder
# After the loop `label_encoder` is the encoder fitted on the last column, which matches
# the state the shared encoder was left in previously.


In [None]:
# Check data types to ensure no column is left as object due to categorical data.
# df1.info() lists every column with its dtype, so any remaining "object" entry
# points at a categorical column that was not encoded.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6525979 entries, 0 to 6525978
Data columns (total 100 columns):
 #   Column                                Dtype  
---  ------                                -----  
 0   case_id                               int64  
 1   actualdpd_943P                        float64
 2   annuity_853A                          float64
 3   byoccupationinc_3656910L              float64
 4   cancelreason_3545846M                 int32  
 5   childnum_21L                          float64
 6   credacc_actualbalance_314A            float64
 7   credacc_credlmt_575A                  float64
 8   credacc_maxhisbal_375A                float64
 9   credacc_minhisbal_90A                 float64
 10  credacc_transactions_402L             float64
 11  credamount_590A                       float64
 12  currdebt_94A                          float64
 13  district_544M                         int64  
 14  downpmt_134A                          float64
 15  mainoccupation

In [None]:
# (rows, columns) of df1 at this point in the notebook.
df1.shape

(6525979, 100)

In [None]:
# Preview the first rows of df1 to inspect the columns as they stand now.
df1.head()

Unnamed: 0,case_id,actualdpd_943P,annuity_853A,byoccupationinc_3656910L,cancelreason_3545846M,childnum_21L,credacc_actualbalance_314A,credacc_credlmt_575A,credacc_maxhisbal_375A,credacc_minhisbal_90A,...,inittransactioncode_279L_None,postype_4733339M_P140_48_169,postype_4733339M_P149_40_170,postype_4733339M_P169_115_83,postype_4733339M_P177_117_192,postype_4733339M_P217_110_186,postype_4733339M_P46_145_78,postype_4733339M_P60_146_156,postype_4733339M_P67_102_161,postype_4733339M_a55475b1
0,2,0.0,640.2,,75,0.0,,0.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,0.0,1682.4,,75,0.0,,0.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,0.0,6140.0,,69,,,0.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,0.0,2556.6,,42,,,0.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,0.0,,,65,,,,,,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
