### Preprocessing for Date columns

In [1]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
import polars as pl

In [30]:
# Read the Parquet file at the path below into a Polars DataFrame.
applprev_path = "new_aggs/new_aggs/train_applprev_1_grouped_0.parquet"
df2 = pl.read_parquet(applprev_path)

In [31]:
# Preview the first five rows (the default row count, written out explicitly).
df2.head(n=5)

case_id,actualdpd_943P,annuity_853A,approvaldate_319D,byoccupationinc_3656910L,cancelreason_3545846M,childnum_21L,creationdate_885D,credacc_actualbalance_314A,credacc_credlmt_575A,credacc_maxhisbal_375A,credacc_minhisbal_90A,credacc_status_367L,credacc_transactions_402L,credamount_590A,credtype_587L,currdebt_94A,dateactivated_425D,district_544M,downpmt_134A,dtlastpmt_581D,dtlastpmtallstes_3545839D,education_1138M,employedfrom_700D,familystate_726L,firstnonzeroinstldate_307D,inittransactioncode_279L,isbidproduct_390L,isdebitcard_527L,mainoccupationinc_437A,maxdpdtolerance_577P,num_group1,outstandingdebt_522A,pmtnum_8L,postype_4733339M,profession_152M,rejectreason_755M,rejectreasonclient_4145042M,revolvingaccount_394A,status_219L,tenor_203L
i64,f64,f64,str,f64,str,f64,str,f64,f64,f64,f64,str,f64,f64,str,f64,str,str,f64,str,str,str,str,str,str,str,bool,bool,f64,f64,i64,f64,f64,str,str,str,str,f64,str,f64
2,0.0,640.2,,,"""a55475b1""",0.0,"""2013-04-03""",,0.0,,,,,10000.0,"""CAL""",,,"""P136_108_173""",0.0,,,"""P97_36_170""","""2010-02-15""","""SINGLE""","""2013-05-04""","""CASH""",False,,8200.0,,0,,24.0,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""D""",24.0
2,0.0,1682.4,,,"""a55475b1""",0.0,"""2013-04-03""",,0.0,,,,,16000.0,"""CAL""",,,"""P136_108_173""",0.0,,,"""P97_36_170""","""2010-02-15""","""SINGLE""","""2013-05-04""","""CASH""",False,,8200.0,,1,,12.0,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""D""",12.0
3,0.0,6140.0,,,"""P94_109_143""",,"""2019-01-07""",,0.0,,,,,59999.8,"""CAL""",,,"""P131_33_167""",0.0,,,"""P97_36_170""","""2018-05-15""","""MARRIED""","""2019-02-07""","""CASH""",False,,11000.0,,0,,12.0,"""a55475b1""","""a55475b1""","""P94_109_143""","""a55475b1""",,"""D""",12.0
4,0.0,2556.6,,,"""P24_27_36""",,"""2019-01-08""",,0.0,,,,,40000.0,"""CAL""",,,"""P194_82_174""",0.0,,,"""a55475b1""",,,"""2019-02-08""","""CASH""",False,,16000.0,,0,,24.0,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""T""",24.0
5,0.0,,,,"""P85_114_140""",,"""2019-01-16""",,,,,,,,,,,"""P54_133_26""",,,,"""a55475b1""",,,,,False,,62000.0,,0,,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""T""",


In [32]:
# Columns stored as "YYYY-MM-DD" strings that should be parsed into dates.
date_columns = [
    'approvaldate_319D',
    'creationdate_885D',
    'dateactivated_425D',
    'dtlastpmt_581D',
    'dtlastpmtallstes_3545839D',
    'employedfrom_700D',
    'firstnonzeroinstldate_307D',
]

# Parse all listed columns with a single multi-column expression. Each result
# keeps its source column's name, so it replaces the string version in place.
df2 = df2.with_columns(pl.col(date_columns).str.strptime(pl.Date, "%Y-%m-%d"))

# Print the schema to confirm the listed columns now have the Date dtype.
print(df2.schema)

OrderedDict([('case_id', Int64), ('actualdpd_943P', Float64), ('annuity_853A', Float64), ('approvaldate_319D', Date), ('byoccupationinc_3656910L', Float64), ('cancelreason_3545846M', String), ('childnum_21L', Float64), ('creationdate_885D', Date), ('credacc_actualbalance_314A', Float64), ('credacc_credlmt_575A', Float64), ('credacc_maxhisbal_375A', Float64), ('credacc_minhisbal_90A', Float64), ('credacc_status_367L', String), ('credacc_transactions_402L', Float64), ('credamount_590A', Float64), ('credtype_587L', String), ('currdebt_94A', Float64), ('dateactivated_425D', Date), ('district_544M', String), ('downpmt_134A', Float64), ('dtlastpmt_581D', Date), ('dtlastpmtallstes_3545839D', Date), ('education_1138M', String), ('employedfrom_700D', Date), ('familystate_726L', String), ('firstnonzeroinstldate_307D', Date), ('inittransactioncode_279L', String), ('isbidproduct_390L', Boolean), ('isdebitcard_527L', Boolean), ('mainoccupationinc_437A', Float64), ('maxdpdtolerance_577P', Float64), 

### For each date column, split the day, month and year into separate columns

In [33]:
# Add <column>_year, <column>_month and <column>_day for every date column.
# New columns are grouped per source column, in the order year, month, day.
df2 = df2.with_columns(
    [
        # Look up the .dt accessor method by name (dt.year / dt.month / dt.day).
        getattr(pl.col(date_col).dt, part)().alias(f"{date_col}_{part}")
        for date_col in date_columns
        for part in ("year", "month", "day")
    ]
)

In [34]:
# Remove the original date columns, passing each name as a separate argument.
df2 = df2.drop(*date_columns)

In [35]:
# Show the current column names and dtypes to verify the result.
remaining_schema = df2.schema
print(remaining_schema)

OrderedDict([('case_id', Int64), ('actualdpd_943P', Float64), ('annuity_853A', Float64), ('byoccupationinc_3656910L', Float64), ('cancelreason_3545846M', String), ('childnum_21L', Float64), ('credacc_actualbalance_314A', Float64), ('credacc_credlmt_575A', Float64), ('credacc_maxhisbal_375A', Float64), ('credacc_minhisbal_90A', Float64), ('credacc_status_367L', String), ('credacc_transactions_402L', Float64), ('credamount_590A', Float64), ('credtype_587L', String), ('currdebt_94A', Float64), ('district_544M', String), ('downpmt_134A', Float64), ('education_1138M', String), ('familystate_726L', String), ('inittransactioncode_279L', String), ('isbidproduct_390L', Boolean), ('isdebitcard_527L', Boolean), ('mainoccupationinc_437A', Float64), ('maxdpdtolerance_577P', Float64), ('num_group1', Int64), ('outstandingdebt_522A', Float64), ('pmtnum_8L', Float64), ('postype_4733339M', String), ('profession_152M', String), ('rejectreason_755M', String), ('rejectreasonclient_4145042M', String), ('rev