In [1]:
import polars as pl
import os

train_applprev_1_0 = pl.read_csv("Data/csv_files/train/train_applprev_1_0.csv")
train_applprev_1_1 = pl.read_csv("Data/csv_files/train/train_applprev_1_1.csv")

In [2]:
train_applprev_1 = pl.concat([train_applprev_1_0, train_applprev_1_1])

directory = r"C:\Users\afise\.git\CreditRiskModel\Merged_Data"
filename = "train_applprev_1.csv"
file_path = os.path.join(directory, filename)

In [3]:
unique_case_id_count = train_applprev_1.select(pl.col("case_id").unique().count())
print(unique_case_id_count)

shape: (1, 1)
┌─────────┐
│ case_id │
│ ---     │
│ u32     │
╞═════════╡
│ 1221522 │
└─────────┘


In [4]:
train_applprev_1.shape

(6525979, 41)

In [5]:
df_joined = train_applprev_1.lazy()

In [6]:
def try_parse_date(col, fmt1, fmt2):
    date1 = col.str.strptime(pl.Date, fmt1, strict=False)
    date2 = col.str.strptime(pl.Date, fmt2, strict=False)
    return pl.when(date1.is_not_null()).then(date1).otherwise(date2)

df_joined = (
    df_joined
    .with_columns([
        try_parse_date(pl.col("approvaldate_319D"), "%m/%d/%Y", "%Y-%m-%d").alias("approvaldate_319D"),
        try_parse_date(pl.col("dateactivated_425D"), "%m/%d/%Y", "%Y-%m-%d").alias("dateactivated_425D"),
        try_parse_date(pl.col("creationdate_885D"), "%m/%d/%Y", "%Y-%m-%d").alias("creationdate_885D"),
        try_parse_date(pl.col("dtlastpmt_581D"), "%m/%d/%Y", "%Y-%m-%d").alias("dtlastpmt_581D"),
        try_parse_date(pl.col("employedfrom_700D"), "%m/%d/%Y", "%Y-%m-%d").alias("employedfrom_700D"),
        try_parse_date(pl.col("dtlastpmtallstes_3545839D"), "%m/%d/%Y", "%Y-%m-%d").alias("dtlastpmtallstes_3545839D"),
        try_parse_date(pl.col("firstnonzeroinstldate_307D"), "%m/%d/%Y", "%Y-%m-%d").alias("firstnonzeroinstldate_307D"),
    ])
    .group_by("case_id")
    .agg([
        pl.col("actualdpd_943P").mean().alias("actualdpd_943P_mean"),
        pl.col("annuity_853A").sum().alias("annuity_853A_sum"),
        pl.col("childnum_21L").sum().alias("childnum_21L_sum"),
        pl.col("credacc_actualbalance_314A").mean().alias("credacc_actualbalance_314A_mean"),
        pl.col("credacc_credlmt_575A").mean().alias("credacc_credlmt_575A_mean"),
        pl.col("credacc_maxhisbal_375A").max().alias("credacc_maxhisbal_375A_max"),
        pl.col("credacc_minhisbal_90A").min().alias("credacc_minhisbal_90A_min"),
        pl.col("credacc_transactions_402L").sum().alias("credacc_transactions_402L_sum"),
        pl.col("credamount_590A").mean().alias("credamount_590A_mean"),
        pl.col("currdebt_94A").mean().alias("currdebt_94A_mean"),
        pl.col("downpmt_134A").sum().alias("downpmt_134A_sum"),
        pl.col("mainoccupationinc_437A").mean().alias("mainoccupationinc_437A_mean"),
        pl.col("outstandingdebt_522A").sum().alias("outstandingdebt_522A_sum"),
        pl.col("pmtnum_8L").max().alias("pmtnum_8L_max"),
        pl.col("tenor_203L").min().alias("tenor_203L_min"),
        pl.col("isbidproduct_390L").cast(pl.UInt32).sum().alias("isbidproduct_390L_sum"),
        pl.col("isdebitcard_527L").cast(pl.UInt32).sum().alias("isdebitcard_527L_sum"),
        pl.col("credacc_status_367L").n_unique().alias("credacc_status_367L_n_unique"),
        pl.col("credtype_587L").n_unique().alias("credtype_587L_n_unique"),
        pl.col("education_1138M").n_unique().alias("education_1138M_n_unique"),
        pl.col("familystate_726L").n_unique().alias("familystate_726L_n_unique"),
        pl.col("postype_4733339M").n_unique().alias("postype_4733339M_n_unique"),
        pl.col("profession_152M").n_unique().alias("profession_152M_n_unique"),
        pl.col("rejectreason_755M").n_unique().alias("rejectreason_755M_n_unique"),
        pl.col("rejectreasonclient_4145042M").n_unique().alias("rejectreasonclient_4145042M_n_unique"),
        pl.col("status_219L").n_unique().alias("status_219L_n_unique"),
        (pl.col("approvaldate_319D").diff().abs().min()).alias("approval_to_activation_min_diff"),
        (pl.col("creationdate_885D").diff().abs().min()).alias("creation_min_diff"),
        (pl.col("dtlastpmt_581D").diff().abs().max()).alias("payment_max_diff"),
        pl.col("employedfrom_700D").min().alias("earliest_employment_date"),
        pl.col("byoccupationinc_3656910L").n_unique().alias("byoccupationinc_3656910L_n_unique"),
        pl.col("cancelreason_3545846M").n_unique().alias("cancelreason_3545846M_n_unique"),
        pl.col("district_544M").n_unique().alias("district_544M_n_unique"),
        pl.col("dtlastpmtallstes_3545839D").min().alias("earliest_last_payment_date"),
        pl.col("firstnonzeroinstldate_307D").min().alias("earliest_first_nonzero_installment_date"),
        pl.col("inittransactioncode_279L").n_unique().alias("inittransactioncode_279L_n_unique"),
        pl.col("maxdpdtolerance_577P").max().alias("maximum_dpd_tolerance"),
        pl.col("revolvingaccount_394A").sum().alias("sum_revolving_accounts"),
    ])
    .collect()
)

In [7]:
df_joined.head()

case_id,actualdpd_943P_mean,annuity_853A_sum,childnum_21L_sum,credacc_actualbalance_314A_mean,credacc_credlmt_575A_mean,credacc_maxhisbal_375A_max,credacc_minhisbal_90A_min,credacc_transactions_402L_sum,credamount_590A_mean,currdebt_94A_mean,downpmt_134A_sum,mainoccupationinc_437A_mean,outstandingdebt_522A_sum,pmtnum_8L_max,tenor_203L_min,isbidproduct_390L_sum,isdebitcard_527L_sum,credacc_status_367L_n_unique,credtype_587L_n_unique,education_1138M_n_unique,familystate_726L_n_unique,postype_4733339M_n_unique,profession_152M_n_unique,rejectreason_755M_n_unique,rejectreasonclient_4145042M_n_unique,status_219L_n_unique,approval_to_activation_min_diff,creation_min_diff,payment_max_diff,earliest_employment_date,byoccupationinc_3656910L_n_unique,cancelreason_3545846M_n_unique,district_544M_n_unique,earliest_last_payment_date,earliest_first_nonzero_installment_date,inittransactioncode_279L_n_unique,maximum_dpd_tolerance,sum_revolving_accounts
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,duration[ms],duration[ms],duration[ms],date,u32,u32,u32,date,date,u32,f64,f64
182207,0.0,29913.6002,0.0,,0.0,,,0.0,26387.65,0.0,0.0,45250.0,0.0,16.0,3.0,0,0,1,2,2,2,1,1,2,2,3,15d,1d,114d,2018-01-15,1,3,2,2019-05-25,2018-05-01,2,0.0,0.0
877588,0.0,3330.4001,0.0,,55208.8,,,0.0,68527.533333,,0.0,86666.666667,0.0,16.0,16.0,0,0,1,2,2,2,1,1,3,2,2,,0ms,,2005-01-15,1,2,1,,2019-02-01,1,,0.0
1547933,0.0,2222.0,0.0,,0.0,,,0.0,10896.4,0.0,695.60004,39000.0,0.0,6.0,6.0,0,0,1,1,1,1,1,1,1,1,1,,,,2014-02-15,1,1,1,2019-10-03,2019-05-15,1,0.0,0.0
161640,0.0,5347.0,5.0,,0.0,,,0.0,8193.333333,0.0,0.0,30333.333333,0.0,6.0,4.0,0,0,1,2,2,2,1,1,2,2,2,,137d,,2007-01-15,1,2,1,2017-12-06,2017-09-08,2,1.0,0.0
176318,0.0,21445.6003,0.0,,0.0,,,0.0,96475.0,57664.906,500.0,44050.0,80393.2,36.0,24.0,0,0,1,2,2,4,1,1,1,1,3,,152d,,2006-04-15,2,2,2,2019-11-06,2008-09-07,2,0.0,0.0


In [8]:
df_joined.shape

(1221522, 39)

In [9]:
df_joined.describe()

statistic,case_id,actualdpd_943P_mean,annuity_853A_sum,childnum_21L_sum,credacc_actualbalance_314A_mean,credacc_credlmt_575A_mean,credacc_maxhisbal_375A_max,credacc_minhisbal_90A_min,credacc_transactions_402L_sum,credamount_590A_mean,currdebt_94A_mean,downpmt_134A_sum,mainoccupationinc_437A_mean,outstandingdebt_522A_sum,pmtnum_8L_max,tenor_203L_min,isbidproduct_390L_sum,isdebitcard_527L_sum,credacc_status_367L_n_unique,credtype_587L_n_unique,education_1138M_n_unique,familystate_726L_n_unique,postype_4733339M_n_unique,profession_152M_n_unique,rejectreason_755M_n_unique,rejectreasonclient_4145042M_n_unique,status_219L_n_unique,approval_to_activation_min_diff,creation_min_diff,payment_max_diff,earliest_employment_date,byoccupationinc_3656910L_n_unique,cancelreason_3545846M_n_unique,district_544M_n_unique,earliest_last_payment_date,earliest_first_nonzero_installment_date,inittransactioncode_279L_n_unique,maximum_dpd_tolerance,sum_revolving_accounts
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,str,f64,f64,f64,str,str,f64,f64,f64
"""count""",1221522.0,1221505.0,1221522.0,1221522.0,253573.0,1219218.0,253573.0,253573.0,1221522.0,1219218.0,1107653.0,1221522.0,1220298.0,1221522.0,1205213.0,1205213.0,1221522.0,1221522.0,1221522.0,1221522.0,1221522.0,1221522.0,1221522.0,1221522.0,1221522.0,1221522.0,1221522.0,"""692445""","""991940""","""378717""","""967490""",1221522.0,1221522.0,1221522.0,"""977672""","""1191786""",1221522.0,1075690.0,1221522.0
"""null_count""",0.0,17.0,0.0,0.0,967949.0,2304.0,967949.0,967949.0,0.0,2304.0,113869.0,0.0,1224.0,0.0,16309.0,16309.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""529077""","""229582""","""842805""","""254032""",0.0,0.0,0.0,"""243850""","""29736""",0.0,145832.0,0.0
"""mean""",1426500.0,0.030591,17720.976509,2.046892,18546.031519,4041.530706,-2243.567787,-7196.173839,0.143494,38093.537901,6194.017659,2226.114053,41373.602041,24577.43702,23.048162,8.949868,0.29769,0.085311,1.208817,1.815662,1.823952,1.827529,1.449317,1.055198,1.741444,1.469942,2.143259,"""343 days, 10:1…","""213 days, 8:05…","""401 days, 22:5…","""2008-12-02""",1.767326,1.768969,1.387249,"""2018-04-07""","""2014-07-25""",1.709584,42.158345,182330000.0
"""std""",727480.516437,7.963274,18651.611357,3.612884,25937.474619,11922.735759,30575.874422,18648.737517,1.826906,26292.487333,13294.102219,7716.459315,23704.035719,55767.475314,12.825517,6.454601,0.715703,0.332567,0.443788,0.818038,0.625126,0.728848,0.881747,0.248796,0.942567,0.639868,0.974922,,,,,1.159203,0.934977,0.609845,,,0.780494,251.051895,391650000.0
"""min""",2.0,0.0,0.0,0.0,-134008.42,0.0,-199950.0,-350532.6,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"""0:00:00""","""0:00:00""","""0:00:00""","""1961-09-15""",1.0,1.0,1.0,"""2008-07-04""","""2006-01-26""",1.0,0.0,0.0
"""25%""",948908.0,0.0,4924.0,0.0,2.0,0.0,0.0,-2116.2,0.0,20048.0,0.0,0.0,26086.568421,0.0,12.0,5.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"""74 days, 0:00:…","""0:00:00""","""200 days, 0:00…","""2005-08-15""",1.0,1.0,1.0,"""2017-03-13""","""2012-01-29""",1.0,0.0,0.0
"""50%""",1509927.0,0.0,11518.8001,0.0,9320.797,0.0,0.0,0.0,0.0,32177.75,0.0,0.0,37100.0,0.0,24.0,6.0,0.0,0.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,"""168 days, 0:00…","""19 days, 0:00:…","""364 days, 0:00…","""2010-09-15""",1.0,2.0,1.0,"""2018-05-30""","""2015-02-15""",2.0,0.0,0.0
"""75%""",1815307.0,0.0,24088.4001,3.0,28138.201,0.0,4.0,0.0,0.0,49144.633333,7033.728,1000.0,50666.666667,22636.0,30.0,12.0,0.0,0.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,3.0,"""342 days, 0:00…","""182 days, 0:00…","""554 days, 0:00…","""2014-05-15""",2.0,2.0,2.0,"""2019-05-02""","""2018-01-03""",2.0,5.0,0.0
"""max""",2703454.0,4206.0,484046.01,98.0,2540730.0,400000.0,7988198.5,199567.0,310.0,400000.0,339996.38,1435102.0,199600.0,1275900.0,63.0,60.0,14.0,15.0,5.0,4.0,5.0,6.0,7.0,8.0,8.0,7.0,6.0,"""5197 days, 0:0…","""5356 days, 0:0…","""3762 days, 0:0…","""2020-07-15""",14.0,10.0,8.0,"""2020-10-19""","""2020-11-19""",4.0,4362.0,9965100000.0


In [10]:
train_other_1 = pl.read_csv("Data/csv_files/train/train_other_1.csv")

In [11]:
unique_case_id_count = train_other_1.select(pl.col("case_id").unique().count())
print(unique_case_id_count)

shape: (1, 1)
┌─────────┐
│ case_id │
│ ---     │
│ u32     │
╞═════════╡
│ 51109   │
└─────────┘


In [12]:
train_other_1.shape

(51109, 7)

In [13]:
df_lazy_2 = pl.scan_csv("Data/csv_files/train/train_other_1.csv")

In [14]:
df_lazy_2 = (
    df_lazy_2.group_by("case_id")
    .agg([
        pl.col("amtdebitincoming_4809443A").sum().alias("sum_amtdebitincoming"),
        pl.col("amtdebitoutgoing_4809440A").sum().alias("sum_amtdebitoutgoing"),
        pl.col("amtdepositbalance_4809441A").mean().alias("avg_amtdepositbalance"),
        pl.col("amtdepositincoming_4809444A").sum().alias("sum_amtdepositincoming"),
        pl.col("amtdepositoutgoing_4809442A").sum().alias("sum_amtdepositoutgoing"),
    ])
)

df_lazy_2 = df_lazy_2.collect()

In [15]:
df_lazy_2.head()

case_id,sum_amtdebitincoming,sum_amtdebitoutgoing,avg_amtdepositbalance,sum_amtdepositincoming,sum_amtdepositoutgoing
i64,f64,f64,f64,f64,f64
196489,2684.8,2650.0,0.0,0.0,0.0
215237,38666.6,38666.6,0.0,0.0,0.0
1814561,30000.0,30000.0,0.0,0.0,0.0
1844837,9634.601,9634.601,0.0,0.0,0.0
210390,10000.0,10000.0,0.0,0.0,0.0


In [16]:
df_joined = df_joined.lazy()
df_lazy_2 = df_lazy_2.lazy()

df_joined = df_joined.join(
    df_lazy_2,
    on="case_id",
    how="left"
)

df_joined = df_joined.collect()

In [17]:
df_joined.shape

(1221522, 44)

In [18]:
train_deposit_1 = pl.read_csv("Data/csv_files/train/train_deposit_1.csv")

In [19]:
train_deposit_1.shape

(145086, 5)

In [20]:
unique_case_id_count = train_deposit_1.select(pl.col("case_id").unique().count())
print(unique_case_id_count)

shape: (1, 1)
┌─────────┐
│ case_id │
│ ---     │
│ u32     │
╞═════════╡
│ 105111  │
└─────────┘


In [21]:
df_lazy_3 = pl.scan_csv("Data/csv_files/train/train_deposit_1.csv")

In [22]:
transformations = (
    df_lazy_3
    .group_by("case_id")
    .agg([
        pl.col("amount_416A").mean().alias("average_amount"),
        pl.count("openingdate_313D").alias("open_contracts_count"),
        pl.count("contractenddate_991D").alias("closed_contracts_count"),
    ])
)

df_lazy_3 = transformations.collect()

In [23]:
df_lazy_3.head()

case_id,average_amount,open_contracts_count,closed_contracts_count
i64,f64,u32,u32
122959,1189.837005,2,1
1715971,0.0,2,1
197296,1338.572,1,0
1263093,0.0,1,0
1392349,10231.9715,2,1


In [24]:
df_joined = df_joined.lazy()
df_lazy_3 = df_lazy_3.lazy()

df_joined = df_joined.join(
    df_lazy_3,
    on="case_id",
    how="left"
)

df_joined = df_joined.collect()

In [25]:
df_joined.head()

case_id,actualdpd_943P_mean,annuity_853A_sum,childnum_21L_sum,credacc_actualbalance_314A_mean,credacc_credlmt_575A_mean,credacc_maxhisbal_375A_max,credacc_minhisbal_90A_min,credacc_transactions_402L_sum,credamount_590A_mean,currdebt_94A_mean,downpmt_134A_sum,mainoccupationinc_437A_mean,outstandingdebt_522A_sum,pmtnum_8L_max,tenor_203L_min,isbidproduct_390L_sum,isdebitcard_527L_sum,credacc_status_367L_n_unique,credtype_587L_n_unique,education_1138M_n_unique,familystate_726L_n_unique,postype_4733339M_n_unique,profession_152M_n_unique,rejectreason_755M_n_unique,rejectreasonclient_4145042M_n_unique,status_219L_n_unique,approval_to_activation_min_diff,creation_min_diff,payment_max_diff,earliest_employment_date,byoccupationinc_3656910L_n_unique,cancelreason_3545846M_n_unique,district_544M_n_unique,earliest_last_payment_date,earliest_first_nonzero_installment_date,inittransactioncode_279L_n_unique,maximum_dpd_tolerance,sum_revolving_accounts,sum_amtdebitincoming,sum_amtdebitoutgoing,avg_amtdepositbalance,sum_amtdepositincoming,sum_amtdepositoutgoing,average_amount,open_contracts_count,closed_contracts_count
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,duration[ms],duration[ms],duration[ms],date,u32,u32,u32,date,date,u32,f64,f64,f64,f64,f64,f64,f64,f64,u32,u32
182207,0.0,29913.6002,0.0,,0.0,,,0.0,26387.65,0.0,0.0,45250.0,0.0,16.0,3.0,0,0,1,2,2,2,1,1,2,2,3,15d,1d,114d,2018-01-15,1,3,2,2019-05-25,2018-05-01,2,0.0,0.0,,,,,,,,
877588,0.0,3330.4001,0.0,,55208.8,,,0.0,68527.533333,,0.0,86666.666667,0.0,16.0,16.0,0,0,1,2,2,2,1,1,3,2,2,,0ms,,2005-01-15,1,2,1,,2019-02-01,1,,0.0,,,,,,,,
1547933,0.0,2222.0,0.0,,0.0,,,0.0,10896.4,0.0,695.60004,39000.0,0.0,6.0,6.0,0,0,1,1,1,1,1,1,1,1,1,,,,2014-02-15,1,1,1,2019-10-03,2019-05-15,1,0.0,0.0,,,,,,,,
161640,0.0,5347.0,5.0,,0.0,,,0.0,8193.333333,0.0,0.0,30333.333333,0.0,6.0,4.0,0,0,1,2,2,2,1,1,2,2,2,,137d,,2007-01-15,1,2,1,2017-12-06,2017-09-08,2,1.0,0.0,,,,,,,,
176318,0.0,21445.6003,0.0,,0.0,,,0.0,96475.0,57664.906,500.0,44050.0,80393.2,36.0,24.0,0,0,1,2,2,4,1,1,1,1,3,,152d,,2006-04-15,2,2,2,2019-11-06,2008-09-07,2,0.0,0.0,,,,,,,,


In [26]:
train_person_1 = pl.read_csv("Data/csv_files/train/train_person_1.csv")

In [27]:
train_person_1.shape

(2973991, 37)

In [28]:
unique_case_id_count = train_person_1.select(pl.col("case_id").unique().count())
print(unique_case_id_count)

shape: (1, 1)
┌─────────┐
│ case_id │
│ ---     │
│ u32     │
╞═════════╡
│ 1526659 │
└─────────┘


In [29]:
df_lazy_4 = pl.read_csv("Data/csv_files/train/train_person_1.csv")

In [30]:
from datetime import date

transformations_2 = [
    pl.col("birth_259D").n_unique().alias("unique_birth_dates"),
    pl.col("birthdate_87D").n_unique().alias("unique_birth_dates_87D"),
    pl.col("childnum_185L").max().alias("max_children"),
    pl.col("education_927M").n_unique().alias("unique_educations"),
    pl.col("empl_employedtotal_800L").mean().alias("avg_employment_length"),
    pl.col("mainoccupationinc_384A").sum().alias("total_main_income"),
    pl.col("gender_992L").n_unique().alias("unique_genders"),
    pl.col("housetype_905L").n_unique().alias("unique_house_types"),
    pl.col("housingtype_772L").n_unique().alias("unique_housing_types"),
    pl.col("incometype_1044T").n_unique().alias("unique_income_types"),
    pl.col("maritalst_703L").n_unique().alias("unique_marital_statuses"),
    pl.col("persontype_1072L").n_unique().alias("unique_person_types_1072L"),
    pl.col("persontype_792L").n_unique().alias("unique_person_types_792L"),
    pl.col("relationshiptoclient_415T").n_unique().alias("unique_relationships_415T"),
    pl.col("relationshiptoclient_642T").n_unique().alias("unique_relationships_642T"),
    pl.col("remitter_829L").sum().alias("sum_remitters"),
    pl.col("role_1084L").n_unique().alias("unique_roles_1084L"),
    pl.col("role_993L").n_unique().alias("unique_roles_993L"),
    pl.col("safeguarantyflag_411L").sum().alias("sum_safeguaranty_flags"),
    pl.col("sex_738L").n_unique().alias("unique_sexes"),
    pl.col("type_25L").n_unique().alias("unique_contact_types"),
    pl.col("contaddr_district_15M").n_unique().alias("unique_contact_address_districts"),
    pl.col("empladdr_district_926M").n_unique().alias("unique_employer_address_districts"),
    pl.col("registaddr_district_1083M").n_unique().alias("unique_registered_address_districts"),
    pl.col("isreference_387L").sum().alias("sum_is_reference_flags"),
    pl.col("empl_industry_691L").n_unique().alias("unique_industries"),
    pl.col("empladdr_zipcode_114M").n_unique().alias("unique_employer_zipcodes"),
    pl.col("contaddr_zipcode_807M").n_unique().alias("unique_contact_zipcodes"),
    pl.col("registaddr_zipcode_184M").n_unique().alias("unique_registered_zipcodes"),
    pl.col("language1_981M").n_unique().alias("unique_languages"),
    pl.col("familystate_447L").n_unique().alias("unique_family_states"),
    pl.col("contaddr_matchlist_1032L").sum().alias("sum_contact_address_matchlist"),
    pl.col("contaddr_smempladdr_334L").sum().alias("sum_contact_same_employer_address"),
    pl.col("personindex_1023L").n_unique().alias("unique_person_indices")
]

df_lazy_4 = df_lazy_4.group_by("case_id").agg(transformations_2)

columns_to_drop = [
    "birth_259D", "birthdate_87D", "empl_employedfrom_271D", "childnum_185L", "education_927M",
    "empl_employedtotal_800L", "mainoccupationinc_384A", "gender_992L", "housetype_905L", "housingtype_772L",
    "incometype_1044T", "maritalst_703L", "persontype_1072L", "persontype_792L", "relationshiptoclient_415T",
    "relationshiptoclient_642T", "remitter_829L", "role_1084L", "role_993L", "safeguarantyflag_411L", "sex_738L",
    "type_25L", "contaddr_district_15M", "empladdr_district_926M", "registaddr_district_1083M", "isreference_387L",
    "empl_industry_691L", "empladdr_zipcode_114M", "contaddr_zipcode_807M", "registaddr_zipcode_184M",
    "language1_981M", "familystate_447L", "contaddr_matchlist_1032L", "contaddr_smempladdr_334L", "personindex_1023L",
    "empl_employedfrom_271D"
]

df_lazy_4 = df_lazy_4.drop(columns_to_drop)

In [31]:
df_lazy_4.head()

case_id,unique_birth_dates,unique_birth_dates_87D,max_children,unique_educations,avg_employment_length,total_main_income,unique_genders,unique_house_types,unique_housing_types,unique_income_types,unique_marital_statuses,unique_person_types_1072L,unique_person_types_792L,unique_relationships_415T,unique_relationships_642T,sum_remitters,unique_roles_1084L,unique_roles_993L,sum_safeguaranty_flags,unique_sexes,unique_contact_types,unique_contact_address_districts,unique_employer_address_districts,unique_registered_address_districts,sum_is_reference_flags,unique_industries,unique_employer_zipcodes,unique_contact_zipcodes,unique_registered_zipcodes,unique_languages,unique_family_states,sum_contact_address_matchlist,sum_contact_same_employer_address,unique_person_indices
i64,u32,u32,f64,u32,str,f64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
1815734,1,1,,1,,34000.0,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,0,1
1700897,1,1,,1,,40000.0,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,0,1
727777,2,1,,2,,24000.0,1,1,1,2,1,2,2,2,2,0,2,1,1,2,2,2,1,2,0,1,1,2,2,2,2,0,0,2
670360,2,1,,2,,46000.0,1,1,1,2,1,2,3,2,2,0,3,1,1,2,2,2,2,2,0,2,2,2,2,2,2,0,0,3
1311733,1,1,,1,,50000.0,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,0,1


In [32]:
df_joined = df_joined.lazy()
df_lazy_4 = df_lazy_4.lazy()

df_joined = df_joined.join(
    df_lazy_4,
    on="case_id",
    how="left"
)

df_joined = df_joined.collect()

In [33]:
df_joined.head()

case_id,actualdpd_943P_mean,annuity_853A_sum,childnum_21L_sum,credacc_actualbalance_314A_mean,credacc_credlmt_575A_mean,credacc_maxhisbal_375A_max,credacc_minhisbal_90A_min,credacc_transactions_402L_sum,credamount_590A_mean,currdebt_94A_mean,downpmt_134A_sum,mainoccupationinc_437A_mean,outstandingdebt_522A_sum,pmtnum_8L_max,tenor_203L_min,isbidproduct_390L_sum,isdebitcard_527L_sum,credacc_status_367L_n_unique,credtype_587L_n_unique,education_1138M_n_unique,familystate_726L_n_unique,postype_4733339M_n_unique,profession_152M_n_unique,rejectreason_755M_n_unique,rejectreasonclient_4145042M_n_unique,status_219L_n_unique,approval_to_activation_min_diff,creation_min_diff,payment_max_diff,earliest_employment_date,byoccupationinc_3656910L_n_unique,cancelreason_3545846M_n_unique,district_544M_n_unique,earliest_last_payment_date,earliest_first_nonzero_installment_date,inittransactioncode_279L_n_unique,…,average_amount,open_contracts_count,closed_contracts_count,unique_birth_dates,unique_birth_dates_87D,max_children,unique_educations,avg_employment_length,total_main_income,unique_genders,unique_house_types,unique_housing_types,unique_income_types,unique_marital_statuses,unique_person_types_1072L,unique_person_types_792L,unique_relationships_415T,unique_relationships_642T,sum_remitters,unique_roles_1084L,unique_roles_993L,sum_safeguaranty_flags,unique_sexes,unique_contact_types,unique_contact_address_districts,unique_employer_address_districts,unique_registered_address_districts,sum_is_reference_flags,unique_industries,unique_employer_zipcodes,unique_contact_zipcodes,unique_registered_zipcodes,unique_languages,unique_family_states,sum_contact_address_matchlist,sum_contact_same_employer_address,unique_person_indices
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,duration[ms],duration[ms],duration[ms],date,u32,u32,u32,date,date,u32,…,f64,u32,u32,u32,u32,f64,u32,str,f64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
182207,0.0,29913.6002,0.0,,0.0,,,0.0,26387.65,0.0,0.0,45250.0,0.0,16.0,3.0,0,0,1,2,2,2,1,1,2,2,3,15d,1d,114d,2018-01-15,1,3,2,2019-05-25,2018-05-01,2,…,,,,2,1,,2,,20000.0,1,1,1,2,1,3,4,3,3,0,3,1,1,2,2,2,2,2,0,2,2,2,2,2,2,0,0,4
877588,0.0,3330.4001,0.0,,55208.8,,,0.0,68527.533333,,0.0,86666.666667,0.0,16.0,16.0,0,0,1,2,2,2,1,1,3,2,2,,0ms,,2005-01-15,1,2,1,,2019-02-01,1,…,,,,2,1,,2,,60000.0,1,1,1,2,1,2,3,2,2,0,3,1,1,2,2,2,2,2,0,2,2,2,2,2,2,0,0,3
1547933,0.0,2222.0,0.0,,0.0,,,0.0,10896.4,0.0,695.60004,39000.0,0.0,6.0,6.0,0,0,1,1,1,1,1,1,1,1,1,,,,2014-02-15,1,1,1,2019-10-03,2019-05-15,1,…,,,,1,1,,1,,46000.0,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,0,1
161640,0.0,5347.0,5.0,,0.0,,,0.0,8193.333333,0.0,0.0,30333.333333,0.0,6.0,4.0,0,0,1,2,2,2,1,1,2,2,2,,137d,,2007-01-15,1,2,1,2017-12-06,2017-09-08,2,…,,,,2,1,,2,,20000.0,1,1,1,2,1,2,3,3,3,0,3,1,1,2,2,2,2,2,0,2,2,2,2,2,2,0,0,4
176318,0.0,21445.6003,0.0,,0.0,,,0.0,96475.0,57664.906,500.0,44050.0,80393.2,36.0,24.0,0,0,1,2,2,4,1,1,1,1,3,,152d,,2006-04-15,2,2,2,2019-11-06,2008-09-07,2,…,,,,2,1,,2,,80200.0,1,1,1,2,1,2,3,3,3,0,3,1,1,2,4,2,2,2,0,2,2,2,2,2,2,0,0,4


In [34]:
import polars as pl

df_lazy_5 = pl.read_csv("Data/csv_files/train/train_debitcard_1.csv")

In [35]:
df_lazy_5.head()

case_id,last180dayaveragebalance_704A,last180dayturnover_1134A,last30dayturnover_651A,num_group1,openingdate_857D
i64,str,str,str,i64,str
225,,,,0,"""2016-08-16"""
331,,,,0,"""2015-03-19"""
358,,,,0,"""2014-09-02"""
390,,,,0,"""2014-07-23"""
390,,,,2,"""2016-06-08"""


In [36]:
df_lazy_5 = df_lazy_5.with_columns([
    pl.col("last180dayaveragebalance_704A").cast(pl.Float64).fill_null(0).alias("last180dayaveragebalance_704A"),
    pl.col("last180dayturnover_1134A").cast(pl.Float64).fill_null(0).alias("last180dayturnover_1134A"),
    pl.col("last30dayturnover_651A").cast(pl.Float64).fill_null(0).alias("last30dayturnover_651A")
])

df_lazy_5 = df_lazy_5.with_columns(
    pl.col("openingdate_857D").str.strptime(pl.Date, "%Y-%m-%d").alias("parsed_openingdate")
).drop("openingdate_857D")
       
df_lazy_5 = df_lazy_5.group_by("case_id").agg([
    pl.sum("last180dayaveragebalance_704A").alias("total_180dayaveragebalance"),
    pl.sum("last180dayturnover_1134A").alias("total_180dayturnover"),
    pl.sum("last30dayturnover_651A").alias("total_30dayturnover"),
    pl.min("parsed_openingdate").alias("earliest_openingdate")
])

In [37]:
df_joined = df_joined.lazy()
df_lazy_5 = df_lazy_5.lazy()

df_joined = df_joined.join(
    df_lazy_5,
    on="case_id",
    how="left"
)

df_joined = df_joined.collect()

In [38]:
df_joined.head()

case_id,actualdpd_943P_mean,annuity_853A_sum,childnum_21L_sum,credacc_actualbalance_314A_mean,credacc_credlmt_575A_mean,credacc_maxhisbal_375A_max,credacc_minhisbal_90A_min,credacc_transactions_402L_sum,credamount_590A_mean,currdebt_94A_mean,downpmt_134A_sum,mainoccupationinc_437A_mean,outstandingdebt_522A_sum,pmtnum_8L_max,tenor_203L_min,isbidproduct_390L_sum,isdebitcard_527L_sum,credacc_status_367L_n_unique,credtype_587L_n_unique,education_1138M_n_unique,familystate_726L_n_unique,postype_4733339M_n_unique,profession_152M_n_unique,rejectreason_755M_n_unique,rejectreasonclient_4145042M_n_unique,status_219L_n_unique,approval_to_activation_min_diff,creation_min_diff,payment_max_diff,earliest_employment_date,byoccupationinc_3656910L_n_unique,cancelreason_3545846M_n_unique,district_544M_n_unique,earliest_last_payment_date,earliest_first_nonzero_installment_date,inittransactioncode_279L_n_unique,…,unique_birth_dates_87D,max_children,unique_educations,avg_employment_length,total_main_income,unique_genders,unique_house_types,unique_housing_types,unique_income_types,unique_marital_statuses,unique_person_types_1072L,unique_person_types_792L,unique_relationships_415T,unique_relationships_642T,sum_remitters,unique_roles_1084L,unique_roles_993L,sum_safeguaranty_flags,unique_sexes,unique_contact_types,unique_contact_address_districts,unique_employer_address_districts,unique_registered_address_districts,sum_is_reference_flags,unique_industries,unique_employer_zipcodes,unique_contact_zipcodes,unique_registered_zipcodes,unique_languages,unique_family_states,sum_contact_address_matchlist,sum_contact_same_employer_address,unique_person_indices,total_180dayaveragebalance,total_180dayturnover,total_30dayturnover,earliest_openingdate
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,duration[ms],duration[ms],duration[ms],date,u32,u32,u32,date,date,u32,…,u32,f64,u32,str,f64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,f64,f64,f64,date
182207,0.0,29913.6002,0.0,,0.0,,,0.0,26387.65,0.0,0.0,45250.0,0.0,16.0,3.0,0,0,1,2,2,2,1,1,2,2,3,15d,1d,114d,2018-01-15,1,3,2,2019-05-25,2018-05-01,2,…,1,,2,,20000.0,1,1,1,2,1,3,4,3,3,0,3,1,1,2,2,2,2,2,0,2,2,2,2,2,2,0,0,4,,,,
877588,0.0,3330.4001,0.0,,55208.8,,,0.0,68527.533333,,0.0,86666.666667,0.0,16.0,16.0,0,0,1,2,2,2,1,1,3,2,2,,0ms,,2005-01-15,1,2,1,,2019-02-01,1,…,1,,2,,60000.0,1,1,1,2,1,2,3,2,2,0,3,1,1,2,2,2,2,2,0,2,2,2,2,2,2,0,0,3,,,,
1547933,0.0,2222.0,0.0,,0.0,,,0.0,10896.4,0.0,695.60004,39000.0,0.0,6.0,6.0,0,0,1,1,1,1,1,1,1,1,1,,,,2014-02-15,1,1,1,2019-10-03,2019-05-15,1,…,1,,1,,46000.0,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,0,1,,,,
161640,0.0,5347.0,5.0,,0.0,,,0.0,8193.333333,0.0,0.0,30333.333333,0.0,6.0,4.0,0,0,1,2,2,2,1,1,2,2,2,,137d,,2007-01-15,1,2,1,2017-12-06,2017-09-08,2,…,1,,2,,20000.0,1,1,1,2,1,2,3,3,3,0,3,1,1,2,2,2,2,2,0,2,2,2,2,2,2,0,0,4,,,,
176318,0.0,21445.6003,0.0,,0.0,,,0.0,96475.0,57664.906,500.0,44050.0,80393.2,36.0,24.0,0,0,1,2,2,4,1,1,1,1,3,,152d,,2006-04-15,2,2,2,2019-11-06,2008-09-07,2,…,1,,2,,80200.0,1,1,1,2,1,2,3,3,3,0,3,1,1,2,4,2,2,2,0,2,2,2,2,2,2,0,0,4,,,,


In [39]:
duration_columns = [
    "approval_to_activation_min_diff",
    "creation_min_diff",
    "payment_max_diff"
]

for column in duration_columns:
    df_joined = df_joined.with_columns(
        pl.col(column).str.replace("d", "").cast(pl.Int64) * 1440
    )

In [40]:
train_DEPTH_1 = df_joined

directory = r"C:\Users\afise\.git\CreditRiskModel\Merged_Data"
filename = "train_DEPTH_1.csv"
file_path = os.path.join(directory, filename)

if not os.path.exists(directory):
    os.makedirs(directory)

train_DEPTH_1.write_csv(file_path)