In [1]:
from datetime import date
import polars as pl
import os

# Eagerly load both train_applprev_1 shards; they are concatenated afterwards.
_APPLPREV_SHARD_PATHS = (
    "Data/csv_files/train/train_applprev_1_0.csv",
    "Data/csv_files/train/train_applprev_1_1.csv",
)
train_applprev_1_0, train_applprev_1_1 = map(pl.read_csv, _APPLPREV_SHARD_PATHS)

In [2]:
# Stack the two shards vertically, shard 0 first and shard 1 second.
train_applprev_1 = pl.concat(
    [
        train_applprev_1_0,
        train_applprev_1_1,
    ]
)

# Target location for a merged copy of this table.
# NOTE: the path is only computed here; nothing in this cell writes to it.
filename = "train_applprev_1.csv"
directory = r"C:\Users\afise\.git\CreditRiskModel\Merged_Data"
file_path = os.path.join(directory, filename)

In [3]:
# Switch to a LazyFrame so the transformations that follow are only executed
# when .collect() is called.
train_applprev_1 = train_applprev_1.lazy()

In [4]:
def try_parse_date(col, fmt1, fmt2):
    """Parse a string expression as a date, trying two formats in order.

    Args:
        col: Polars string expression to parse.
        fmt1: strptime format that is tried first.
        fmt2: strptime format used for rows where ``fmt1`` produced null.

    Returns:
        A ``pl.Date`` expression. Both parses are non-strict, so a value that
        matches neither format becomes null instead of raising.
    """
    candidates = [
        col.str.strptime(pl.Date, fmt, strict=False)
        for fmt in (fmt1, fmt2)
    ]
    # coalesce keeps the first non-null candidate per row, so fmt1 wins.
    return pl.coalesce(candidates)

# --- Step 1: parse the string date columns in place ------------------------
# Each column is tried as "%m/%d/%Y" first and "%Y-%m-%d" second.
_applprev_date_columns = [
    "approvaldate_319D",
    "dateactivated_425D",
    "creationdate_885D",
    "dtlastpmt_581D",
    "employedfrom_700D",
    "dtlastpmtallstes_3545839D",
    "firstnonzeroinstldate_307D",
]
_applprev_parsed_dates = [
    try_parse_date(pl.col(name), "%m/%d/%Y", "%Y-%m-%d").alias(name)
    for name in _applprev_date_columns
]


# --- Step 2: per-case_id aggregations --------------------------------------
def _sum_as_uint32(expr):
    """Cast the expression to UInt32 and sum it."""
    return expr.cast(pl.UInt32).sum()


def _smallest_gap(expr):
    """Smallest absolute difference between consecutive rows of a group.

    Rows are taken in their existing order; nothing is sorted beforehand.
    """
    return expr.diff().abs().min()


def _largest_gap(expr):
    """Largest absolute difference between consecutive rows of a group.

    Rows are taken in their existing order; nothing is sorted beforehand.
    """
    return expr.diff().abs().max()


def _to_agg_expr(column, reducer, alias):
    """Build ``pl.col(column).<reducer>().alias(alias)``.

    ``reducer`` is either the name of an ``Expr`` method or a callable that
    receives the column expression and returns the reduced expression.
    """
    base = pl.col(column)
    reduced = reducer(base) if callable(reducer) else getattr(base, reducer)()
    return reduced.alias(alias)


# (source column, reducer, output name) -- list order is the output column order.
_applprev_agg_specs = [
    ("actualdpd_943P", "mean", "actualdpd_943P_mean"),
    ("annuity_853A", "sum", "annuity_853A_sum"),
    ("childnum_21L", "sum", "childnum_21L_sum"),
    ("credacc_actualbalance_314A", "mean", "credacc_actualbalance_314A_mean"),
    ("credacc_credlmt_575A", "mean", "credacc_credlmt_575A_mean"),
    ("credacc_maxhisbal_375A", "max", "credacc_maxhisbal_375A_max"),
    ("credacc_minhisbal_90A", "min", "credacc_minhisbal_90A_min"),
    ("credacc_transactions_402L", "sum", "credacc_transactions_402L_sum"),
    ("credamount_590A", "mean", "credamount_590A_mean"),
    ("currdebt_94A", "mean", "currdebt_94A_mean"),
    ("downpmt_134A", "sum", "downpmt_134A_sum"),
    ("mainoccupationinc_437A", "mean", "mainoccupationinc_437A_mean"),
    ("outstandingdebt_522A", "sum", "outstandingdebt_522A_sum"),
    ("pmtnum_8L", "max", "pmtnum_8L_max"),
    ("tenor_203L", "min", "tenor_203L_min"),
    ("isbidproduct_390L", _sum_as_uint32, "isbidproduct_390L_sum"),
    ("isdebitcard_527L", _sum_as_uint32, "isdebitcard_527L_sum"),
    ("credacc_status_367L", "n_unique", "credacc_status_367L_n_unique"),
    ("credtype_587L", "n_unique", "credtype_587L_n_unique"),
    ("education_1138M", "n_unique", "education_1138M_n_unique"),
    ("familystate_726L", "n_unique", "familystate_726L_n_unique"),
    ("postype_4733339M", "n_unique", "postype_4733339M_n_unique"),
    ("profession_152M", "n_unique", "profession_152M_n_unique"),
    ("rejectreason_755M", "n_unique", "rejectreason_755M_n_unique"),
    ("rejectreasonclient_4145042M", "n_unique", "rejectreasonclient_4145042M_n_unique"),
    ("status_219L", "n_unique", "status_219L_n_unique"),
    ("approvaldate_319D", _smallest_gap, "approval_to_activation_min_diff"),
    ("creationdate_885D", _smallest_gap, "creation_min_diff"),
    ("dtlastpmt_581D", _largest_gap, "payment_max_diff"),
    ("employedfrom_700D", "min", "earliest_employment_date"),
    ("byoccupationinc_3656910L", "n_unique", "byoccupationinc_3656910L_n_unique"),
    ("cancelreason_3545846M", "n_unique", "cancelreason_3545846M_n_unique"),
    ("district_544M", "n_unique", "district_544M_n_unique"),
    ("dtlastpmtallstes_3545839D", "min", "earliest_last_payment_date"),
    ("firstnonzeroinstldate_307D", "min", "earliest_first_nonzero_installment_date"),
    ("inittransactioncode_279L", "n_unique", "inittransactioncode_279L_n_unique"),
    ("maxdpdtolerance_577P", "max", "maximum_dpd_tolerance"),
    ("revolvingaccount_394A", "sum", "sum_revolving_accounts"),
]

# --- Step 3: run the lazy plan and materialise one row per case_id ---------
train_applprev_1 = (
    train_applprev_1
    .with_columns(_applprev_parsed_dates)
    .group_by("case_id")
    .agg([_to_agg_expr(*spec) for spec in _applprev_agg_specs])
    .collect()
)

In [5]:
# Lazily scan train_other_1; no rows are read until .collect() is called.
train_other_1 = pl.scan_csv("Data/csv_files/train/train_other_1.csv")

In [6]:
# Reduce train_other_1 to one row per case_id: sums for the four
# incoming/outgoing amount columns and the mean of the deposit balance.
_other_aggs = [
    pl.sum("amtdebitincoming_4809443A").alias("sum_amtdebitincoming"),
    pl.sum("amtdebitoutgoing_4809440A").alias("sum_amtdebitoutgoing"),
    pl.mean("amtdepositbalance_4809441A").alias("avg_amtdepositbalance"),
    pl.sum("amtdepositincoming_4809444A").alias("sum_amtdepositincoming"),
    pl.sum("amtdepositoutgoing_4809442A").alias("sum_amtdepositoutgoing"),
]

# Execute the lazy plan straight away; the result is an eager DataFrame.
train_other_1 = train_other_1.group_by("case_id").agg(_other_aggs).collect()

In [7]:
# Left-join the train_other_1 aggregates onto the applprev aggregates so every
# case_id present in train_applprev_1 is kept.
train_applprev_1, train_other_1 = train_applprev_1.lazy(), train_other_1.lazy()

df_joined = train_applprev_1.join(train_other_1, how="left", on="case_id").collect()

In [8]:
# Lazily scan train_deposit_1; no rows are read until .collect() is called.
train_deposit_1 = pl.scan_csv("Data/csv_files/train/train_deposit_1.csv")

In [9]:
# Per-case_id deposit features. count() counts non-null values only, so the
# two counters are the number of rows with an opening date and with a
# contract end date respectively.
_deposit_aggs = [
    pl.mean("amount_416A").alias("average_amount"),
    pl.col("openingdate_313D").count().alias("open_contracts_count"),
    pl.col("contractenddate_991D").count().alias("closed_contracts_count"),
]
transformations = train_deposit_1.group_by("case_id").agg(_deposit_aggs)

# Materialise the lazy plan into an eager DataFrame.
train_deposit_1 = transformations.collect()

In [10]:
# Left-join the deposit aggregates onto the running feature table by case_id.
train_deposit_1 = train_deposit_1.lazy()

df_joined = (
    df_joined.lazy()
    .join(train_deposit_1, how="left", on="case_id")
    .collect()
)

In [11]:
# Eagerly load train_person_1 into memory.
train_person_1 = pl.read_csv("Data/csv_files/train/train_person_1.csv")

In [12]:
# Per-case_id aggregations over train_person_1. Every output is aliased to a
# new name, so none of the raw column names survive the aggregation.
transformations_2 = [
    pl.col("birth_259D").n_unique().alias("unique_birth_dates"),
    pl.col("birthdate_87D").n_unique().alias("unique_birth_dates_87D"),
    pl.col("childnum_185L").max().alias("max_children"),
    pl.col("education_927M").n_unique().alias("unique_educations"),
    pl.col("empl_employedtotal_800L").mean().alias("avg_employment_length"),
    pl.col("mainoccupationinc_384A").sum().alias("total_main_income"),
    pl.col("gender_992L").n_unique().alias("unique_genders"),
    pl.col("housetype_905L").n_unique().alias("unique_house_types"),
    pl.col("housingtype_772L").n_unique().alias("unique_housing_types"),
    pl.col("incometype_1044T").n_unique().alias("unique_income_types"),
    pl.col("maritalst_703L").n_unique().alias("unique_marital_statuses"),
    pl.col("persontype_1072L").n_unique().alias("unique_person_types_1072L"),
    pl.col("persontype_792L").n_unique().alias("unique_person_types_792L"),
    pl.col("relationshiptoclient_415T").n_unique().alias("unique_relationships_415T"),
    pl.col("relationshiptoclient_642T").n_unique().alias("unique_relationships_642T"),
    pl.col("remitter_829L").sum().alias("sum_remitters"),
    pl.col("role_1084L").n_unique().alias("unique_roles_1084L"),
    pl.col("role_993L").n_unique().alias("unique_roles_993L"),
    pl.col("safeguarantyflag_411L").sum().alias("sum_safeguaranty_flags"),
    pl.col("sex_738L").n_unique().alias("unique_sexes"),
    pl.col("type_25L").n_unique().alias("unique_contact_types"),
    pl.col("contaddr_district_15M").n_unique().alias("unique_contact_address_districts"),
    pl.col("empladdr_district_926M").n_unique().alias("unique_employer_address_districts"),
    pl.col("registaddr_district_1083M").n_unique().alias("unique_registered_address_districts"),
    pl.col("isreference_387L").sum().alias("sum_is_reference_flags"),
    pl.col("empl_industry_691L").n_unique().alias("unique_industries"),
    pl.col("empladdr_zipcode_114M").n_unique().alias("unique_employer_zipcodes"),
    pl.col("contaddr_zipcode_807M").n_unique().alias("unique_contact_zipcodes"),
    pl.col("registaddr_zipcode_184M").n_unique().alias("unique_registered_zipcodes"),
    pl.col("language1_981M").n_unique().alias("unique_languages"),
    pl.col("familystate_447L").n_unique().alias("unique_family_states"),
    pl.col("contaddr_matchlist_1032L").sum().alias("sum_contact_address_matchlist"),
    pl.col("contaddr_smempladdr_334L").sum().alias("sum_contact_same_employer_address"),
    pl.col("personindex_1023L").n_unique().alias("unique_person_indices")
]

# group_by(...).agg(...) returns only "case_id" plus the aliased outputs above,
# so the raw person columns are already gone. The former follow-up
# `.drop(columns_to_drop)` therefore referenced columns that no longer exist
# (a silent no-op on older polars, a ColumnNotFoundError once drop() is
# strict) and has been removed.
train_person_1 = train_person_1.group_by("case_id").agg(transformations_2)

In [13]:
# Left-join the person aggregates onto the running feature table by case_id.
train_person_1 = train_person_1.lazy()

df_joined = (
    df_joined.lazy()
    .join(train_person_1, how="left", on="case_id")
    .collect()
)

In [14]:
# Eagerly load train_debitcard_1 into memory.
train_debitcard_1 = pl.read_csv("Data/csv_files/train/train_debitcard_1.csv")

In [15]:
# Columns that are cast to Float64 with nulls replaced by 0 before summing.
_debitcard_amount_columns = (
    "last180dayaveragebalance_704A",
    "last180dayturnover_1134A",
    "last30dayturnover_651A",
)

train_debitcard_1 = (
    train_debitcard_1
    .with_columns([
        pl.col(name).cast(pl.Float64).fill_null(0).alias(name)
        for name in _debitcard_amount_columns
    ])
    # Strict parse: a value that does not match "%Y-%m-%d" raises.
    .with_columns(
        pl.col("openingdate_857D").str.strptime(pl.Date, "%Y-%m-%d").alias("parsed_openingdate")
    )
    .drop("openingdate_857D")
    # One row per case_id: summed amounts and the earliest opening date.
    .group_by("case_id")
    .agg([
        pl.col("last180dayaveragebalance_704A").sum().alias("total_180dayaveragebalance"),
        pl.col("last180dayturnover_1134A").sum().alias("total_180dayturnover"),
        pl.col("last30dayturnover_651A").sum().alias("total_30dayturnover"),
        pl.col("parsed_openingdate").min().alias("earliest_openingdate"),
    ])
)

In [16]:
# Left-join the debit-card aggregates onto the running feature table by case_id.
train_debitcard_1 = train_debitcard_1.lazy()

df_joined = (
    df_joined.lazy()
    .join(train_debitcard_1, how="left", on="case_id")
    .collect()
)

In [17]:
# Columns produced by date.diff().abs().min()/max() in the applprev aggregation.
duration_columns = [
    "approval_to_activation_min_diff",
    "creation_min_diff",
    "payment_max_diff"
]

MINUTES_PER_DAY = 1440


def _duration_as_minutes(column, dtype):
    """Return an Int64 expression with the column's value in minutes.

    Args:
        column: Name of the column to convert.
        dtype: The column's current dtype in ``df_joined``.

    Returns:
        An expression that keeps the column's name. Duration columns are
        converted directly; any other dtype goes through the legacy text path.
    """
    if dtype == pl.Duration:
        # Differences of Date columns are Duration values when the frame comes
        # straight from the aggregation; the .str namespace would raise here.
        return pl.col(column).dt.total_minutes()
    # Legacy path: text such as "5d" (e.g. after a CSV round trip) -> days -> minutes.
    return pl.col(column).str.replace("d", "").cast(pl.Int64) * MINUTES_PER_DAY


df_joined = df_joined.with_columns([
    _duration_as_minutes(column, df_joined.schema[column])
    for column in duration_columns
])

In [18]:
# Split the current columns by dtype: Date columns are converted to integers
# further down, Utf8 (string) columns are dropped.
column_names = df_joined.columns
column_types = df_joined.dtypes

date_columns = []
string_columns = []
for name, dtype in zip(column_names, column_types):
    if dtype == pl.Date:
        date_columns.append(name)
    elif dtype == pl.Utf8:
        string_columns.append(name)

def convert_to_ordinal(date):
    """Map a Date expression to the integer ``year*365 + month*30 + day``.

    This is an approximation, not a true calendar ordinal: months are treated
    as 30 days long, so neighbouring dates can collide (January 31 and
    February 1 both contribute 61 within the same year).

    Args:
        date: Polars expression of dtype ``pl.Date``.

    Returns:
        An Int32 expression; null inputs stay null.
    """
    # dt.month()/dt.day() are narrow integers (documented as Int8); widen all
    # components first so that month * 30 cannot overflow before the sum.
    year = date.dt.year().cast(pl.Int32)
    month = date.dt.month().cast(pl.Int32)
    day = date.dt.day().cast(pl.Int32)
    return pl.when(date.is_not_null()).then(
        (year * 365) + (month * 30) + day
    ).otherwise(None)

# Replace every Date column with its integer encoding (same column name),
# then remove all string columns from the feature table.
df_joined = df_joined.with_columns([
    convert_to_ordinal(pl.col(date_col)).alias(date_col)
    for date_col in date_columns
]).drop(string_columns)

In [19]:
# Final depth-1 feature table: one row per case_id from train_applprev_1 with
# the other/deposit/person/debitcard aggregates left-joined onto it.
train_DEPTH_1 = df_joined

# NOTE: machine-specific absolute path; adjust when running elsewhere.
directory = r"C:\Users\afise\.git\CreditRiskModel\Merged_Data"
filename = "train_DEPTH_1.csv"
file_path = os.path.join(directory, filename)

# exist_ok=True creates the directory if needed without a separate
# exists() check, so there is no window between the check and the creation.
os.makedirs(directory, exist_ok=True)

train_DEPTH_1.write_csv(file_path)