In [None]:

%pip install polars

import pandas as pd
import polars as pl
import lightgbm as lgb
import matplotlib.pyplot as plt
import json
from pandas.api.types import CategoricalDtype
from datetime import datetime
from datetime import date
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_curve, roc_auc_score

class DataPipeline_Depth_1:
    """Aggregate the depth-1 training tables into one feature row per ``case_id``.

    The pipeline reads every parquet file eagerly (recording each column's
    dtype in ``global_schema``), turns each table into a lazy frame, reduces
    it to one row per ``case_id`` and finally left-joins all reduced tables
    onto the aggregated previous-applications table.
    """

    # Text date formats tried, in this order, when a date column holds strings.
    _DATE_FORMATS = ("%m/%d/%Y", "%Y-%m-%d")

    def __init__(self, applprev_paths, other_path, deposit_path, person_path, debitcard_path, tax_registry_a_1_path, tax_registry_b_1_path,
                                    tax_registry_c_1_path, credit_bureau_a_1_paths, credit_bureau_b_1_path, schema_path):
        """Store the input locations.

        ``applprev_paths`` and ``credit_bureau_a_1_paths`` are iterables of
        parquet paths that get concatenated; every other ``*_path`` is a
        single parquet path. ``schema_path`` is where ``save_schema`` writes
        the collected column -> dtype mapping as JSON.
        """
        self.applprev_paths = applprev_paths
        self.other_path = other_path
        self.deposit_path = deposit_path
        self.person_path = person_path
        self.debitcard_path = debitcard_path
        self.tax_registry_a_1_path = tax_registry_a_1_path
        self.tax_registry_b_1_path = tax_registry_b_1_path
        self.tax_registry_c_1_path = tax_registry_c_1_path
        self.credit_bureau_a_1_paths = credit_bureau_a_1_paths
        self.credit_bureau_b_1_path = credit_bureau_b_1_path
        self.schema_path = schema_path
        # column name -> str(dtype), first occurrence wins (see update_schema).
        self.global_schema = {}

    @staticmethod
    def try_parse_date(col, fmt1, fmt2):
        """Return an expression parsing ``col`` with ``fmt1``, falling back to ``fmt2``.

        Both parses are non-strict, so a value matching neither format
        becomes null instead of raising.
        """
        date1 = col.str.strptime(pl.Date, fmt1, strict=False)
        date2 = col.str.strptime(pl.Date, fmt2, strict=False)
        return pl.when(date1.is_not_null()).then(date1).otherwise(date2)

    @staticmethod
    def convert_to_ordinal(date):
        """Return ``year * 365 + month * 30 + day`` for a date expression.

        This is an approximate day count, not a calendar ordinal. Null dates
        stay null.
        """
        # The components are widened before the arithmetic: polars may return
        # month/day as 8-bit integers, in which case ``month * 30`` would
        # overflow for the later months of the year.
        year = date.dt.year().cast(pl.Int64)
        month = date.dt.month().cast(pl.Int64)
        day = date.dt.day().cast(pl.Int64)
        return pl.when(date.is_not_null()).then(
            (year * 365) + (month * 30) + day
        ).otherwise(None)

    def load_data(self):
        """Read every input table and keep it as a lazy frame on ``self``."""
        self.train_applprev_1 = pl.concat([self.load_and_update_schema(path) for path in self.applprev_paths]).lazy()
        self.train_other_1 = self.load_and_update_schema(self.other_path).lazy()
        self.train_deposit_1 = self.load_and_update_schema(self.deposit_path).lazy()
        self.train_person_1 = self.load_and_update_schema(self.person_path).lazy()
        self.train_debitcard_1 = self.load_and_update_schema(self.debitcard_path).lazy()
        self.train_tax_registry_a_1 = self.load_and_update_schema(self.tax_registry_a_1_path).lazy()
        self.train_tax_registry_b_1 = self.load_and_update_schema(self.tax_registry_b_1_path).lazy()
        self.train_tax_registry_c_1 = self.load_and_update_schema(self.tax_registry_c_1_path).lazy()
        self.train_credit_bureau_a_1 = pl.concat([self.load_and_update_schema(path) for path in self.credit_bureau_a_1_paths]).lazy()
        self.train_credit_bureau_b_1 = self.load_and_update_schema(self.credit_bureau_b_1_path).lazy()

    def load_and_update_schema(self, path):
        """Read one parquet file eagerly and record its column dtypes."""
        df = pl.read_parquet(path)
        self.update_schema(df)
        return df

    def update_schema(self, dataframe):
        """Record ``str(dtype)`` for every column not already in ``global_schema``."""
        for col, dtype in zip(dataframe.columns, dataframe.dtypes):
            # First occurrence wins; later tables never overwrite an entry.
            self.global_schema.setdefault(col, str(dtype))

    def save_schema(self):
        """Write ``global_schema`` to ``schema_path`` as JSON."""
        with open(self.schema_path, 'w') as file:
            json.dump(self.global_schema, file)

    def _aggregate_credit_bureau(self, frame):
        """Reduce one credit-bureau lazy frame to one row per ``case_id``.

        For every column whose name ends in ``D`` an ``ordinal_<col>`` column
        is added first. Columns ending in ``A`` or ``P`` are then summed
        (``total_<col>``) and every other column is reduced to its number of
        distinct values (``unique_<col>``).
        """
        dtype_by_column = dict(zip(frame.columns, frame.dtypes))

        ordinal_exprs = []
        for col in dtype_by_column:
            if not col.endswith("D"):
                continue
            expr = pl.col(col)
            if dtype_by_column[col] == pl.Utf8:
                # The other tables in this pipeline parse their date columns
                # from text before using them; do the same here when the
                # column is a string so the ``.dt`` accessors are valid.
                expr = self.try_parse_date(expr, *self._DATE_FORMATS)
            ordinal_exprs.append(self.convert_to_ordinal(expr).alias(f"ordinal_{col}"))
        if ordinal_exprs:
            frame = frame.with_columns(ordinal_exprs)

        aggregations = []
        for col in frame.columns:
            if col == "case_id":
                # The grouping key is already part of the output; its
                # per-group distinct count would always be 1 and the
                # resulting column would clash between the two bureau tables.
                continue
            if col.endswith(("A", "P")):
                aggregations.append(pl.col(col).sum().alias(f"total_{col}"))
            else:
                aggregations.append(pl.col(col).n_unique().alias(f"unique_{col}"))

        return frame.group_by("case_id").agg(aggregations)

    def preprocess_data(self):
        """Replace every loaded table by its per-``case_id`` aggregate."""

        date_columns = ["approvaldate_319D", "dateactivated_425D", "creationdate_885D", "dtlastpmt_581D", "employedfrom_700D", "dtlastpmtallstes_3545839D", "firstnonzeroinstldate_307D"]
        self.train_applprev_1 = self.train_applprev_1.with_columns([
            self.try_parse_date(pl.col(col), *self._DATE_FORMATS).alias(col) for col in date_columns
        ]).group_by("case_id").agg([
            pl.col("actualdpd_943P").mean().alias("actualdpd_943P_mean"),
            pl.col("annuity_853A").sum().alias("annuity_853A_sum"),
            pl.col("childnum_21L").sum().alias("childnum_21L_sum"),
            pl.col("credacc_actualbalance_314A").mean().alias("credacc_actualbalance_314A_mean"),
            pl.col("credacc_credlmt_575A").mean().alias("credacc_credlmt_575A_mean"),
            pl.col("credacc_maxhisbal_375A").max().alias("credacc_maxhisbal_375A_max"),
            pl.col("credacc_minhisbal_90A").min().alias("credacc_minhisbal_90A_min"),
            pl.col("credacc_transactions_402L").sum().alias("credacc_transactions_402L_sum"),
            pl.col("credamount_590A").mean().alias("credamount_590A_mean"),
            pl.col("currdebt_94A").mean().alias("currdebt_94A_mean"),
            pl.col("downpmt_134A").sum().alias("downpmt_134A_sum"),
            pl.col("mainoccupationinc_437A").mean().alias("mainoccupationinc_437A_mean"),
            pl.col("outstandingdebt_522A").sum().alias("outstandingdebt_522A_sum"),
            pl.col("pmtnum_8L").max().alias("pmtnum_8L_max"),
            pl.col("tenor_203L").min().alias("tenor_203L_min"),
            pl.col("isbidproduct_390L").cast(pl.UInt32).sum().alias("isbidproduct_390L_sum"),
            pl.col("isdebitcard_527L").cast(pl.UInt32).sum().alias("isdebitcard_527L_sum"),
            pl.col("credacc_status_367L").n_unique().alias("credacc_status_367L_n_unique"),
            pl.col("credtype_587L").n_unique().alias("credtype_587L_n_unique"),
            pl.col("education_1138M").n_unique().alias("education_1138M_n_unique"),
            pl.col("familystate_726L").n_unique().alias("familystate_726L_n_unique"),
            pl.col("postype_4733339M").n_unique().alias("postype_4733339M_n_unique"),
            pl.col("profession_152M").n_unique().alias("profession_152M_n_unique"),
            pl.col("rejectreason_755M").n_unique().alias("rejectreason_755M_n_unique"),
            pl.col("rejectreasonclient_4145042M").n_unique().alias("rejectreasonclient_4145042M_n_unique"),
            pl.col("status_219L").n_unique().alias("status_219L_n_unique"),
            # NOTE(review): despite its name this is the smallest gap between
            # consecutive approval dates; dateactivated_425D is not involved.
            (pl.col("approvaldate_319D").diff().abs().min()).alias("approval_to_activation_min_diff"),
            (pl.col("creationdate_885D").diff().abs().min()).alias("creation_min_diff"),
            (pl.col("dtlastpmt_581D").diff().abs().max()).alias("payment_max_diff"),
            pl.col("employedfrom_700D").min().alias("earliest_employment_date"),
            pl.col("byoccupationinc_3656910L").n_unique().alias("byoccupationinc_3656910L_n_unique"),
            pl.col("cancelreason_3545846M").n_unique().alias("cancelreason_3545846M_n_unique"),
            pl.col("district_544M").n_unique().alias("district_544M_n_unique"),
            pl.col("dtlastpmtallstes_3545839D").min().alias("earliest_last_payment_date"),
            pl.col("firstnonzeroinstldate_307D").min().alias("earliest_first_nonzero_installment_date"),
            pl.col("inittransactioncode_279L").n_unique().alias("inittransactioncode_279L_n_unique"),
            pl.col("maxdpdtolerance_577P").max().alias("maximum_dpd_tolerance"),
            pl.col("revolvingaccount_394A").sum().alias("sum_revolving_accounts")
        ])

        self.train_other_1 = self.train_other_1.group_by("case_id").agg([
            pl.col("amtdebitincoming_4809443A").sum().alias("sum_amtdebitincoming"),
            pl.col("amtdebitoutgoing_4809440A").sum().alias("sum_amtdebitoutgoing"),
            pl.col("amtdepositbalance_4809441A").mean().alias("avg_amtdepositbalance"),
            pl.col("amtdepositincoming_4809444A").sum().alias("sum_amtdepositincoming"),
            pl.col("amtdepositoutgoing_4809442A").sum().alias("sum_amtdepositoutgoing")
        ])

        self.train_deposit_1 = self.train_deposit_1.group_by("case_id").agg([
            pl.col("amount_416A").mean().alias("average_amount"),
            pl.count("openingdate_313D").alias("open_contracts_count"),
            pl.count("contractenddate_991D").alias("closed_contracts_count")
        ])

        self.train_person_1 = self.train_person_1.with_columns(
            self.try_parse_date(pl.col("empl_employedfrom_271D"), *self._DATE_FORMATS).alias("empl_employedfrom_271D")
        )
        self.train_person_1 = self.train_person_1.with_columns(
            self.convert_to_ordinal(pl.col("empl_employedfrom_271D")).alias("ordinal_employedfrom_271D")
        )
        self.train_person_1 = self.train_person_1.group_by("case_id").agg([
            pl.col("birth_259D").n_unique().alias("unique_birth_dates"),
            pl.col("birthdate_87D").n_unique().alias("unique_birth_dates_87D"),
            pl.col("childnum_185L").max().alias("max_children"),
            pl.col("education_927M").n_unique().alias("unique_educations"),
            # NOTE(review): named "avg_" but computed as a distinct count.
            pl.col("empl_employedtotal_800L").n_unique().alias("avg_employment_length"),
            pl.col("mainoccupationinc_384A").sum().alias("total_main_income"),
            pl.col("gender_992L").n_unique().alias("unique_genders"),
            pl.col("housetype_905L").n_unique().alias("unique_house_types"),
            pl.col("housingtype_772L").n_unique().alias("unique_housing_types"),
            pl.col("incometype_1044T").n_unique().alias("unique_income_types"),
            pl.col("maritalst_703L").n_unique().alias("unique_marital_statuses"),
            pl.col("persontype_1072L").n_unique().alias("unique_person_types_1072L"),
            pl.col("persontype_792L").n_unique().alias("unique_person_types_792L"),
            pl.col("relationshiptoclient_415T").n_unique().alias("unique_relationships_415T"),
            pl.col("relationshiptoclient_642T").n_unique().alias("unique_relationships_642T"),
            pl.col("remitter_829L").sum().alias("sum_remitters"),
            pl.col("role_1084L").n_unique().alias("unique_roles_1084L"),
            pl.col("role_993L").n_unique().alias("unique_roles_993L"),
            pl.col("safeguarantyflag_411L").sum().alias("sum_safeguaranty_flags"),
            pl.col("sex_738L").n_unique().alias("unique_sexes"),
            pl.col("type_25L").n_unique().alias("unique_contact_types"),
            pl.col("contaddr_district_15M").n_unique().alias("unique_contact_address_districts"),
            pl.col("empladdr_district_926M").n_unique().alias("unique_employer_address_districts"),
            pl.col("registaddr_district_1083M").n_unique().alias("unique_registered_address_districts"),
            pl.col("isreference_387L").sum().alias("sum_is_reference_flags"),
            pl.col("empl_industry_691L").n_unique().alias("unique_industries"),
            pl.col("empladdr_zipcode_114M").n_unique().alias("unique_employer_zipcodes"),
            pl.col("contaddr_zipcode_807M").n_unique().alias("unique_contact_zipcodes"),
            pl.col("registaddr_zipcode_184M").n_unique().alias("unique_registered_zipcodes"),
            pl.col("language1_981M").n_unique().alias("unique_languages"),
            pl.col("familystate_447L").n_unique().alias("unique_family_states"),
            pl.col("contaddr_matchlist_1032L").sum().alias("sum_contact_address_matchlist"),
            pl.col("contaddr_smempladdr_334L").sum().alias("sum_contact_same_employer_address"),
            pl.col("personindex_1023L").n_unique().alias("unique_person_indices"),
            pl.col("ordinal_employedfrom_271D").max().alias("latest_employment_date_ordinal")
        ])

        self.train_debitcard_1 = self.train_debitcard_1.with_columns([
            self.convert_to_ordinal(
                pl.col("openingdate_857D").str.strptime(pl.Date, "%Y-%m-%d")
            ).alias("ordinal_openingdate")
        ])

        self.train_debitcard_1 = self.train_debitcard_1.group_by("case_id").agg([
            pl.col("last180dayaveragebalance_704A").sum().alias("total_180dayaveragebalance"),
            pl.col("last180dayturnover_1134A").sum().alias("total_180dayturnover"),
            pl.col("last30dayturnover_651A").sum().alias("total_30dayturnover"),
            pl.min("ordinal_openingdate").alias("earliest_openingdate")
        ])

        self.train_tax_registry_a_1 = self.train_tax_registry_a_1.with_columns([
            self.convert_to_ordinal(
                pl.col("recorddate_4527225D").str.strptime(pl.Date, "%Y-%m-%d")
            ).alias("ordinal_recorddate_4527225D")
        ])

        self.train_tax_registry_a_1 = self.train_tax_registry_a_1.group_by("case_id").agg([
            pl.col("amount_4527230A").sum().alias("total_amount_4527230A"),
            pl.col("ordinal_recorddate_4527225D").max().alias("ordinal_recorddate_4527225D"),
            pl.col("name_4527232M").n_unique().alias("unique_name_4527232M")
        ])

        self.train_tax_registry_b_1 = self.train_tax_registry_b_1.group_by("case_id").agg([
            pl.col("amount_4917619A").sum().alias("total_amount_4917619A"),
            pl.col("deductiondate_4917603D").n_unique().alias("unique_deductiondate_4917603D"),
            pl.col("name_4917606M").n_unique().alias("unique_name_4917606M")
        ])

        self.train_tax_registry_c_1 = self.train_tax_registry_c_1.group_by("case_id").agg([
            pl.col("pmtamount_36A").sum().alias("total_pmtamount_36A"),
            pl.col("processingdate_168D").n_unique().alias("unique_processingdate_168D"),
            pl.col("employername_160M").n_unique().alias("unique_employername_160M")
        ])

        # Each bureau table is reduced with expressions derived from its own
        # columns and the result is stored back on the matching attribute.
        self.train_credit_bureau_a_1 = self._aggregate_credit_bureau(self.train_credit_bureau_a_1)
        self.train_credit_bureau_b_1 = self._aggregate_credit_bureau(self.train_credit_bureau_b_1)

    def merge_data(self):
        """Left-join all aggregated tables onto applprev and collect the result.

        Only ``case_id`` values present in the aggregated applprev table
        appear in the output because every join is a left join from it.
        Duration columns are converted to minutes and any remaining
        ``pl.Date`` column to its approximate ordinal.
        """
        df_joined = self.train_applprev_1
        for right in (
            self.train_other_1,
            self.train_deposit_1,
            self.train_person_1,
            self.train_debitcard_1,
            self.train_tax_registry_a_1,
            self.train_tax_registry_b_1,
            self.train_tax_registry_c_1,
            self.train_credit_bureau_a_1,
            self.train_credit_bureau_b_1,
        ):
            df_joined = df_joined.join(right, on="case_id", how="left")

        # These columns come from ``Date.diff()`` and therefore hold
        # durations, not strings: take the whole days and scale to minutes.
        duration_columns = ["approval_to_activation_min_diff", "creation_min_diff", "payment_max_diff"]
        df_joined = df_joined.with_columns(
            [pl.col(column).dt.total_days() * 1440 for column in duration_columns]
        )

        date_columns = [
            name for name, dtype in zip(df_joined.columns, df_joined.dtypes) if dtype == pl.Date
        ]
        if date_columns:
            df_joined = df_joined.with_columns(
                [self.convert_to_ordinal(pl.col(col)).alias(col) for col in date_columns]
            )

        return df_joined.collect()

    def execute_pipeline(self):
        """Run load -> preprocess -> merge, save the schema, return the merged frame."""
        self.load_data()
        self.preprocess_data()
        merged_data = self.merge_data()
        self.save_schema()
        return merged_data

if __name__ == "__main__":
    # Where the collected column -> dtype mapping is written after the run.
    schema_path = "unified_schema_2.json"

    # Tables that are split across several parquet files.
    applprev_paths = [
        "Data/parquet_files/train/train_applprev_1_0.parquet",
        "Data/parquet_files/train/train_applprev_1_1.parquet"
    ]
    credit_bureau_a_1_paths = [
        "Data/parquet_files/train/train_credit_bureau_a_1_0.parquet",
        "Data/parquet_files/train/train_credit_bureau_a_1_1.parquet",
        "Data/parquet_files/train/train_credit_bureau_a_1_2.parquet",
        "Data/parquet_files/train/train_credit_bureau_a_1_3.parquet"
    ]

    # Tables stored in a single parquet file each.
    other_path = "Data/parquet_files/train/train_other_1.parquet"
    deposit_path = "Data/parquet_files/train/train_deposit_1.parquet"
    person_path = "Data/parquet_files/train/train_person_1.parquet"
    debitcard_path = "Data/parquet_files/train/train_debitcard_1.parquet"
    tax_registry_a_1_path = "Data/parquet_files/train/train_tax_registry_a_1.parquet"
    tax_registry_b_1_path = "Data/parquet_files/train/train_tax_registry_b_1.parquet"
    tax_registry_c_1_path = "Data/parquet_files/train/train_tax_registry_c_1.parquet"
    credit_bureau_b_1_path = "Data/parquet_files/train/train_credit_bureau_b_1.parquet"

    # Build the training pipeline and materialise the merged feature frame.
    pipeline = DataPipeline_Depth_1(
        applprev_paths=applprev_paths,
        other_path=other_path,
        deposit_path=deposit_path,
        person_path=person_path,
        debitcard_path=debitcard_path,
        tax_registry_a_1_path=tax_registry_a_1_path,
        tax_registry_b_1_path=tax_registry_b_1_path,
        tax_registry_c_1_path=tax_registry_c_1_path,
        credit_bureau_a_1_paths=credit_bureau_a_1_paths,
        credit_bureau_b_1_path=credit_bureau_b_1_path,
        schema_path=schema_path,
    )
    Depth_1 = pipeline.execute_pipeline()
    
class DataPipeline_Depth_1:
    """Aggregate depth-1 tables into one feature row per ``case_id``, using a saved schema.

    Unlike a pipeline that discovers dtypes while reading, this one loads a
    previously written column -> dtype JSON file and coerces every table it
    reads to that schema before aggregating and joining.
    """

    # Text date formats tried, in this order, when parsing date strings.
    _DATE_FORMATS = ("%m/%d/%Y", "%Y-%m-%d")

    def __init__(self, applprev_paths, other_path, deposit_path, person_path, debitcard_path, schema_path):
        """Store the input locations and load the schema from ``schema_path``.

        ``applprev_paths`` is an iterable of parquet paths that get
        concatenated; the other ``*_path`` arguments are single parquet
        paths. The schema file is read immediately, so it must already exist.
        """
        self.applprev_paths = applprev_paths
        self.other_path = other_path
        self.deposit_path = deposit_path
        self.person_path = person_path
        self.debitcard_path = debitcard_path
        self.schema_path = schema_path
        self.global_schema = self.load_schema()

    def load_schema(self):
        """Return the column -> dtype-name mapping stored as JSON at ``schema_path``."""
        with open(self.schema_path, 'r') as file:
            return json.load(file)

    @staticmethod
    def dtype_mapping(dtype_str):
        """Translate a dtype name into a polars dtype; unknown names map to ``pl.Utf8``."""
        mapping = {
            'Int32': pl.Int32,
            'Int64': pl.Int64,
            'Float32': pl.Float32,
            'Float64': pl.Float64,
            'Utf8': pl.Utf8,
            'Boolean': pl.Boolean,
            'Date': pl.Date,
            'Categorical': pl.Categorical
        }
        return mapping.get(dtype_str, pl.Utf8)

    @staticmethod
    def try_parse_date(col, fmt1, fmt2):
        """Return an expression parsing ``col`` with ``fmt1``, falling back to ``fmt2``.

        Both parses are non-strict, so a value matching neither format
        becomes null instead of raising.
        """
        date1 = col.str.strptime(pl.Date, fmt1, strict=False)
        date2 = col.str.strptime(pl.Date, fmt2, strict=False)
        return pl.when(date1.is_not_null()).then(date1).otherwise(date2)

    @staticmethod
    def convert_to_ordinal(date):
        """Return ``year * 365 + month * 30 + day`` for a date expression.

        This is an approximate day count, not a calendar ordinal. Null dates
        stay null.
        """
        # The components are widened before the arithmetic: polars may return
        # month/day as 8-bit integers, in which case ``month * 30`` would
        # overflow for the later months of the year.
        year = date.dt.year().cast(pl.Int64)
        month = date.dt.month().cast(pl.Int64)
        day = date.dt.day().cast(pl.Int64)
        return pl.when(date.is_not_null()).then(
            (year * 365) + (month * 30) + day
        ).otherwise(None)

    def load_and_ensure_schema(self, path):
        """Read one parquet file and coerce it to ``global_schema``.

        Columns present with a different dtype are cast to the expected one.
        Columns listed in the schema but absent from the file are added as
        all-null columns of the expected dtype, under their own name.
        """
        df = pl.read_parquet(path)
        present = set(df.columns)
        adjustments = []
        for col, expected_dtype in self.global_schema.items():
            expected_pl_dtype = self.dtype_mapping(expected_dtype)
            if col in present:
                if df[col].dtype != expected_pl_dtype:
                    adjustments.append(pl.col(col).cast(expected_pl_dtype))
            else:
                # The alias is essential: an unnamed literal would be stored
                # under the generic name "literal" rather than as ``col``.
                adjustments.append(pl.lit(None).cast(expected_pl_dtype).alias(col))
        if adjustments:
            df = df.with_columns(adjustments)
        return df

    def load_data(self):
        """Read every input table, schema-coerced, and keep it as a lazy frame on ``self``."""
        self.train_applprev_1 = pl.concat([self.load_and_ensure_schema(path) for path in self.applprev_paths]).lazy()
        self.train_other_1 = self.load_and_ensure_schema(self.other_path).lazy()
        self.train_deposit_1 = self.load_and_ensure_schema(self.deposit_path).lazy()
        self.train_person_1 = self.load_and_ensure_schema(self.person_path).lazy()
        self.train_debitcard_1 = self.load_and_ensure_schema(self.debitcard_path).lazy()

    def preprocess_data(self):
        """Replace every loaded table by its per-``case_id`` aggregate."""

        date_columns = ["approvaldate_319D", "dateactivated_425D", "creationdate_885D", "dtlastpmt_581D", "employedfrom_700D", "dtlastpmtallstes_3545839D", "firstnonzeroinstldate_307D"]
        self.train_applprev_1 = self.train_applprev_1.with_columns([
            self.try_parse_date(pl.col(col), *self._DATE_FORMATS).alias(col) for col in date_columns
        ]).group_by("case_id").agg([
            pl.col("actualdpd_943P").mean().alias("actualdpd_943P_mean"),
            pl.col("annuity_853A").sum().alias("annuity_853A_sum"),
            pl.col("childnum_21L").sum().alias("childnum_21L_sum"),
            pl.col("credacc_actualbalance_314A").mean().alias("credacc_actualbalance_314A_mean"),
            pl.col("credacc_credlmt_575A").mean().alias("credacc_credlmt_575A_mean"),
            pl.col("credacc_maxhisbal_375A").max().alias("credacc_maxhisbal_375A_max"),
            pl.col("credacc_minhisbal_90A").min().alias("credacc_minhisbal_90A_min"),
            pl.col("credacc_transactions_402L").sum().alias("credacc_transactions_402L_sum"),
            pl.col("credamount_590A").mean().alias("credamount_590A_mean"),
            pl.col("currdebt_94A").mean().alias("currdebt_94A_mean"),
            pl.col("downpmt_134A").sum().alias("downpmt_134A_sum"),
            pl.col("mainoccupationinc_437A").mean().alias("mainoccupationinc_437A_mean"),
            pl.col("outstandingdebt_522A").sum().alias("outstandingdebt_522A_sum"),
            pl.col("pmtnum_8L").max().alias("pmtnum_8L_max"),
            pl.col("tenor_203L").min().alias("tenor_203L_min"),
            pl.col("isbidproduct_390L").cast(pl.UInt32).sum().alias("isbidproduct_390L_sum"),
            pl.col("isdebitcard_527L").cast(pl.UInt32).sum().alias("isdebitcard_527L_sum"),
            pl.col("credacc_status_367L").n_unique().alias("credacc_status_367L_n_unique"),
            pl.col("credtype_587L").n_unique().alias("credtype_587L_n_unique"),
            pl.col("education_1138M").n_unique().alias("education_1138M_n_unique"),
            pl.col("familystate_726L").n_unique().alias("familystate_726L_n_unique"),
            pl.col("postype_4733339M").n_unique().alias("postype_4733339M_n_unique"),
            pl.col("profession_152M").n_unique().alias("profession_152M_n_unique"),
            pl.col("rejectreason_755M").n_unique().alias("rejectreason_755M_n_unique"),
            pl.col("rejectreasonclient_4145042M").n_unique().alias("rejectreasonclient_4145042M_n_unique"),
            pl.col("status_219L").n_unique().alias("status_219L_n_unique"),
            # NOTE(review): despite its name this is the smallest gap between
            # consecutive approval dates; dateactivated_425D is not involved.
            (pl.col("approvaldate_319D").diff().abs().min()).alias("approval_to_activation_min_diff"),
            (pl.col("creationdate_885D").diff().abs().min()).alias("creation_min_diff"),
            (pl.col("dtlastpmt_581D").diff().abs().max()).alias("payment_max_diff"),
            pl.col("employedfrom_700D").min().alias("earliest_employment_date"),
            pl.col("byoccupationinc_3656910L").n_unique().alias("byoccupationinc_3656910L_n_unique"),
            pl.col("cancelreason_3545846M").n_unique().alias("cancelreason_3545846M_n_unique"),
            pl.col("district_544M").n_unique().alias("district_544M_n_unique"),
            pl.col("dtlastpmtallstes_3545839D").min().alias("earliest_last_payment_date"),
            pl.col("firstnonzeroinstldate_307D").min().alias("earliest_first_nonzero_installment_date"),
            pl.col("inittransactioncode_279L").n_unique().alias("inittransactioncode_279L_n_unique"),
            pl.col("maxdpdtolerance_577P").max().alias("maximum_dpd_tolerance"),
            pl.col("revolvingaccount_394A").sum().alias("sum_revolving_accounts")
        ])

        self.train_other_1 = self.train_other_1.group_by("case_id").agg([
            pl.col("amtdebitincoming_4809443A").sum().alias("sum_amtdebitincoming"),
            pl.col("amtdebitoutgoing_4809440A").sum().alias("sum_amtdebitoutgoing"),
            pl.col("amtdepositbalance_4809441A").mean().alias("avg_amtdepositbalance"),
            pl.col("amtdepositincoming_4809444A").sum().alias("sum_amtdepositincoming"),
            pl.col("amtdepositoutgoing_4809442A").sum().alias("sum_amtdepositoutgoing")
        ])

        self.train_deposit_1 = self.train_deposit_1.group_by("case_id").agg([
            pl.col("amount_416A").mean().alias("average_amount"),
            pl.count("openingdate_313D").alias("open_contracts_count"),
            pl.count("contractenddate_991D").alias("closed_contracts_count")
        ])

        self.train_person_1 = self.train_person_1.with_columns(
            self.try_parse_date(pl.col("empl_employedfrom_271D"), *self._DATE_FORMATS).alias("empl_employedfrom_271D")
        )
        self.train_person_1 = self.train_person_1.with_columns(
            self.convert_to_ordinal(pl.col("empl_employedfrom_271D")).alias("ordinal_employedfrom_271D")
        )
        self.train_person_1 = self.train_person_1.group_by("case_id").agg([
            pl.col("birth_259D").n_unique().alias("unique_birth_dates"),
            pl.col("birthdate_87D").n_unique().alias("unique_birth_dates_87D"),
            pl.col("childnum_185L").max().alias("max_children"),
            pl.col("education_927M").n_unique().alias("unique_educations"),
            # NOTE(review): named "avg_" but computed as a distinct count.
            pl.col("empl_employedtotal_800L").n_unique().alias("avg_employment_length"),
            pl.col("mainoccupationinc_384A").sum().alias("total_main_income"),
            pl.col("gender_992L").n_unique().alias("unique_genders"),
            pl.col("housetype_905L").n_unique().alias("unique_house_types"),
            pl.col("housingtype_772L").n_unique().alias("unique_housing_types"),
            pl.col("incometype_1044T").n_unique().alias("unique_income_types"),
            pl.col("maritalst_703L").n_unique().alias("unique_marital_statuses"),
            pl.col("persontype_1072L").n_unique().alias("unique_person_types_1072L"),
            pl.col("persontype_792L").n_unique().alias("unique_person_types_792L"),
            pl.col("relationshiptoclient_415T").n_unique().alias("unique_relationships_415T"),
            pl.col("relationshiptoclient_642T").n_unique().alias("unique_relationships_642T"),
            pl.col("remitter_829L").sum().alias("sum_remitters"),
            pl.col("role_1084L").n_unique().alias("unique_roles_1084L"),
            pl.col("role_993L").n_unique().alias("unique_roles_993L"),
            pl.col("safeguarantyflag_411L").sum().alias("sum_safeguaranty_flags"),
            pl.col("sex_738L").n_unique().alias("unique_sexes"),
            pl.col("type_25L").n_unique().alias("unique_contact_types"),
            pl.col("contaddr_district_15M").n_unique().alias("unique_contact_address_districts"),
            pl.col("empladdr_district_926M").n_unique().alias("unique_employer_address_districts"),
            pl.col("registaddr_district_1083M").n_unique().alias("unique_registered_address_districts"),
            pl.col("isreference_387L").sum().alias("sum_is_reference_flags"),
            pl.col("empl_industry_691L").n_unique().alias("unique_industries"),
            pl.col("empladdr_zipcode_114M").n_unique().alias("unique_employer_zipcodes"),
            pl.col("contaddr_zipcode_807M").n_unique().alias("unique_contact_zipcodes"),
            pl.col("registaddr_zipcode_184M").n_unique().alias("unique_registered_zipcodes"),
            pl.col("language1_981M").n_unique().alias("unique_languages"),
            pl.col("familystate_447L").n_unique().alias("unique_family_states"),
            pl.col("contaddr_matchlist_1032L").sum().alias("sum_contact_address_matchlist"),
            pl.col("contaddr_smempladdr_334L").sum().alias("sum_contact_same_employer_address"),
            pl.col("personindex_1023L").n_unique().alias("unique_person_indices"),
            pl.col("ordinal_employedfrom_271D").max().alias("latest_employment_date_ordinal")
        ])

        self.train_debitcard_1 = self.train_debitcard_1.with_columns([
            self.convert_to_ordinal(
                pl.col("openingdate_857D").str.strptime(pl.Date, "%Y-%m-%d")
            ).alias("ordinal_openingdate")
        ])

        self.train_debitcard_1 = self.train_debitcard_1.group_by("case_id").agg([
            pl.col("last180dayaveragebalance_704A").sum().alias("total_180dayaveragebalance"),
            pl.col("last180dayturnover_1134A").sum().alias("total_180dayturnover"),
            pl.col("last30dayturnover_651A").sum().alias("total_30dayturnover"),
            pl.min("ordinal_openingdate").alias("earliest_openingdate")
        ])

    def merge_data(self):
        """Left-join all aggregated tables onto applprev and collect the result.

        Only ``case_id`` values present in the aggregated applprev table
        appear in the output because every join is a left join from it.
        Duration columns are converted to minutes and any remaining
        ``pl.Date`` column to its approximate ordinal.
        """
        df_joined = self.train_applprev_1
        for right in (
            self.train_other_1,
            self.train_deposit_1,
            self.train_person_1,
            self.train_debitcard_1,
        ):
            df_joined = df_joined.join(right, on="case_id", how="left")

        # These columns come from ``Date.diff()`` and therefore hold
        # durations, not strings: take the whole days and scale to minutes.
        duration_columns = ["approval_to_activation_min_diff", "creation_min_diff", "payment_max_diff"]
        df_joined = df_joined.with_columns(
            [pl.col(column).dt.total_days() * 1440 for column in duration_columns]
        )

        date_columns = [
            name for name, dtype in zip(df_joined.columns, df_joined.dtypes) if dtype == pl.Date
        ]
        if date_columns:
            df_joined = df_joined.with_columns(
                [self.convert_to_ordinal(pl.col(col)).alias(col) for col in date_columns]
            )

        return df_joined.collect()

    def execute_pipeline(self):
        """Run load -> preprocess -> merge and return the merged frame."""
        self.load_data()
        self.preprocess_data()
        return self.merge_data()

if __name__ == "__main__":
    # Schema file to load; the pipeline reads it in its constructor.
    schema_path = "unified_schema_2.json"

    # The previous-applications table is split across several parquet files.
    applprev_paths = [
        "Data/parquet_files/test/test_applprev_1_0.parquet",
        "Data/parquet_files/test/test_applprev_1_1.parquet",
        "Data/parquet_files/test/test_applprev_1_2.parquet"
    ]

    # Tables stored in a single parquet file each.
    debitcard_path = "Data/parquet_files/test/test_debitcard_1.parquet"
    person_path = "Data/parquet_files/test/test_person_1.parquet"
    deposit_path = "Data/parquet_files/test/test_deposit_1.parquet"
    other_path = "Data/parquet_files/test/test_other_1.parquet"

    # Build the test-set pipeline and materialise the merged feature frame.
    pipeline = DataPipeline_Depth_1(
        applprev_paths=applprev_paths,
        other_path=other_path,
        deposit_path=deposit_path,
        person_path=person_path,
        debitcard_path=debitcard_path,
        schema_path=schema_path,
    )
    Depth_1_test = pipeline.execute_pipeline()

























