# In [None]:
import pandas as pd
import polars as pl
from datetime import datetime

def convert_to_ordinal(date_column, date_format):
    """Build an expression counting whole days from 1970-01-01 to each date.

    Despite the name, the result is a day offset from the 1970-01-01 epoch,
    not a proleptic ordinal as returned by ``datetime.date.toordinal``.

    Args:
        date_column: Polars string expression holding the textual dates.
        date_format: ``strptime`` format used to parse ``date_column``.

    Returns:
        A Polars expression with the day count, and null wherever the
        non-strict parse produced null.
    """
    # Non-strict parsing: the call does not raise on values that do not match.
    parsed = date_column.str.strptime(pl.Date, date_format, strict=False)
    unix_epoch = pl.lit(datetime(1970, 1, 1))
    days_since_epoch = (parsed - unix_epoch).dt.total_days()
    has_date = parsed.is_not_null()
    return pl.when(has_date).then(days_since_epoch).otherwise(pl.lit(None))

class DataPipeline_Depth_0:
    """Load, preprocess and merge the base, static and static_cb_0 tables.

    The three inputs are parquet files. They are preprocessed independently
    and then left-joined onto the base table on ``case_id``.
    """

    # Columns that merge_data casts to Categorical and one-hot encodes.
    # preprocess_static must keep them even though it drops every other
    # string column, otherwise merge_data cannot find them.
    CATEGORICAL_COLUMNS = ('education_1103M', 'maritalst_385M')

    def __init__(self, base_path, static_paths, static_cb_0_path):
        """Store the input locations; nothing is read until the pipeline runs.

        Args:
            base_path: Path of the base parquet file.
            static_paths: Iterable of paths of the static parquet files,
                which are stacked vertically.
            static_cb_0_path: Path of the static_cb_0 parquet file.
        """
        self.base_path = base_path
        self.static_paths = static_paths
        self.static_cb_0_path = static_cb_0_path

    def load_data(self, path):
        """Read one parquet file, or several stacked into a single frame.

        Args:
            path: A single path, or a list/tuple of paths whose frames are
                concatenated vertically.

        Returns:
            The loaded frame after ``adjust_schema`` has been applied to
            each file.

        Raises:
            Exception: Any error raised while reading or concatenating is
                reported on stdout and then re-raised unchanged.
        """
        try:
            if isinstance(path, (list, tuple)):
                frames = [self.adjust_schema(pl.read_parquet(p)) for p in path]
                return pl.concat(frames, how='vertical')
            return self.adjust_schema(pl.read_parquet(path))
        except Exception as e:
            print(f"Error loading data from {path}: {e}")
            raise

    def adjust_schema(self, df: "pl.DataFrame") -> "pl.DataFrame":
        """Align ``df`` with the expected schema.

        Columns missing from ``df`` are added as all-null columns of the
        expected dtype; columns with a different dtype are cast to it.
        The expected schema is currently empty, so ``df`` is returned
        unchanged until entries are added.
        """
        expected_schema = {
            # Assuming schema defined elsewhere or load from training metadata
        }
        for col, dtype in expected_schema.items():
            if col not in df.columns:
                df = df.with_columns(pl.lit(None, dtype=dtype).alias(col))
            elif df[col].dtype != dtype:
                df = df.with_columns(df[col].cast(dtype))
        return df

    def preprocess_base(self, data):
        """Replace ``date_decision`` with a day-count column.

        Adds ``date_decision_ordinal`` (built by ``convert_to_ordinal`` with
        the ``%Y-%m-%d`` format) and drops the original ``date_decision``.
        """
        data = data.with_columns(
            convert_to_ordinal(pl.col('date_decision'), '%Y-%m-%d').alias('date_decision_ordinal')
        )
        return data.drop(['date_decision'])

    def preprocess_static(self, data):
        """Make a static table numeric, keeping the categorical columns.

        * String columns whose name ends in ``D`` are converted in place by
          ``convert_to_ordinal`` using the ``%Y-%m-%d`` format.
        * Boolean columns are cast to ``Int32``.
        * Remaining string columns are dropped, except those listed in
          ``CATEGORICAL_COLUMNS``, which ``merge_data`` encodes later.

        Column order of the surviving columns is preserved.
        """
        schema = data.schema
        date_columns = [
            name for name, dtype in schema.items()
            if name.endswith('D') and dtype == pl.Utf8
        ]
        bool_columns = [name for name, dtype in schema.items() if dtype == pl.Boolean]

        # Apply every conversion in one pass instead of one frame per column.
        conversions = [
            convert_to_ordinal(pl.col(name), '%Y-%m-%d').alias(name)
            for name in date_columns
        ]
        conversions.extend(pl.col(name).cast(pl.Int32) for name in bool_columns)
        if conversions:
            data = data.with_columns(conversions)

        # Date columns are no longer strings here, so they pass the dtype test.
        columns_to_keep = [
            name for name, dtype in data.schema.items()
            if dtype != pl.Utf8 or name in self.CATEGORICAL_COLUMNS
        ]
        return data.select(columns_to_keep)

    def preprocess_static_cb_0(self, data):
        """Preprocess the static_cb_0 table exactly like a static table."""
        return self.preprocess_static(data)

    def merge_data(self, data_base, data_static, data_static_cb_0):
        """Left-join both static tables onto the base table and one-hot encode.

        The joins use ``case_id``. The columns in ``CATEGORICAL_COLUMNS`` are
        cast to Categorical, replaced by their dummy columns, and the dummy
        columns are appended after all remaining columns.

        Raises:
            Whatever polars raises when a column in ``CATEGORICAL_COLUMNS``
            is absent from the merged frame.
        """
        merged_data = data_base.join(data_static, on='case_id', how='left')
        merged_data = merged_data.join(data_static_cb_0, on='case_id', how='left')

        categorical = list(self.CATEGORICAL_COLUMNS)
        merged_data = merged_data.with_columns(
            [pl.col(name).cast(pl.Categorical) for name in categorical]
        )
        dummies = merged_data.select(categorical).to_dummies()
        merged_data = merged_data.drop(categorical)
        return pl.concat([merged_data, dummies], how='horizontal')

    def execute_pipeline(self):
        """Run load, preprocess and merge; return the merged frame."""
        data_base = self.load_data(self.base_path)
        data_static = pl.concat([self.load_data(p) for p in self.static_paths], how='vertical')
        data_static_cb_0 = self.load_data(self.static_cb_0_path)

        data_base = self.preprocess_base(data_base)
        data_static = self.preprocess_static(data_static)
        data_static_cb_0 = self.preprocess_static_cb_0(data_static_cb_0)

        Depth_0 = self.merge_data(data_base, data_static, data_static_cb_0)
        return Depth_0

if __name__ == "__main__":
    # Hard-coded locations of the test-split parquet files on the local disk.
    base_path = "C:/Users/afise/.git/CreditRiskModel/Data/parquet_files/test/test_base.parquet"
    static_cb_0_path = "C:/Users/afise/.git/CreditRiskModel/Data/parquet_files/test/test_static_cb_0.parquet"
    # The static table is split across three files.
    static_paths = [
        "C:/Users/afise/.git/CreditRiskModel/Data/parquet_files/test/test_static_0_0.parquet",
        "C:/Users/afise/.git/CreditRiskModel/Data/parquet_files/test/test_static_0_1.parquet",
        "C:/Users/afise/.git/CreditRiskModel/Data/parquet_files/test/test_static_0_2.parquet",
    ]

    # Build the pipeline and run it end to end; Depth_0 holds the merged frame.
    pipeline = DataPipeline_Depth_0(
        base_path=base_path,
        static_paths=static_paths,
        static_cb_0_path=static_cb_0_path,
    )
    Depth_0 = pipeline.execute_pipeline()
