In [None]:
%pip install polars

import pandas as pd
import polars as pl
import lightgbm as lgb
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import date
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_curve, roc_auc_score

def convert_to_ordinal(date_column, date_format):
    """Express a string date expression as whole days since 1970-01-01.

    Args:
        date_column: Polars expression whose values are date strings.
        date_format: ``strptime`` format used to parse the strings.

    Returns:
        A Polars expression holding the day count relative to 1970-01-01,
        or null for rows where the parsed date is null (parsing is
        non-strict, so it does not raise on bad values).
    """
    parsed = date_column.str.strptime(pl.Date, date_format, strict=False)
    unix_epoch = pl.lit(datetime(1970, 1, 1))
    days_since_epoch = (parsed - unix_epoch).dt.total_days()
    # Rows that failed to parse stay null instead of producing a day count.
    return (
        pl.when(parsed.is_null())
        .then(pl.lit(None))
        .otherwise(days_since_epoch)
    )

class DataPipeline_Depth_0:
    """Assemble one table from a base file, static shards and a static_cb_0 file.

    The static shards are stacked, the static_cb_0 table is cleaned, and both
    are left-joined onto the base table by ``case_id``. The two columns named
    in ``CATEGORICAL_COLUMNS`` are one-hot encoded in the merged result.
    """

    # Columns of the static_cb_0 table that are kept as categoricals and
    # one-hot encoded after the merge.
    CATEGORICAL_COLUMNS = ('education_1103M', 'maritalst_385M')
    # Format of every date string parsed by this pipeline.
    DATE_FORMAT = '%Y-%m-%d'

    def __init__(self, base_path, static_paths, static_cb_0_path):
        """Store the parquet locations used by the pipeline.

        Args:
            base_path: Path of the base parquet file.
            static_paths: Iterable of static shard paths; ``None`` entries
                are skipped when loading.
            static_cb_0_path: Path of the static_cb_0 parquet file.
        """
        self.base_path = base_path
        self.static_paths = static_paths
        self.static_cb_0_path = static_cb_0_path

    def load_data(self, path):
        """Read one parquet file into a Polars DataFrame.

        Any failure is printed with the offending path and then re-raised
        unchanged, so callers still see the original exception.
        """
        try:
            return pl.read_parquet(path)
        except Exception as e:
            print(f"Error loading data from {path}: {e}")
            raise

    def preprocess_base(self, data):
        """Replace ``date_decision`` with ``date_decision_ordinal``.

        The new column is the day count produced by ``convert_to_ordinal``;
        the original string column is dropped.
        """
        data = data.with_columns(
            convert_to_ordinal(pl.col('date_decision'), self.DATE_FORMAT).alias('date_decision_ordinal')
        )
        return data.drop(['date_decision'])

    def preprocess_static(self):
        """Load, stack and clean every static shard.

        Returns:
            A DataFrame in which string date columns (names ending in ``D``)
            are day ordinals, booleans are ``Int32`` and all remaining string
            columns are removed.

        Raises:
            ValueError: If ``static_paths`` yields no usable path.
        """
        data_frames = [self.load_data(path) for path in self.static_paths if path is not None]
        if not data_frames:
            raise ValueError("No static data paths were provided")

        # A diagonal concat takes the union of the shard schemas and fills
        # columns a shard lacks with nulls of the right dtype. Polars frames
        # do not support ``df[col] = ...`` assignment, so padding the shards
        # by hand is not an option.
        data = pl.concat(data_frames, how='diagonal')

        date_columns = [col for col in data.columns if col.endswith('D')]
        # Only string columns can be parsed with ``str.strptime``.
        string_date_columns = [col for col in date_columns if data[col].dtype == pl.Utf8]
        data = data.with_columns(
            [convert_to_ordinal(pl.col(col), self.DATE_FORMAT).alias(col) for col in string_date_columns]
        )

        boolean_columns = [col for col in data.columns if data[col].dtype == pl.Boolean]
        data = data.with_columns([pl.col(col).cast(pl.Int32) for col in boolean_columns])

        # Drop free-text columns; date columns are always kept.
        columns_to_keep = [col for col in data.columns if data[col].dtype != pl.Utf8 or col in date_columns]
        return data.select(columns_to_keep)

    def preprocess_static_cb_0(self, data):
        """Clean the static_cb_0 table.

        String date columns (names ending in ``D``) become day ordinals, the
        columns in ``CATEGORICAL_COLUMNS`` that are present become
        categoricals, and every other string column is dropped.
        """
        date_columns = [col for col in data.columns if col.endswith('D') and data[col].dtype == pl.Utf8]
        data = data.with_columns(
            [convert_to_ordinal(pl.col(col), self.DATE_FORMAT).alias(col) for col in date_columns]
        )

        categorical_columns = [col for col in self.CATEGORICAL_COLUMNS if col in data.columns]
        data = data.with_columns([pl.col(col).cast(pl.Categorical) for col in categorical_columns])

        columns_to_drop = [
            col for col in data.columns
            if data[col].dtype == pl.Utf8
            and col not in date_columns
            and col not in self.CATEGORICAL_COLUMNS
        ]
        return data.drop(columns_to_drop)

    def merge_data(self, data_base, data_static, data_static_cb_0):
        """Left-join both static tables onto the base table and one-hot encode.

        Args:
            data_base: Base table containing ``case_id``.
            data_static: Preprocessed static table containing ``case_id``.
            data_static_cb_0: Preprocessed static_cb_0 table containing
                ``case_id``.

        Returns:
            The merged DataFrame, with each present column of
            ``CATEGORICAL_COLUMNS`` replaced by its dummy columns.
        """
        merged_data = data_base.join(data_static, on='case_id', how='left')
        merged_data = merged_data.join(data_static_cb_0, on='case_id', how='left')

        # The categorical columns are optional upstream, so only encode the
        # ones that actually made it into the merged table.
        present = [col for col in self.CATEGORICAL_COLUMNS if col in merged_data.columns]
        if not present:
            return merged_data

        merged_data = merged_data.with_columns(
            [pl.col(col).cast(pl.Categorical) for col in present if merged_data[col].dtype != pl.Categorical]
        )
        dummies = merged_data.select(present).to_dummies()
        merged_data = merged_data.drop(present)
        return pl.concat([merged_data, dummies], how='horizontal')

    def execute_pipeline(self):
        """Run load, preprocess and merge; return the merged DataFrame."""
        # NOTE(review): preprocess_base is not applied here, so the base
        # table's 'date_decision' column is returned unconverted - confirm
        # whether that is intended.
        data_base = self.load_data(self.base_path)
        data_static = self.preprocess_static()
        data_static_cb_0 = self.preprocess_static_cb_0(self.load_data(self.static_cb_0_path))
        Depth_0 = self.merge_data(data_base, data_static, data_static_cb_0)
        return Depth_0

if __name__ == "__main__":
    # All inputs live under one parquet root, split into train/ and test/.
    data_root = "C:/Users/afise/.git/CreditRiskModel/Data/parquet_files"
    train_dir = f"{data_root}/train"
    test_dir = f"{data_root}/test"

    # Training inputs: base table, two static shards, static_cb_0 table.
    train_base_path = f"{train_dir}/train_base.parquet"
    train_static_paths = [f"{train_dir}/train_static_0_{shard}.parquet" for shard in range(2)]
    train_static_cb_0_path = f"{train_dir}/train_static_cb_0.parquet"

    # Test inputs: base table, three static shards, static_cb_0 table.
    test_base_path = f"{test_dir}/test_base.parquet"
    test_static_paths = [f"{test_dir}/test_static_0_{shard}.parquet" for shard in range(3)]
    test_static_cb_0_path = f"{test_dir}/test_static_cb_0.parquet"

    # Build one pipeline per split before running either of them.
    train_pipeline = DataPipeline_Depth_0(
        train_base_path, train_static_paths, train_static_cb_0_path
    )
    test_pipeline = DataPipeline_Depth_0(
        test_base_path, test_static_paths, test_static_cb_0_path
    )

    # Run training first, then test.
    train_data = train_pipeline.execute_pipeline()
    test_data = test_pipeline.execute_pipeline()

    # Show the processed frames; saving or further analysis would go here.
    print("Training Data Processed:", train_data)
    print("Test Data Processed:", test_data)