In [1]:
import pandas as pd
import polars as pl
import lightgbm as lgb
import matplotlib.pyplot as plt
import json
from datetime import datetime
from datetime import date
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_curve, roc_auc_score

def convert_to_ordinal(date_column, date_format):
    """Express a column of date strings as whole days since 1970-01-01.

    Despite the name, the result is a day offset from the Unix epoch, not a
    proleptic ``date.toordinal()`` value.

    Args:
        date_column: polars expression yielding date strings.
        date_format: ``strptime`` format used to parse those strings.

    Returns:
        A polars expression with the day count. Parsing is non-strict, so
        strings that do not match ``date_format`` yield null.
    """
    parsed = date_column.str.strptime(pl.Date, date_format, strict=False)
    days_since_epoch = (parsed - datetime(1970, 1, 1)).dt.total_days()
    # Rows that failed to parse are null dates; keep them null in the output.
    return pl.when(parsed.is_not_null()).then(days_since_epoch).otherwise(None)

class DataPipeline_Depth_0:
    """Build the depth-0 table from the base, static and static_cb parquet files.

    Every file loaded through :meth:`load_data` contributes its column dtypes
    to ``global_schema`` (first occurrence of a column name wins), and
    :meth:`execute_pipeline` writes that mapping to ``schema_path`` as JSON
    before any preprocessing happens.
    """

    # static_cb columns that are cast to Categorical and one-hot encoded
    # after the merge. They are treated as optional everywhere.
    CATEGORICAL_COLUMNS = ('education_1103M', 'maritalst_385M')
    # Format of every date string parsed by this pipeline.
    DATE_FORMAT = '%Y-%m-%d'

    def __init__(self, base_path, static_0_0_path, static_0_1_path, static_cb_0_path, schema_path):
        """Store the input parquet paths and the JSON schema output path."""
        self.base_path = base_path
        self.static_0_0_path = static_0_0_path
        self.static_0_1_path = static_0_1_path
        self.static_cb_0_path = static_cb_0_path
        self.schema_path = schema_path
        # Column name -> dtype string, filled in by update_schema().
        self.global_schema = {}

    def load_data(self, path):
        """Read one parquet file and record its column dtypes.

        Any failure is reported on stdout and then re-raised unchanged.
        """
        try:
            df = pl.read_parquet(path)
            self.update_schema(df)
            return df
        except Exception as e:
            print(f"Error loading data from {path}: {e}")
            raise

    def update_schema(self, dataframe):
        """Add the dtypes of columns not seen before to ``global_schema``."""
        for col, dtype in zip(dataframe.columns, dataframe.dtypes):
            # setdefault keeps the dtype of the first frame that had the column.
            self.global_schema.setdefault(col, str(dtype))

    def save_schema(self):
        """Write ``global_schema`` to ``schema_path`` as JSON."""
        with open(self.schema_path, 'w') as file:
            json.dump(self.global_schema, file)

    def _dates_to_ordinal(self, data):
        """Replace string columns whose name ends in 'D' by their day counts."""
        date_columns = [
            name for name, dtype in zip(data.columns, data.dtypes)
            if name.endswith('D') and dtype == pl.Utf8
        ]
        if date_columns:
            # One batched call instead of rebuilding the frame per column.
            data = data.with_columns([
                convert_to_ordinal(pl.col(name), self.DATE_FORMAT).alias(name)
                for name in date_columns
            ])
        return data

    def preprocess_base(self, data):
        """Replace ``date_decision`` by the numeric ``date_decision_ordinal``."""
        data = data.with_columns(
            convert_to_ordinal(pl.col('date_decision'), self.DATE_FORMAT).alias('date_decision_ordinal')
        )
        return data.drop(['date_decision'])

    def preprocess_static(self, data):
        """Make a static table numeric.

        String date columns become day counts, booleans become Int32 and
        every column that is still a string afterwards is dropped.
        """
        data = self._dates_to_ordinal(data)
        bool_columns = [
            name for name, dtype in zip(data.columns, data.dtypes)
            if dtype == pl.Boolean
        ]
        if bool_columns:
            data = data.with_columns([pl.col(name).cast(pl.Int32) for name in bool_columns])
        # Date columns were re-typed above, so anything still Utf8 is a
        # non-date string column.
        return data.select([
            name for name, dtype in zip(data.columns, data.dtypes)
            if dtype != pl.Utf8
        ])

    def preprocess_static_cb_0(self, data):
        """Prepare the static_cb table.

        String date columns become day counts, the columns listed in
        ``CATEGORICAL_COLUMNS`` (when present) are cast to Categorical and
        all remaining string columns are dropped.
        """
        data = self._dates_to_ordinal(data)
        categorical = [name for name in self.CATEGORICAL_COLUMNS if name in data.columns]
        if categorical:
            data = data.with_columns([pl.col(name).cast(pl.Categorical) for name in categorical])
        # Dates and categoricals no longer have a string dtype at this point.
        leftover_strings = [
            name for name, dtype in zip(data.columns, data.dtypes)
            if dtype == pl.Utf8
        ]
        return data.drop(leftover_strings)

    def merge_data(self, data_base, data_static_0_0, data_static_0_1, data_static_cb_0):
        """Join everything onto the base table and one-hot encode categoricals.

        The two static frames are stacked vertically, then the stacked frame
        and the static_cb frame are left-joined to ``data_base`` on
        ``case_id``. Categorical columns that are present are replaced by
        their dummy columns; absent ones are skipped, matching the optional
        handling in :meth:`preprocess_static_cb_0`.
        """
        stacked_static = pl.concat([data_static_0_0, data_static_0_1], how='vertical')
        merged = (
            data_base
            .join(stacked_static, on='case_id', how='left')
            .join(data_static_cb_0, on='case_id', how='left')
        )

        present = [name for name in self.CATEGORICAL_COLUMNS if name in merged.columns]
        if not present:
            return merged

        needs_cast = [name for name in present if merged[name].dtype != pl.Categorical]
        if needs_cast:
            merged = merged.with_columns([pl.col(name).cast(pl.Categorical) for name in needs_cast])

        dummies = merged.select(present).to_dummies()
        return pl.concat([merged.drop(present), dummies], how='horizontal')

    def execute_pipeline(self):
        """Load all inputs, save the collected schema, preprocess and merge."""
        data_base = self.load_data(self.base_path)
        data_static_0_0 = self.load_data(self.static_0_0_path)
        data_static_0_1 = self.load_data(self.static_0_1_path)
        data_static_cb_0 = self.load_data(self.static_cb_0_path)
        # Saved before preprocessing so the schema describes the raw files.
        self.save_schema()
        data_base = self.preprocess_base(data_base)
        data_static_0_0 = self.preprocess_static(data_static_0_0)
        data_static_0_1 = self.preprocess_static(data_static_0_1)
        data_static_cb_0 = self.preprocess_static_cb_0(data_static_cb_0)
        return self.merge_data(data_base, data_static_0_0, data_static_0_1, data_static_cb_0)

if __name__ == "__main__":
    # JSON file that receives the column -> dtype map collected while loading.
    schema_path = "C:/Users/afise/.git/CreditRiskModel/unified_schema.json"

    # Training inputs, passed by keyword so each path is tied to its role.
    pipeline = DataPipeline_Depth_0(
        base_path="C:/Users/afise/.git/CreditRiskModel/Data/parquet_files/train/train_base.parquet",
        static_0_0_path="C:/Users/afise/.git/CreditRiskModel/Data/parquet_files/train/train_static_0_0.parquet",
        static_0_1_path="C:/Users/afise/.git/CreditRiskModel/Data/parquet_files/train/train_static_0_1.parquet",
        static_cb_0_path="C:/Users/afise/.git/CreditRiskModel/Data/parquet_files/train/train_static_cb_0.parquet",
        schema_path=schema_path,
    )

    # Merged training frame.
    Depth_0 = pipeline.execute_pipeline()

In [2]:
import polars as pl
import json
from datetime import datetime

def convert_to_ordinal(date_column, date_format):
    """Turn date strings into the number of days elapsed since 1970-01-01.

    Args:
        date_column: polars expression producing date strings.
        date_format: ``strptime`` pattern the strings are parsed with.

    Returns:
        A polars expression of day counts relative to the Unix epoch (not a
        ``date.toordinal()`` value). Because parsing uses ``strict=False``,
        strings that cannot be parsed give null.
    """
    unix_epoch = datetime(1970, 1, 1)
    as_date = date_column.str.strptime(pl.Date, date_format, strict=False)
    elapsed_days = (as_date - unix_epoch).dt.total_days()
    # Only rows that parsed successfully carry a value; the rest stay null.
    has_date = as_date.is_not_null()
    return pl.when(has_date).then(elapsed_days).otherwise(None)

def dtype_mapping(dtype_str):
    """Translate a dtype name stored in the JSON schema into a polars dtype.

    The schema file holds ``str(dtype)`` for every column. Depending on the
    polars version that wrote it, a dtype may be rendered with parameters,
    e.g. ``Name(arg=...)``; only the leading name is used for the lookup.

    Args:
        dtype_str: dtype name as read from the schema file.

    Returns:
        The matching polars dtype. Names that are not recognised (and
        non-string input) fall back to ``pl.Utf8``, as before.
    """
    mapping = {
        'Int8': pl.Int8,
        'Int16': pl.Int16,
        'Int32': pl.Int32,
        'Int64': pl.Int64,
        'UInt8': pl.UInt8,
        'UInt16': pl.UInt16,
        'UInt32': pl.UInt32,
        'UInt64': pl.UInt64,
        'Float32': pl.Float32,
        'Float64': pl.Float64,
        'Utf8': pl.Utf8,
        # Newer polars releases print the string dtype as "String".
        'String': pl.Utf8,
        'Boolean': pl.Boolean,
        'Date': pl.Date,
        'Datetime': pl.Datetime,
        'Duration': pl.Duration,
        'Time': pl.Time,
        'Categorical': pl.Categorical,
    }
    if not isinstance(dtype_str, str):
        return pl.Utf8
    # Drop any "(...)" parameter list so parametrised renderings still match.
    base_name = dtype_str.split('(', 1)[0].strip()
    return mapping.get(base_name, pl.Utf8)

class DataPipeline_Depth_0:
    """Build the depth-0 table for files that must follow a saved dtype schema.

    The JSON file at ``schema_path`` maps column names to dtype names. Each
    parquet file is coerced to those dtypes on load, then preprocessed and
    merged on ``case_id``.
    """

    # static_cb columns that are cast to Categorical and one-hot encoded
    # after the merge. They are treated as optional everywhere.
    CATEGORICAL_COLUMNS = ('education_1103M', 'maritalst_385M')
    # Format of every date string parsed by this pipeline.
    DATE_FORMAT = '%Y-%m-%d'

    def __init__(self, base_path, static_paths, static_cb_0_path, schema_path):
        """Store the input paths and read the schema file immediately.

        Args:
            base_path: parquet file of the base table.
            static_paths: iterable of parquet files that are stacked vertically.
            static_cb_0_path: parquet file of the static_cb table.
            schema_path: JSON file mapping column names to dtype names.
        """
        self.base_path = base_path
        self.static_paths = static_paths
        self.static_cb_0_path = static_cb_0_path
        self.schema_path = schema_path
        self.global_schema = self.load_schema()

    def load_schema(self):
        """Return the column -> dtype-name mapping stored at ``schema_path``."""
        with open(self.schema_path, 'r') as file:
            return json.load(file)

    def load_data(self, path):
        """Read one parquet file and cast its columns to the schema dtypes.

        Missing columns are deliberately NOT added here. The schema is a
        single mapping shared by all tables, so padding every table with
        every schema column would create identically named columns that
        collide in the ``case_id`` joins of :meth:`merge_data`.
        """
        df = pl.read_parquet(path)
        return self.ensure_schema(df, add_missing=False)

    def ensure_schema(self, dataframe, add_missing=True):
        """Coerce ``dataframe`` to the dtypes recorded in ``global_schema``.

        Args:
            dataframe: frame to align.
            add_missing: when true, schema columns absent from the frame are
                appended as all-null columns of the expected dtype.

        Returns:
            The aligned frame; the input is returned as-is if nothing changed.
        """
        current_dtypes = dict(zip(dataframe.columns, dataframe.dtypes))
        fixes = []
        for col, expected_dtype in self.global_schema.items():
            expected_pl_dtype = dtype_mapping(expected_dtype)
            if col in current_dtypes:
                if current_dtypes[col] != expected_pl_dtype:
                    fixes.append(pl.col(col).cast(expected_pl_dtype))
            elif add_missing:
                # The alias is essential: an unnamed literal is called
                # "literal", so successive fillers would overwrite each other.
                fixes.append(pl.lit(None).cast(expected_pl_dtype).alias(col))
        return dataframe.with_columns(fixes) if fixes else dataframe

    def _dates_to_ordinal(self, data):
        """Replace string columns whose name ends in 'D' by their day counts."""
        date_columns = [
            name for name, dtype in zip(data.columns, data.dtypes)
            if name.endswith('D') and dtype == pl.Utf8
        ]
        if date_columns:
            # One batched call instead of rebuilding the frame per column.
            data = data.with_columns([
                convert_to_ordinal(pl.col(name), self.DATE_FORMAT).alias(name)
                for name in date_columns
            ])
        return data

    def preprocess_base(self, data):
        """Replace ``date_decision`` by the numeric ``date_decision_ordinal``."""
        data = data.with_columns(
            convert_to_ordinal(pl.col('date_decision'), self.DATE_FORMAT).alias('date_decision_ordinal')
        )
        return data.drop(['date_decision'])

    def preprocess_static(self, data):
        """Make a static table numeric.

        String date columns become day counts, booleans become Int32 and
        every column that is still a string afterwards is dropped.
        """
        data = self._dates_to_ordinal(data)
        bool_columns = [
            name for name, dtype in zip(data.columns, data.dtypes)
            if dtype == pl.Boolean
        ]
        if bool_columns:
            data = data.with_columns([pl.col(name).cast(pl.Int32) for name in bool_columns])
        # Date columns were re-typed above, so anything still Utf8 is a
        # non-date string column.
        columns_to_keep = [
            name for name, dtype in zip(data.columns, data.dtypes)
            if dtype != pl.Utf8
        ]
        return data.select(columns_to_keep)

    def preprocess_static_cb_0(self, data):
        """Prepare the static_cb table.

        String date columns become day counts, the columns listed in
        ``CATEGORICAL_COLUMNS`` (when present) are cast to Categorical and
        all remaining string columns are dropped.
        """
        data = self._dates_to_ordinal(data)
        categorical = [name for name in self.CATEGORICAL_COLUMNS if name in data.columns]
        if categorical:
            data = data.with_columns([pl.col(name).cast(pl.Categorical) for name in categorical])
        # Dates and categoricals no longer have a string dtype at this point.
        leftover_strings = [
            name for name, dtype in zip(data.columns, data.dtypes)
            if dtype == pl.Utf8
        ]
        return data.drop(leftover_strings)

    def merge_data(self, data_base, static_datas, data_static_cb_0):
        """Join everything onto the base table and one-hot encode categoricals.

        The static frames are stacked vertically, then the stacked frame and
        the static_cb frame are left-joined to ``data_base`` on ``case_id``.
        Categorical columns that are present are replaced by their dummy
        columns; absent ones are skipped, matching the optional handling in
        :meth:`preprocess_static_cb_0`.

        NOTE(review): the dummy columns are derived from the values found in
        this data, so they may differ from those of another dataset - confirm
        the downstream consumer aligns them.
        """
        stacked_static = pl.concat(static_datas, how='vertical')
        merged = (
            data_base
            .join(stacked_static, on='case_id', how='left')
            .join(data_static_cb_0, on='case_id', how='left')
        )

        present = [name for name in self.CATEGORICAL_COLUMNS if name in merged.columns]
        if not present:
            return merged

        needs_cast = [name for name in present if merged[name].dtype != pl.Categorical]
        if needs_cast:
            merged = merged.with_columns([pl.col(name).cast(pl.Categorical) for name in needs_cast])

        dummies = merged.select(present).to_dummies()
        return pl.concat([merged.drop(present), dummies], how='horizontal')

    def execute_pipeline(self):
        """Load every input, preprocess each table and return the merged frame."""
        data_base = self.load_data(self.base_path)
        static_datas = [self.load_data(path) for path in self.static_paths]
        data_static_cb_0 = self.load_data(self.static_cb_0_path)
        data_base = self.preprocess_base(data_base)
        static_datas = [self.preprocess_static(data) for data in static_datas]
        data_static_cb_0 = self.preprocess_static_cb_0(data_static_cb_0)
        return self.merge_data(data_base, static_datas, data_static_cb_0)

if __name__ == "__main__":
    # Schema file read by the pipeline constructor to align column dtypes.
    schema_path = "C:/Users/afise/.git/CreditRiskModel/unified_schema.json"

    # Test-set inputs.
    base_path = "C:/Users/afise/.git/CreditRiskModel/Data/parquet_files/test/test_base.parquet"
    static_cb_0_path = "C:/Users/afise/.git/CreditRiskModel/Data/parquet_files/test/test_static_cb_0.parquet"
    # The static table is split across three files that get stacked vertically.
    static_paths = [
        "C:/Users/afise/.git/CreditRiskModel/Data/parquet_files/test/test_static_0_0.parquet",
        "C:/Users/afise/.git/CreditRiskModel/Data/parquet_files/test/test_static_0_1.parquet",
        "C:/Users/afise/.git/CreditRiskModel/Data/parquet_files/test/test_static_0_2.parquet",
    ]

    pipeline = DataPipeline_Depth_0(
        base_path=base_path,
        static_paths=static_paths,
        static_cb_0_path=static_cb_0_path,
        schema_path=schema_path,
    )

    # Merged test frame.
    Depth_0_test = pipeline.execute_pipeline()