In [1]:
from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile
import os
import polars as pl

# Create the Kaggle API client and authenticate it. No credentials are passed
# here, so they must come from whatever configuration the client reads itself.
api = KaggleApi()
api.authenticate()

# Slug of the Kaggle competition this notebook works with.
competition_name = 'home-credit-credit-risk-model-stability'

# Local path for the competition archive.
# NOTE(review): nothing in the cells visible here downloads or extracts this file.
zip_file_path = f'./Data/{competition_name}.zip'

In [2]:
# Load the base table and the two static_0 files from the extracted CSV directory.
# NOTE(review): no schema is passed, so dtypes come from polars' inference; the
# casts applied in the following cells suggest the two static files do not
# always infer the same types — confirm.
train_base = pl.read_csv("Data/csv_files/train/train_base.csv")
train_static_0_0 = pl.read_csv("Data/csv_files/train/train_static_0_0.csv")
train_static_0_1 = pl.read_csv("Data/csv_files/train/train_static_0_1.csv")

In [3]:
# Columns that convert_columns() casts to Float64 whenever they are present in
# the frame it is given.
columns_to_float = [
    'amtinstpaidbefduel24m_4187115A',
    'avgdbddpdlast3m_4187120P',
    'avgdbdtollast24m_4525197P',
    'avglnamtstart24m_4525187A',
    'avgoutstandbalancel6m_4187114A',
    'avgpmtlast12m_4525200A',
    'maxlnamtstart6m_4525199A',
    'maxoutstandbalancel12m_4187113A',
    'maxpmtlast3m_4525190A',
    'mindbdtollast24m_4525191P',
    'numinstlswithdpd5_4187116L',
    'numinstmatpaidtearly2d_4499204L',
    'numinstpaid_4499208L',
    'numinstpaidearly3dest_4493216L',
    'numinstpaidearly5dest_4493211L',
    'numinstpaidearly5dobd_4499205L',
    'numinstpaidearlyest_4493214L',
    'numinstpaidlastcontr_4325080L',
    'numinstregularpaidest_4493210L',
    'numinsttopaygrest_4493213L',
    'numinstunpaidmaxest_4493212L',
    'sumoutstandtotalest_4493215A',
    'totinstallast1m_4525188A'
]

# Per-column target dtypes that convert_columns() applies after the Float64
# pass. Both entries currently map to Float64 as well.
special_casts = {
    'clientscnt_136L': pl.Float64,
    'maxdbddpdtollast6m_4187119P': pl.Float64
}

def convert_columns_to_float(df, columns):
    """Cast every listed column that exists in ``df`` to Float64.

    Args:
        df: Frame whose columns should be converted. Each cast rebinds the
            local name to the frame returned by ``with_columns``.
        columns: Iterable of column names; names absent from ``df`` are skipped.

    Returns:
        The frame with all matching columns cast to ``pl.Float64``. When no
        name matches, the object that was passed in is returned as-is.
    """
    for name in columns:
        if name not in df.columns:
            # Not every frame carries every column, so silently skip it.
            continue
        df = df.with_columns(pl.col(name).cast(pl.Float64).alias(name))
    return df

def convert_columns(df, columns_to_float, special_casts):
    """Apply the Float64 casts and then the per-column dtype overrides.

    Args:
        df: Frame to convert.
        columns_to_float: Iterable of column names to cast to Float64; names
            that are not in ``df`` are skipped.
        special_casts: Mapping of column name to target dtype, applied after
            the Float64 pass; names that are not in ``df`` are skipped.

    Returns:
        The frame with every matching column cast to its target dtype.
    """
    # The Float64 pass is exactly what convert_columns_to_float does.
    df = convert_columns_to_float(df, columns_to_float)

    for name, target_dtype in special_casts.items():
        if name in df.columns:
            df = df.with_columns(pl.col(name).cast(target_dtype).alias(name))
    return df

# Run both static_0 files through the same cast rules.
train_static_0_0_converted = convert_columns(
    df=train_static_0_0,
    columns_to_float=columns_to_float,
    special_casts=special_casts,
)
train_static_0_1_converted = convert_columns(
    df=train_static_0_1,
    columns_to_float=columns_to_float,
    special_casts=special_casts,
)

In [4]:
# Turn the string-typed flag in train_static_0_1 into a real Boolean column.
# "true" -> True and "false" -> False; every other value, including a missing
# one, stays null so that absent data is not recorded as True.
train_static_0_1_converted = train_static_0_1_converted.with_columns(
    pl.when(pl.col("isbidproductrequest_292L") == "true")
    .then(True)
    .when(pl.col("isbidproductrequest_292L") == "false")
    .then(False)
    .otherwise(pl.lit(None))
    # Pin the dtype explicitly so the column is Boolean even if every row is null.
    .cast(pl.Boolean)
    .alias("isbidproductrequest_292L")
)

In [5]:
# Stack the two converted static_0 frames row-wise, then load train_static_cb_0.
train_static_0 = pl.concat(
    [train_static_0_0_converted, train_static_0_1_converted],
    how="vertical",
)
train_static_cb_0 = pl.read_csv("Data/csv_files/train/train_static_cb_0.csv")

In [6]:
# Keep every train_static_0 row and attach the train_static_cb_0 columns that
# share its case_id.
train_static = train_static_0.join(train_static_cb_0, on="case_id", how="left")

In [7]:
# Combine the base table with the static features on case_id.
# NOTE(review): an outer join also keeps static rows whose case_id is absent
# from train_base. Newer polars releases spell this how="full" and may emit a
# separate case_id_right column — confirm against the installed polars version.
train_DEPTH_0 = train_base.join(train_static, on="case_id", how="outer")

In [8]:
# Names of every column whose dtype is pl.Utf8 in the merged frame.
string_columns = [
    name
    for name, dtype in zip(train_DEPTH_0.columns, train_DEPTH_0.dtypes)
    if dtype == pl.Utf8
]

In [9]:
import polars as pl

def convert_to_ordinal(date_column):
    """Build an expression that turns a string date into an integer offset.

    The string is parsed as ``pl.Date`` with ``strict=False``. The result is
    the difference between the Int64 cast of the parsed date and the Int64
    cast of 0001-01-01 (presumably a number of days — confirm against how
    polars represents ``Date``).

    Args:
        date_column: String expression holding the dates.

    Returns:
        An expression yielding the integer offset, or null when the parsed
        date is null or is not later than 0001-01-01.
    """
    reference_date = pl.lit("0001-01-01").str.strptime(pl.Date)
    parsed = date_column.str.strptime(pl.Date, strict=False)

    is_usable = parsed.is_not_null() & (parsed > reference_date)
    offset = parsed.cast(pl.Int64) - reference_date.cast(pl.Int64)

    return pl.when(is_usable).then(offset).otherwise(pl.lit(None))

def convert_to_float(amount_column):
    """Build an expression that parses a string amount into Float64.

    Every comma is stripped before the cast, so values with several thousands
    separators (e.g. "1,234,567.50") parse as well as plain ones.

    Args:
        amount_column: String expression holding the amounts.

    Returns:
        An expression yielding the amount as ``pl.Float64``. Null inputs stay
        null because both the replacement and the cast propagate nulls.
    """
    # replace_all (not replace) so that every comma is removed, and
    # literal=True so the pattern is matched as plain text rather than a regex.
    without_separators = amount_column.str.replace_all(",", "", literal=True)
    return without_separators.cast(pl.Float64)

# Split the string columns by how they get re-encoded: names ending in 'D'
# (plus date_decision) as dates, names ending in 'A' as amounts, and two
# categorical columns as one-hot dummies.
date_columns = [name for name in string_columns if name.endswith('D')]
date_columns.append('date_decision')
amount_columns = [name for name in string_columns if name.endswith('A')]
one_hot_encode_columns = ['education_1103M', 'maritalst_385M']

# Re-encode dates and amounts in one pass. Each expression reads and overwrites
# only its own column, so the result matches converting them one at a time.
train_DEPTH_0 = train_DEPTH_0.with_columns(
    [convert_to_ordinal(pl.col(name)).alias(name) for name in date_columns]
    + [convert_to_float(pl.col(name)).alias(name) for name in amount_columns]
)

# Replace each categorical column with its dummy columns, appended at the end.
for col in one_hot_encode_columns:
    dummy_df = train_DEPTH_0.select(col).to_dummies()
    train_DEPTH_0 = train_DEPTH_0.drop(col).hstack(dummy_df)

# Any string column that was not re-encoded above is dropped.
handled_columns = {*date_columns, *amount_columns, *one_hot_encode_columns}
remaining_columns = [name for name in string_columns if name not in handled_columns]

train_DEPTH_0 = train_DEPTH_0.drop(remaining_columns)

In [10]:
# Destination for the merged depth-0 training table.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run unchanged on a machine where this location is valid.
directory = r"C:\Users\afise\.git\CreditRiskModel\Merged_Data"
filename = "train_DEPTH_0.csv"
file_path = os.path.join(directory, filename)

# exist_ok=True creates the directory when missing and is a no-op otherwise,
# without a separate existence check that could race with another process.
os.makedirs(directory, exist_ok=True)

train_DEPTH_0.write_csv(file_path)