In [2]:
from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile
import os

# Create the Kaggle API client and authenticate it before any download call.
api = KaggleApi()
api.authenticate()

# The competition slug also determines the name of the downloaded archive.
competition_name = 'home-credit-credit-risk-model-stability'
zip_file_path = f'./Data/{competition_name}.zip'

# Download the competition bundle into ./Data/.
api.competition_download_files(competition_name, path='./Data/')

# Unpack everything next to the archive, then delete the archive itself.
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path='./Data/')
os.remove(zip_file_path)

In [8]:
import polars as pl

# Base tables for the train and test splits.
train_base = pl.read_csv("Data/csv_files/train/train_base.csv")
test_base = pl.read_csv("Data/csv_files/test/test_base.csv")

# The "static_0" table is stored in several numbered chunk files per split:
# two chunks for train, three for test. Each chunk is read with its own
# inferred schema, so dtypes are not guaranteed to agree between chunks.
train_static_0_0 = pl.read_csv("Data/csv_files/train/train_static_0_0.csv")
train_static_0_1 = pl.read_csv("Data/csv_files/train/train_static_0_1.csv")
test_static_0_0 = pl.read_csv("Data/csv_files/test/test_static_0_0.csv")
test_static_0_1 = pl.read_csv("Data/csv_files/test/test_static_0_1.csv")
test_static_0_2 = pl.read_csv("Data/csv_files/test/test_static_0_2.csv")

In [42]:
# Columns that convert_columns() casts to Float64 when they are present in a
# frame. Several of them (e.g. 'amtinstpaidbefduel24m_4187115A') show up as
# String in the printed schema of train_static_0_0.
columns_to_float = [
    'amtinstpaidbefduel24m_4187115A',
    'avgdbddpdlast3m_4187120P',
    'avgdbdtollast24m_4525197P',
    'avglnamtstart24m_4525187A',
    'avgoutstandbalancel6m_4187114A',
    'avgpmtlast12m_4525200A',
    'maxlnamtstart6m_4525199A',
    'maxoutstandbalancel12m_4187113A',
    'maxpmtlast3m_4525190A',
    'mindbdtollast24m_4525191P',
    'numinstlswithdpd5_4187116L',
    'numinstmatpaidtearly2d_4499204L',
    'numinstpaid_4499208L',
    'numinstpaidearly3dest_4493216L',
    'numinstpaidearly5dest_4493211L',
    'numinstpaidearly5dobd_4499205L',
    'numinstpaidearlyest_4493214L',
    'numinstpaidlastcontr_4325080L',
    'numinstregularpaidest_4493210L',
    'numinsttopaygrest_4493213L',
    'numinstunpaidmaxest_4493212L',
    'sumoutstandtotalest_4493215A',
    'totinstallast1m_4525188A'
]

# Per-column target dtypes applied by convert_columns() after the float casts
# above. Both entries currently target Float64 as well.
special_casts = {
    'clientscnt_136L': pl.Float64,
    'maxdbddpdtollast6m_4187119P': pl.Float64
}

def _cast_present_columns(df, dtype_by_column):
    """Cast every column named in ``dtype_by_column`` that exists in ``df``.

    All casts are submitted in a single ``with_columns`` call, so only one new
    frame is built no matter how many columns are converted.

    Args:
        df: Frame to convert; it is not modified.
        dtype_by_column: Mapping of column name -> target dtype. Names that are
            not columns of ``df`` are ignored.

    Returns:
        A new frame with the casts applied, or ``df`` itself when none of the
        requested columns is present.
    """
    available = set(df.columns)
    casts = [
        pl.col(name).cast(dtype)  # a cast keeps the column name, no alias needed
        for name, dtype in dtype_by_column.items()
        if name in available
    ]
    return df.with_columns(casts) if casts else df


def convert_columns_to_float(df, columns):
    """Cast the given columns of ``df`` to ``pl.Float64``.

    Args:
        df: Frame to convert; it is not modified.
        columns: Iterable of column names. Names missing from ``df`` are
            skipped, and repeated names are cast only once.

    Returns:
        A frame in which every listed column that exists is Float64.
    """
    return _cast_present_columns(df, dict.fromkeys(columns, pl.Float64))


def convert_columns(df, columns_to_float, special_casts):
    """Apply the float casts, then the per-column special casts, to ``df``.

    Args:
        df: Frame to convert; it is not modified.
        columns_to_float: Iterable of column names to cast to ``pl.Float64``.
        special_casts: Mapping of column name -> target dtype, applied after
            the float casts.

    Returns:
        The converted frame. Columns absent from ``df`` are silently skipped.
    """
    # Two stages on purpose: a column listed in both inputs is first cast to
    # Float64 and only then to its special dtype.
    df = convert_columns_to_float(df, columns_to_float)
    return _cast_present_columns(df, special_casts)

# Run both raw training chunks through the same set of casts.
train_static_0_0_converted = convert_columns(
    df=train_static_0_0,
    columns_to_float=columns_to_float,
    special_casts=special_casts,
)
train_static_0_1_converted = convert_columns(
    df=train_static_0_1,
    columns_to_float=columns_to_float,
    special_casts=special_casts,
)

In [48]:
# Turn the string-encoded flag into a real Boolean column:
#   "true" -> True, "false" -> False, anything else (including missing) -> null.
# Missing values must stay null; falling back to `is_null()` here would report
# every missing entry as True and make it indistinguishable from "true".
train_static_0_1_converted = train_static_0_1_converted.with_columns(
    pl.when(pl.col("isbidproductrequest_292L") == "true")
    .then(True)
    .when(pl.col("isbidproductrequest_292L") == "false")
    .then(False)
    .otherwise(pl.lit(None, dtype=pl.Boolean))
    .cast(pl.Boolean)
    .alias("isbidproductrequest_292L")
)

In [49]:
# Combine the two converted training chunks into a single frame.
# NOTE(review): presumably this relies on both chunks having matching column
# dtypes after the casts applied earlier — confirm against the schemas.
train_static_0_combined = pl.concat([train_static_0_0_converted, train_static_0_1_converted])

In [None]:
# Combine the raw static_0 chunks for each split.
# NOTE(review): these are the unconverted frames; the train chunks are also
# combined elsewhere from their *_converted versions, so this concat may fail
# if the inferred chunk schemas differ — confirm which variant is intended.
train_static_0 = pl.concat([train_static_0_0, train_static_0_1])
test_static_0 = pl.concat([test_static_0_0, test_static_0_1, test_static_0_2])

# Static credit-bureau tables. The test file is read from the test/ directory
# like every other test table (it was previously looked up under train/).
train_static_cb_0 = pl.read_csv("Data/csv_files/train/train_static_cb_0.csv")
test_static_cb_0 = pl.read_csv("Data/csv_files/test/test_static_cb_0.csv")

In [10]:
# Show the inferred schemas of both raw training chunks, one per line,
# so dtype differences between them can be compared.
print(train_static_0_0.schema, train_static_0_1.schema, sep="\n")

OrderedDict([('case_id', Int64), ('actualdpdtolerance_344P', Float64), ('amtinstpaidbefduel24m_4187115A', String), ('annuity_780A', Float64), ('annuitynextmonth_57A', Float64), ('applicationcnt_361L', Float64), ('applications30d_658L', Float64), ('applicationscnt_1086L', Float64), ('applicationscnt_464L', Float64), ('applicationscnt_629L', Float64), ('applicationscnt_867L', Float64), ('avgdbddpdlast24m_3658932P', Float64), ('avgdbddpdlast3m_4187120P', String), ('avgdbdtollast24m_4525197P', String), ('avgdpdtolclosure24_3658938P', Float64), ('avginstallast24m_3658937A', Float64), ('avglnamtstart24m_4525187A', String), ('avgmaxdpdlast9m_3716943P', Float64), ('avgoutstandbalancel6m_4187114A', String), ('avgpmtlast12m_4525200A', String), ('bankacctype_710L', String), ('cardtype_51L', String), ('clientscnt12m_3712952L', Float64), ('clientscnt3m_3712950L', Float64), ('clientscnt6m_3712949L', Float64), ('clientscnt_100L', Float64), ('clientscnt_1022L', Float64), ('clientscnt_1071L', Float64),