In [1]:
from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile
import os

# Create the Kaggle API client and authenticate it. No credentials appear in
# this notebook, so they have to come from whatever the client reads itself
# (NOTE(review): confirm where -- e.g. a local Kaggle configuration file).
api = KaggleApi()
api.authenticate()

# Competition slug; also used below to build the archive file name.
competition_name = 'home-credit-credit-risk-model-stability'
# Download step, currently disabled: fetches the competition files into ./Data/.
# api.competition_download_files(competition_name, path='./Data/')

# Location under ./Data/ where the competition archive is expected.
zip_file_path = f'./Data/{competition_name}.zip'

# Extraction step, currently disabled: unpacks the archive into ./Data/.
# with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#     zip_ref.extractall('./Data/')

# Cleanup step, currently disabled: deletes the archive file.
# os.remove(zip_file_path)

In [2]:
import polars as pl

# Read the base table and both train_static_0 shards, in this order. No schema
# is passed, so column dtypes are whatever polars infers for each file.
train_base, train_static_0_0, train_static_0_1 = (
    pl.read_csv(csv_path)
    for csv_path in (
        "Data/csv_files/train/train_base.csv",
        "Data/csv_files/train/train_static_0_0.csv",
        "Data/csv_files/train/train_static_0_1.csv",
    )
)

In [3]:
# Columns that convert_columns() casts to Float64 whenever they are present in
# the frame it receives.
# NOTE(review): presumably these are the columns whose inferred CSV dtype
# differs between the two train_static_0 shards -- confirm.
columns_to_float = [
    'amtinstpaidbefduel24m_4187115A',
    'avgdbddpdlast3m_4187120P',
    'avgdbdtollast24m_4525197P',
    'avglnamtstart24m_4525187A',
    'avgoutstandbalancel6m_4187114A',
    'avgpmtlast12m_4525200A',
    'maxlnamtstart6m_4525199A',
    'maxoutstandbalancel12m_4187113A',
    'maxpmtlast3m_4525190A',
    'mindbdtollast24m_4525191P',
    'numinstlswithdpd5_4187116L',
    'numinstmatpaidtearly2d_4499204L',
    'numinstpaid_4499208L',
    'numinstpaidearly3dest_4493216L',
    'numinstpaidearly5dest_4493211L',
    'numinstpaidearly5dobd_4499205L',
    'numinstpaidearlyest_4493214L',
    'numinstpaidlastcontr_4325080L',
    'numinstregularpaidest_4493210L',
    'numinsttopaygrest_4493213L',
    'numinstunpaidmaxest_4493212L',
    'sumoutstandtotalest_4493215A',
    'totinstallast1m_4525188A'
]

# Column -> target dtype pairs that convert_columns() applies after the
# Float64 pass. Both entries currently target Float64 as well.
special_casts = {
    'clientscnt_136L': pl.Float64,
    'maxdbddpdtollast6m_4187119P': pl.Float64
}

def _cast_present_columns(df, dtype_by_column):
    """Cast every mapped column that exists in ``df`` in one ``with_columns`` call.

    Args:
        df: Frame exposing ``columns`` and ``with_columns`` (a polars DataFrame).
        dtype_by_column: Mapping of column name to the dtype it should be cast to.

    Returns:
        A frame with the matching columns cast, or ``df`` itself when none of
        the mapped columns is present.
    """
    available = set(df.columns)
    casts = [
        # cast() keeps the column name, so the result replaces the original column.
        pl.col(name).cast(dtype)
        for name, dtype in dtype_by_column.items()
        if name in available
    ]
    # A single batched call avoids building one intermediate frame per column.
    return df.with_columns(casts) if casts else df


def convert_columns_to_float(df, columns):
    """Cast each of ``columns`` that is present in ``df`` to ``pl.Float64``.

    Names missing from ``df`` are ignored; repeated names are cast once.

    Args:
        df: Frame to convert.
        columns: Iterable of column names to cast.

    Returns:
        The converted frame (``df`` itself if nothing matched).
    """
    # The dict both attaches the target dtype and de-duplicates names, which a
    # single with_columns call requires.
    return _cast_present_columns(df, {name: pl.Float64 for name in columns})


def convert_columns(df, columns_to_float, special_casts):
    """Apply the Float64 casts, then the per-column ``special_casts``.

    The two passes stay sequential, so a column named in both ends up with the
    dtype from ``special_casts`` after first going through Float64.

    Args:
        df: Frame to convert.
        columns_to_float: Iterable of column names to cast to ``pl.Float64``.
        special_casts: Mapping of column name to target dtype.

    Returns:
        The converted frame; columns absent from ``df`` are skipped.
    """
    floats_done = convert_columns_to_float(df, columns_to_float)
    return _cast_present_columns(floats_done, special_casts)

# Run the same dtype conversion over both train_static_0 shards, in shard order.
train_static_0_0_converted, train_static_0_1_converted = (
    convert_columns(shard, columns_to_float, special_casts)
    for shard in (train_static_0_0, train_static_0_1)
)

In [4]:
# Parse the string flag into a real Boolean column:
#   "true" -> True, "false" -> False, anything else (including missing) -> null.
# The fallback must be null rather than `pl.col(...).is_null()`: that expression
# is True for missing values and would turn every null into True.
train_static_0_1_converted = train_static_0_1_converted.with_columns(
    pl.when(pl.col("isbidproductrequest_292L") == "true")
    .then(True)
    .when(pl.col("isbidproductrequest_292L") == "false")
    .then(False)
    .otherwise(None)
    # Pin the dtype so the column is Boolean regardless of which branches fire.
    .cast(pl.Boolean)
    .alias("isbidproductrequest_292L")
)

In [5]:
# Combine the two converted train_static_0 shards into one frame, using
# pl.concat's default strategy.
train_static_0 = pl.concat([train_static_0_0_converted, train_static_0_1_converted])

In [6]:
# Load the train_static_cb_0 table; no schema is passed, so dtypes are inferred.
train_static_cb_0 = pl.read_csv("Data/csv_files/train/train_static_cb_0.csv")

In [7]:
# Attach the static_cb_0 columns to the static_0 rows on case_id; the left join
# keeps static_0 rows that have no match in static_cb_0.
train_static = train_static_0.join(train_static_cb_0, on="case_id", how="left")

In [8]:
# Combine the base table with the static features on case_id.
# NOTE(review): with how="outer" the right-hand key survives as its own column
# (the recorded head() output lists `case_id_right`). If that duplicate key is
# not wanted downstream, drop it or switch to a left / coalescing join.
# NOTE(review): newer polars releases presumably spell this join how="full" --
# verify against the polars version this notebook is pinned to.
train_DEPTH_0 = train_base.join(
    train_static,
    on="case_id",
    how="outer"
)

In [9]:
# Show the merged frame's (rows, columns) as the cell output.
train_DEPTH_0.shape

(1526659, 225)

In [10]:
# Preview the first rows of the merged frame (head() with its default row count).
train_DEPTH_0.head()

case_id,date_decision,MONTH,WEEK_NUM,target,case_id_right,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_136L,clientscnt_157L,clientscnt_257L,…,firstquarter_103L,for3years_128L,for3years_504L,for3years_584L,formonth_118L,formonth_206L,formonth_535L,forquarter_1017L,forquarter_462L,forquarter_634L,fortoday_1092L,forweek_1077L,forweek_528L,forweek_601L,foryear_618L,foryear_818L,foryear_850L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtcount_4527229L,pmtcount_4955617L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,riskassesment_302T,riskassesment_940T,secondquarter_766L,thirdquarter_1082L
i64,str,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,str,str,str,str,f64,f64,f64,str,str,str,str,str,f64,f64,f64
0,"""2019-01-03""",201901,0,0,0,,,1917.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,"""2019-01-03""",201901,0,0,1,,,3134.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,"""2019-01-04""",201901,0,0,2,,,4937.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,"""2019-01-03""",201901,0,0,3,,,4643.6,0.0,0.0,1.0,0.0,2.0,0.0,1.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,0.0,0.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,"""2019-01-04""",201901,0,1,4,,,3390.2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [11]:
# Destination for the merged depth-0 training table.
# NOTE(review): this is an absolute, machine-specific path, while the inputs are
# read from relative "Data/..." paths; consider a project-relative directory so
# the notebook also runs on other machines.
directory = r"C:\Users\afise\.git\CreditRiskModel\Merged_Data"
filename = "train_DEPTH_0.csv"
file_path = os.path.join(directory, filename)

# exist_ok=True creates the directory when needed and is a no-op when it is
# already there, so no separate (and racy) os.path.exists() check is required.
os.makedirs(directory, exist_ok=True)

train_DEPTH_0.write_csv(file_path)