In [99]:
import warnings
import os
from glob import glob
from pathlib import Path
import operator

import pandas as pd
import polars as pl
import polars.selectors as cs
from sklearn import preprocessing
from category_encoders import OneHotEncoder
from category_encoders.target_encoder import TargetEncoder

def _format_float(value):
    """Render a float with exactly six digits after the decimal point."""
    return '%.6f' % value


# Show floats in fixed-point form (six decimals) whenever pandas displays them.
pd.set_option('display.float_format', _format_float)
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

In [52]:
def set_table_dtypes(df):
    """Cast columns to a fixed dtype selected from the column's name.

    Rules, checked in this order:
      * ``case_id``, ``WEEK_NUM``, ``num_group1``, ``num_group2``, ``target`` -> Int64
      * ``date_decision`` -> Date
      * name ending in ``P`` or ``A`` -> Float64
      * name ending in ``M`` -> String
      * name ending in ``D`` -> Date

    Columns matching none of the rules keep whatever dtype they already have.

    Parameters
    ----------
    df : polars.DataFrame
        Frame whose columns should be cast.

    Returns
    -------
    polars.DataFrame
        Frame with the casts applied; ``df`` itself when no column matches.
    """

    def _target_dtype(name):
        # Exact names win over suffix rules (e.g. WEEK_NUM ends in "M" but is an integer).
        if name in ("case_id", "WEEK_NUM", "num_group1", "num_group2", "target"):
            return pl.Int64
        if name == "date_decision":
            return pl.Date
        suffix = name[-1]
        if suffix == "P" or suffix == "A":
            return pl.Float64
        if suffix == "M":
            return pl.String
        if suffix == "D":
            return pl.Date
        return None

    casts = []
    for name in df.columns:
        dtype = _target_dtype(name)
        if dtype is not None:
            casts.append(pl.col(name).cast(dtype))

    # Each cast touches a different column, so they can all be applied in one pass.
    if casts:
        df = df.with_columns(casts)
    return df

In [53]:
def read_file(path, depth=None):
    """Load one parquet file and normalise its column dtypes.

    Parameters
    ----------
    path : str or Path
        Location of the parquet file.
    depth : int, optional
        Accepted but currently has no effect. Values 1 and 2 are reserved for a
        per-``case_id`` aggregation step that is not enabled yet.

    Returns
    -------
    polars.DataFrame
        The file contents after ``set_table_dtypes``.
    """
    frame = set_table_dtypes(pl.read_parquet(path))
    # Placeholder: a group_by("case_id") aggregation for depth 1/2 would go here.
    return frame

def read_files(regex_path, depth=None):
    """Load every parquet file matching a glob pattern and stack them vertically.

    Parameters
    ----------
    regex_path : str or Path
        Glob pattern (e.g. ``.../train_static_0_*.parquet``). Despite the
        parameter name this is a shell-style glob, not a regular expression.
    depth : int, optional
        Accepted but currently has no effect. Values 1 and 2 are reserved for a
        per-``case_id`` aggregation over the concatenated frame that is not
        enabled yet.

    Returns
    -------
    polars.DataFrame
        All matching files, each passed through ``read_file`` and concatenated
        with ``how="vertical_relaxed"``.

    Raises
    ------
    FileNotFoundError
        If the pattern matches no file.
    """
    # glob() yields matches in arbitrary order; sorting makes the row order of
    # the concatenated frame reproducible from run to run.
    paths = sorted(glob(str(regex_path)))
    if not paths:
        # Fail early with the offending pattern instead of an opaque concat error.
        raise FileNotFoundError(f"no parquet files match pattern: {regex_path}")

    # Reuse read_file so every chunk gets exactly the same dtype normalisation.
    chunks = [read_file(path) for path in paths]
    return pl.concat(chunks, how="vertical_relaxed")

In [54]:
# Folder holding the parquet exports; train/ and test/ sit directly under it.
parquet_root = "/Users/sophie/workspace/data/kaggle/home-credit-credit-risk-model-stability/parquet_files"
train_dir_path = f"{parquet_root}/train"
test_dir_path = f"{parquet_root}/test"

# Glob patterns for the (possibly multi-part) static_0 tables.
train_file = 'train_static_0_*.parquet'
test_file = 'test_static_0_*.parquet'
train_file_path = Path(train_dir_path) / train_file
test_file_path = Path(test_dir_path) / test_file

# Base tables, one per split.
target_file = 'train_base.parquet'
target_file_test = 'test_base.parquet'
target_file_path = Path(train_dir_path) / target_file
target_file_test_path = Path(test_dir_path) / target_file_test

In [55]:
# Base table first, then left-join the static features onto it by case_id so
# every base row is kept; IS_TRAIN = 1 marks these rows as the training split.
df_base_train = read_file(target_file_path)
df_train = (
    df_base_train
    .join(read_files(train_file_path), how="left", on="case_id")
    .with_columns(IS_TRAIN = pl.lit(1))
)

In [56]:
# Same steps as for train, with two differences: an all-null `target` column is
# added to the base table (presumably so the schema lines up with train for the
# later concat), and the rows are flagged IS_TRAIN = 0.
df_base_test = read_file(target_file_test_path).with_columns(target = None)
df_test = (
    df_base_test
    .join(read_files(test_file_path), how="left", on="case_id")
    .with_columns(IS_TRAIN = pl.lit(0))
)

In [57]:
# Stack train and test rows into one frame so later transforms are applied to
# both; the IS_TRAIN column tells the two splits apart.
data = pl.concat([df_train, df_test], how="vertical_relaxed")

In [89]:
# The per-split frames are no longer needed once `data` holds the combined rows.
del df_train, df_test

In [58]:
def filter_cols(df, null_threshold=0.95, min_positive=1000):
    """Drop feature columns that carry almost no information in the training rows.

    All statistics are computed on rows with ``IS_TRAIN == 1`` only; the drop is
    applied to the whole frame. Two passes are made:

    1. A column whose null fraction exceeds ``null_threshold`` is dropped,
       unless at least ``min_positive`` training rows have both a non-null
       value in that column and ``target == 1``.
    2. A String column with a single distinct value is dropped.

    ``case_id``, ``WEEK_NUM``, ``date_decision``, ``MONTH``, ``IS_TRAIN`` and
    ``target`` are never considered for dropping.

    Parameters
    ----------
    df : polars.DataFrame
        Combined frame; must contain ``IS_TRAIN`` and ``target``.
    null_threshold : float, default 0.95
        Null fraction above which a column becomes a drop candidate.
    min_positive : int, default 1000
        Minimum number of non-null positive-target rows that rescues a
        mostly-null column from being dropped.

    Returns
    -------
    polars.DataFrame
        ``df`` without the dropped columns. Progress is reported via ``print``.
    """
    protected = ("case_id", "WEEK_NUM", "date_decision", "MONTH", "IS_TRAIN", "target")
    dt_train = df.filter(pl.col('IS_TRAIN') == 1)

    for col in df.columns:
        if col in protected:
            continue

        isnull = dt_train[col].is_null().mean()
        # mean() yields None when there are no training rows; nothing to judge then.
        if isnull is None or isnull <= null_threshold:
            continue

        print(f'col [{col}] to be null dropped...')
        # The comparison must be parenthesised: `&` binds tighter than `==`, so
        # `a & b == 1` would be evaluated as `(a & b) == 1`.
        has_value_and_positive = pl.col(col).is_not_null() & (pl.col("target") == 1)
        num_positive = dt_train.filter(has_value_and_positive).height
        if num_positive < min_positive:
            df = df.drop(col)
            print(f'col [{col}] null dropped...')

    # Second pass runs over the columns that survived the first one.
    for col in df.columns:
        if col not in protected and df[col].dtype == pl.String:
            freq = dt_train[col].n_unique()

            if freq == 1:
                df = df.drop(col)
                print(f'col [{col}] freq dropped...')

    return df

In [59]:
# Remove mostly-null and single-valued columns (judged on the training rows).
data = data.pipe(filter_cols)

col [clientscnt_136L] to be null dropped...
col [clientscnt_136L] null dropped...
col [equalityempfrom_62L] to be null dropped...
col [interestrategrace_34L] to be null dropped...
col [interestrategrace_34L] null dropped...
col [isbidproductrequest_292L] to be null dropped...
col [isbidproductrequest_292L] null dropped...
col [lastdependentsnum_448L] to be null dropped...
col [lastdependentsnum_448L] null dropped...
col [lastotherinc_902A] to be null dropped...
col [lastotherinc_902A] null dropped...
col [lastotherlnsexpense_631A] to be null dropped...
col [lastotherlnsexpense_631A] null dropped...
col [lastrepayingdate_696D] to be null dropped...
col [lastrepayingdate_696D] null dropped...
col [maxannuity_4075009A] to be null dropped...
col [payvacationpostpone_4187118D] to be null dropped...
col [payvacationpostpone_4187118D] null dropped...


In [89]:
def handle_date(df):
    """Replace each date feature with its offset in days from ``date_decision``.

    Every column whose name ends in ``D`` is overwritten, under the same name,
    by ``column - date_decision`` expressed in whole days (negative when the
    date lies before the decision date). ``date_decision`` itself is kept.

    Parameters
    ----------
    df : polars.DataFrame
        Frame containing ``date_decision`` and the ``*D`` date columns.

    Returns
    -------
    polars.DataFrame
        Frame with the date columns converted. Each converted column is
        announced via ``print``.
    """
    date_cols = [name for name in df.columns if name[-1] == "D"]
    for col in date_cols:
        print(f'col [{col}] is date...')
        days_from_decision = (pl.col(col) - pl.col("date_decision")).dt.total_days()
        df = df.with_columns(days_from_decision.alias(col))

    return df


def handle_category(df):
    """Encode the categorical (``*M``) columns numerically.

    The encoders are fitted on the training rows (``IS_TRAIN == 1``) only and
    then applied to every row. For each column whose name ends in ``M``
    (``WEEK_NUM`` excluded):

    * at most 10 distinct values in the training rows -> one-hot encoding; the
      indicator columns are appended and the source column removed;
    * otherwise -> target encoding against ``target``; the result is stored as
      ``<col>_target_encoder`` and the source column removed.

    Parameters
    ----------
    df : polars.DataFrame
        Frame containing ``IS_TRAIN``, ``target`` and the categorical columns.

    Returns
    -------
    polars.DataFrame
        Frame with the categorical columns replaced by their encodings. Each
        encoded column is announced via ``print``.
    """
    train_rows = df.filter(pl.col('IS_TRAIN') == 1)
    # Take the column list up front: df gains and loses columns inside the loop.
    categorical_cols = [
        name for name in df.columns
        if name[-1] == "M" and name != 'WEEK_NUM'
    ]

    for col in categorical_cols:
        cardinality = train_rows[col].n_unique()

        if cardinality > 10:
            print(f'col [{col}] target encoded...')
            encoder = TargetEncoder(cols=[col], handle_unknown='value', handle_missing='value')
            encoder.fit(train_rows[col].to_pandas(), train_rows['target'].to_pandas())
            encoded = pl.from_pandas(encoder.transform(df[col].to_pandas())).to_series()
            df = df.with_columns(encoded.alias(col + '_target_encoder'))
        else:
            print(f'col [{col}] one hot encoded...')
            encoder = OneHotEncoder(cols=[col], handle_unknown='indicator', handle_missing='indicator', use_cat_names=True)
            encoder.fit(train_rows[col].to_pandas())
            indicators = pl.from_pandas(encoder.transform(df[col].to_pandas()))
            df = pl.concat([df, indicators], how="horizontal")

        # Either way the raw categorical column is no longer needed.
        df = df.drop(col)

    return df

In [61]:
# Encode the *M categorical columns (one-hot or target encoding, see handle_category).
data = data.pipe(handle_category)

col [lastapprcommoditycat_1041M] target encoded...
col [lastapprcommoditytypec_5251766M] target encoded...
col [lastcancelreason_561M] target encoded...
col [lastrejectcommoditycat_161M] target encoded...
col [lastrejectcommodtypec_5251769M] target encoded...
col [lastrejectreason_759M] target encoded...
col [lastrejectreasonclient_4145040M] target encoded...
col [previouscontdistrict_112M] target encoded...


In [90]:
# Turn the *D date columns into day offsets relative to date_decision.
data = data.pipe(handle_date)

col [datefirstoffer_1144D] is date...
col [datelastinstal40dpd_247D] is date...
col [datelastunpaid_3546854D] is date...
col [dtlastpmtallstes_4499206D] is date...
col [firstclxcampaign_1125D] is date...
col [firstdatedue_489D] is date...
col [lastactivateddate_801D] is date...
col [lastapplicationdate_877D] is date...
col [lastapprdate_640D] is date...
col [lastdelinqdate_224D] is date...
col [lastrejectdate_50D] is date...
col [maxdpdinstldate_3546855D] is date...
col [validfrom_1069D] is date...


In [94]:
def handle_cross(df):
    """Find groups of columns that share a name prefix and a type suffix.

    Only columns whose last character is ``A``, ``P``, ``L`` or ``T`` are
    considered. Each such column is keyed by ``<prefix>_<suffix>``, where the
    prefix is the text before the first underscore and the suffix is the last
    character of the name.

    Parameters
    ----------
    df : object with a ``columns`` attribute
        Typically a polars DataFrame; only ``df.columns`` is read.

    Returns
    -------
    list[tuple[str, list[str]]]
        ``(key, column_names)`` pairs for every key shared by more than one
        column, in order of first appearance.
    """
    groups = {}
    for name in df.columns:
        kind = name[-1]
        if kind not in ('A', 'P', 'L', 'T'):
            continue
        key = name.partition('_')[0] + '_' + kind
        groups.setdefault(key, []).append(name)

    return [(key, members) for key, members in groups.items() if len(members) > 1]

In [95]:
# Note: despite the name, handle_cross returns a list of (key, columns) tuples, not a dict.
tmp_dict = handle_cross(data)

In [96]:
# Inspect the column groups that share a prefix and type suffix.
tmp_dict

[('applicationscnt_L',
  ['applicationscnt_1086L',
   'applicationscnt_464L',
   'applicationscnt_629L',
   'applicationscnt_867L']),
 ('clientscnt_L',
  ['clientscnt_100L',
   'clientscnt_1022L',
   'clientscnt_1071L',
   'clientscnt_1130L',
   'clientscnt_157L',
   'clientscnt_257L',
   'clientscnt_304L',
   'clientscnt_360L',
   'clientscnt_493L',
   'clientscnt_533L',
   'clientscnt_887L',
   'clientscnt_946L']),
 ('maxannuity_A', ['maxannuity_159A', 'maxannuity_4075009A'])]

In [106]:
def derived_features_between_cols(df, cols_list):
    """Add difference and ratio features for pairs of columns.

    For every pair ``(col1, col2)`` two columns are appended:

    * ``<col2>_minus_<col1>`` = ``col2 - col1``
    * ``<col2>_div_<col1>``   = ``col2 / col1``

    Parameters
    ----------
    df : polars.DataFrame
        Frame containing the referenced columns.
    cols_list : sequence of sequences
        Each item supplies ``col1`` at index 0 and ``col2`` at index 1.

    Returns
    -------
    polars.DataFrame
        Frame with the derived columns added; ``df`` unchanged when
        ``cols_list`` is empty.
    """
    for pair in cols_list:
        first, second = pair[0], pair[1]
        difference = (pl.col(second) - pl.col(first)).alias(second + '_minus_' + first)
        ratio = (pl.col(second) / pl.col(first)).alias(second + '_div_' + first)
        df = df.with_columns(difference, ratio)

    return df

In [116]:
# Add maxannuity_4075009A minus / divided by maxannuity_159A as two new columns.
data = derived_features_between_cols(data, [['maxannuity_159A', 'maxannuity_4075009A']])

In [None]:
# tmp.select('maxannuity_159A', 'maxannuity_4075009A', 'maxannuity_4075009A_minus_maxannuity_159A', 'maxannuity_4075009A_div_maxannuity_159A') \
#     .filter(pl.col('maxannuity_159A').is_not_null() , pl.col('maxannuity_4075009A').is_not_null())

In [121]:
# Preview the first rows after encoding and feature derivation.
data.head()

case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_157L,clientscnt_257L,clientscnt_304L,clientscnt_360L,…,numnotactivated_1143L,numpmtchanneldd_318L,numrejects9m_859L,opencred_647L,paytype1st_925L,paytype_783L,pctinstlsallpaidearl3d_427L,pctinstlsallpaidlat10d_839L,pctinstlsallpaidlate1d_3546856L,pctinstlsallpaidlate4d_3546849L,pctinstlsallpaidlate6d_3546844L,pmtnum_254L,posfpd10lastmonth_333P,posfpd30lastmonth_3976960P,posfstqpd30lastmonth_3976962P,price_1097A,sellerplacecnt_915L,sellerplacescnt_216L,sumoutstandtotal_3546847A,sumoutstandtotalest_4493215A,totaldebt_9A,totalsettled_863A,totinstallast1m_4525188A,twobodfilling_608L,typesuite_864L,validfrom_1069D,IS_TRAIN,lastapprcommoditycat_1041M_target_encoder,lastapprcommoditytypec_5251766M_target_encoder,lastcancelreason_561M_target_encoder,lastrejectcommoditycat_161M_target_encoder,lastrejectcommodtypec_5251769M_target_encoder,lastrejectreason_759M_target_encoder,lastrejectreasonclient_4145040M_target_encoder,previouscontdistrict_112M_target_encoder,maxannuity_4075009A_minus_maxannuity_159A,maxannuity_4075009A_div_maxannuity_159A
i64,date,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,bool,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,i64,i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0,2019-01-03,201901,0,0,,,1917.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,,"""OTHER""","""OTHER""",,,,,,24.0,0.0,0.0,,,0.0,0.0,,,0.0,0.0,,"""BO""",,,1,0.031076,0.031622,0.024471,0.029607,0.031503,0.02214,0.022528,0.029796,,
1,2019-01-03,201901,0,0,,,3134.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,…,0.0,0.0,0.0,,"""OTHER""","""OTHER""",,,,,,18.0,0.0,0.0,,,0.0,0.0,,,0.0,0.0,,"""BO""",,,1,0.031076,0.031622,0.024471,0.029607,0.031503,0.02214,0.022528,0.029796,,
2,2019-01-04,201901,0,0,,,4937.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,False,"""OTHER""","""OTHER""",,,,,,36.0,0.0,0.0,,,0.0,0.0,,,0.0,0.0,,"""BO""","""AL""",,1,0.031076,0.031622,0.024471,0.029607,0.031503,0.02214,0.022528,0.029796,,
3,2019-01-03,201901,0,0,,,4643.6,0.0,0.0,1.0,0.0,2.0,0.0,1.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,…,0.0,0.0,1.0,False,"""OTHER""","""OTHER""",,,,,,12.0,0.0,0.0,,,1.0,1.0,,,0.0,0.0,,"""BO""","""AL""",,1,0.031076,0.031622,0.054874,0.029607,0.031503,0.043615,0.022528,0.029796,,
4,2019-01-04,201901,0,1,,,3390.2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,False,"""OTHER""","""OTHER""",,,,,,24.0,0.0,0.0,,,0.0,0.0,,,0.0,0.0,,"""BO""","""AL""",,1,0.031076,0.031622,0.049017,0.029607,0.031503,0.02214,0.022528,0.029796,,


In [122]:
# Remove the decision date, MONTH, WEEK_NUM and target before saving;
# case_id and IS_TRAIN are kept.
data = data.drop('date_decision', 'MONTH', 'WEEK_NUM', 'target')

In [125]:
# Check the final (rows, columns) size of the frame about to be written.
data.shape

(1526669, 163)

In [126]:
# Destination for the preprocessed static_0 table.
preprocess_dir_path = "/Users/sophie/workspace/data/kaggle/home-credit-credit-risk-model-stability/preprocess"
preprocess_file = 'static_0.parquet'
preprocess_file_path = Path(preprocess_dir_path) / preprocess_file
# Create the output folder if needed so the write cannot fail on a missing
# directory after the whole pipeline has already run.
preprocess_file_path.parent.mkdir(parents=True, exist_ok=True)
data.write_parquet(preprocess_file_path)