In [1]:
import warnings
import os
from glob import glob
from pathlib import Path
import operator

import pandas as pd
import polars as pl
import polars.selectors as cs
from sklearn import preprocessing
from category_encoders import OneHotEncoder
from category_encoders.target_encoder import TargetEncoder

pd.set_option('display.float_format',lambda x : '%.6f' % x)
warnings.filterwarnings('ignore')

In [2]:
def set_table_dtypes(df):
    for col in df.columns:
        if col in ["case_id", "WEEK_NUM", "num_group1", "num_group2", "target"]:
            df = df.with_columns(pl.col(col).cast(pl.Int64))
        elif col in ["date_decision"]:
            df = df.with_columns(pl.col(col).cast(pl.Date))
        elif col[-1] in ("P", "A"):
            df = df.with_columns(pl.col(col).cast(pl.Float64))
        elif col[-1] in ("M",):
            df = df.with_columns(pl.col(col).cast(pl.String))
        elif col[-1] in ("D",):
            df = df.with_columns(pl.col(col).cast(pl.Date))

    return df

In [3]:
def read_file(path, depth=None):
    df = pl.read_parquet(path)
    df = df.pipe(set_table_dtypes)
    return df

def read_files(regex_path, depth=None):
    chunks = []
    for path in glob(str(regex_path)):
        chunks.append(pl.read_parquet(path).pipe(set_table_dtypes))

    df = pl.concat(chunks, how="vertical_relaxed")
    return df

In [23]:
train_dir_path = "/Users/sophie/workspace/data/kaggle/home-credit-credit-risk-model-stability/parquet_files/train"
test_dir_path = "/Users/sophie/workspace/data/kaggle/home-credit-credit-risk-model-stability/parquet_files/test"
train_file = 'train_applprev_1_*.parquet'
test_file = 'test_applprev_1_*.parquet'
train_file_path = Path(os.path.join(train_dir_path, train_file))
test_file_path = Path(os.path.join(test_dir_path, test_file))

target_file = 'train_base.parquet'
target_file_path = Path(os.path.join(train_dir_path, target_file))
target_file_test = 'test_base.parquet'
target_file_test_path = Path(os.path.join(test_dir_path, target_file_test))

In [24]:
df_base_train = read_file(target_file_path)
df_base_train = df_base_train.with_columns(IS_TRAIN = pl.lit(1))
df_train = read_files(train_file_path)
df_train = df_base_train.join(df_train, how="left", on="case_id")

In [25]:
df_base_test = read_file(target_file_test_path)
df_base_test = df_base_test.with_columns(target = None)
df_base_test = df_base_test.with_columns(IS_TRAIN = pl.lit(0))
df_test = read_files(test_file_path)
df_test = df_base_test.join(df_test, how="left", on="case_id")

In [26]:
data = pl.concat([df_train, df_test], how="vertical_relaxed")

In [27]:
data_base = pl.concat([df_base_train, df_base_test], how="vertical_relaxed")

In [28]:
del df_train, df_test

In [31]:
data_base.head()

case_id,date_decision,MONTH,WEEK_NUM,target,IS_TRAIN
i64,date,i64,i64,i64,i32
0,2019-01-03,201901,0,0,1
1,2019-01-03,201901,0,0,1
2,2019-01-04,201901,0,0,1
3,2019-01-03,201901,0,0,1
4,2019-01-04,201901,0,1,1


In [34]:
# df_train.filter(pl.col("case_id") == 57633)

case_id,date_decision,MONTH,WEEK_NUM,target,actualdpd_943P,annuity_853A,approvaldate_319D,byoccupationinc_3656910L,cancelreason_3545846M,childnum_21L,creationdate_885D,credacc_actualbalance_314A,credacc_credlmt_575A,credacc_maxhisbal_375A,credacc_minhisbal_90A,credacc_status_367L,credacc_transactions_402L,credamount_590A,credtype_587L,currdebt_94A,dateactivated_425D,district_544M,downpmt_134A,dtlastpmt_581D,dtlastpmtallstes_3545839D,education_1138M,employedfrom_700D,familystate_726L,firstnonzeroinstldate_307D,inittransactioncode_279L,isbidproduct_390L,isdebitcard_527L,mainoccupationinc_437A,maxdpdtolerance_577P,num_group1,outstandingdebt_522A,pmtnum_8L,postype_4733339M,profession_152M,rejectreason_755M,rejectreasonclient_4145042M,revolvingaccount_394A,status_219L,tenor_203L,IS_TRAIN
i64,date,i64,i64,i64,f64,f64,date,f64,str,f64,date,f64,f64,f64,f64,str,f64,f64,str,f64,date,str,f64,date,date,str,date,str,date,str,bool,bool,f64,f64,i64,f64,f64,str,str,str,str,f64,str,f64,i32


In [32]:
class Aggregator:
    @staticmethod
    def num_expr(df):
        cols = [col for col in df.columns if col[-1] in ("P", "A")]

        expr_num = []
        expr_num.extend([pl.max(col).alias(f"max_{col}") for col in cols])
        expr_num.extend([pl.min(col).alias(f"min_{col}") for col in cols])
        expr_num.extend([pl.mean(col).alias(f"mean_{col}") for col in cols])
        # expr_num.extend([pl.count(col).alias(f"count_{col}") for col in cols])
        expr_num.extend([pl.var(col).alias(f"var_{col}") for col in cols])
        expr_num.extend([pl.last(col).alias(f"last_{col}") for col in cols])
        expr_num.extend([pl.first(col).alias(f"first_{col}") for col in cols])

        return expr_num

    @staticmethod
    def date_expr(df):
        cols = [col for col in df.columns if col[-1] in ("D",)]

        expr_date = []
        expr_date.extend([pl.max(col).alias(f"max_{col}") for col in cols])
        expr_date.extend([pl.min(col).alias(f"min_{col}") for col in cols])

        return expr_date

    @staticmethod
    def str_expr(df):
        cols = [col for col in df.columns if col[-1] in ("M",)]
        cols.remove('WEEK_NUM')

        expr_str = []
        expr_str.extend([pl.last(col).alias(f"last_{col}") for col in cols])
        expr_str.extend([pl.first(col).alias(f"first_{col}") for col in cols])
        expr_str.extend([pl.n_unique(col).alias(f"n_unique_{col}") for col in cols])

        return expr_str

    @staticmethod
    def other_expr(df):
        cols = [col for col in df.columns if col[-1] in ("T", "L")]

        expr_other = []
        expr_other.extend([pl.max(col).alias(f"max_{col}") for col in cols])
        expr_other.extend([pl.min(col).alias(f"min_{col}") for col in cols])
        expr_other.extend([pl.mean(col).alias(f"mean_{col}") for col in cols])
        expr_other.extend([pl.var(col).alias(f"var_{col}") for col in cols])
        expr_other.extend([pl.last(col).alias(f"last_{col}") for col in cols])
        expr_other.extend([pl.first(col).alias(f"first_{col}") for col in cols])

        return expr_other

    @staticmethod
    def count_expr(df):
        cols = [col for col in df.columns if "num_group" in col]

        expr_count = [pl.max(col).alias(f"max_{col}") for col in cols]

        return expr_count

    @staticmethod
    def get_exprs(df):
        exprs = Aggregator.num_expr(df) + \
                Aggregator.date_expr(df) + \
                Aggregator.str_expr(df) + \
                Aggregator.other_expr(df) + \
                Aggregator.count_expr(df)

        return exprs

In [33]:
fea_agg = data.group_by("case_id").agg(Aggregator.get_exprs(data))

In [34]:
fea_agg.shape

(1526669, 187)

In [86]:
data = data_base.join(fea_agg, how="left", on="case_id")

In [87]:
data.shape

(1526669, 192)

In [88]:
def filter_cols(df):
    dt_train = df.filter(pl.col('IS_TRAIN') == 1)
    for col in df.columns:
        if col not in ["case_id", "WEEK_NUM", "date_decision", "MONTH", "IS_TRAIN", "target"]:
            isnull = dt_train[col].is_null().mean()

            if isnull > 0.95:
                print(f'col [{col}] to be null dropped...')
                num_positive = dt_train.filter(pl.col(col).is_not_null() & pl.col("target") == 1).shape[0]
                if num_positive < 1000:
                    df = df.drop(col)
                    print(f'col [{col}] null dropped...')

    for col in df.columns:
        if (col not in ["case_id", "WEEK_NUM", "date_decision", "MONTH", "IS_TRAIN", "target"]) & (df[col].dtype == pl.String):
            freq = dt_train[col].n_unique()

            if freq == 1:
                df = df.drop(col)
                print(f'col [{col}] freq dropped...')

    return df

In [89]:
data = data.pipe(filter_cols)

col [var_credacc_actualbalance_314A] to be null dropped...
col [var_credacc_maxhisbal_375A] to be null dropped...
col [var_credacc_minhisbal_90A] to be null dropped...
col [var_revolvingaccount_394A] to be null dropped...
col [var_revolvingaccount_394A] null dropped...
col [last_revolvingaccount_394A] to be null dropped...
col [last_revolvingaccount_394A] null dropped...
col [first_credacc_actualbalance_314A] to be null dropped...
col [first_credacc_maxhisbal_375A] to be null dropped...
col [first_credacc_minhisbal_90A] to be null dropped...
col [mean_credacc_status_367L] to be null dropped...
col [mean_credacc_status_367L] null dropped...
col [mean_credtype_587L] to be null dropped...
col [mean_credtype_587L] null dropped...
col [mean_familystate_726L] to be null dropped...
col [mean_familystate_726L] null dropped...
col [mean_inittransactioncode_279L] to be null dropped...
col [mean_inittransactioncode_279L] null dropped...
col [mean_status_219L] to be null dropped...
col [mean_statu

In [96]:
data.shape

(1526669, 180)

In [97]:
def handle_cross(df):
    res_dict = {}
    for col in df.columns:
        suffix = col[-1]
        if suffix in ['A', 'P'] or (suffix in ['L', 'T'] and (df[col].dtype in [pl.Int64, pl.Float64])):
            prefix = col.split('_')[1]
            fea = prefix + '_' + suffix
            if fea not in res_dict:
                res_dict[fea] = [col]
            else:
                res_dict[fea].append(col)

    return [(k, v) for k, v in res_dict.items() if len(v)>1]

In [98]:
tmp_dict = handle_cross(data)
tmp_dict

[('actualdpd_P',
  ['max_actualdpd_943P',
   'min_actualdpd_943P',
   'mean_actualdpd_943P',
   'var_actualdpd_943P',
   'last_actualdpd_943P',
   'first_actualdpd_943P']),
 ('annuity_A',
  ['max_annuity_853A',
   'min_annuity_853A',
   'mean_annuity_853A',
   'var_annuity_853A',
   'last_annuity_853A',
   'first_annuity_853A']),
 ('credacc_A',
  ['max_credacc_actualbalance_314A',
   'max_credacc_credlmt_575A',
   'max_credacc_maxhisbal_375A',
   'max_credacc_minhisbal_90A',
   'min_credacc_actualbalance_314A',
   'min_credacc_credlmt_575A',
   'min_credacc_maxhisbal_375A',
   'min_credacc_minhisbal_90A',
   'mean_credacc_actualbalance_314A',
   'mean_credacc_credlmt_575A',
   'mean_credacc_maxhisbal_375A',
   'mean_credacc_minhisbal_90A',
   'var_credacc_actualbalance_314A',
   'var_credacc_credlmt_575A',
   'var_credacc_maxhisbal_375A',
   'var_credacc_minhisbal_90A',
   'last_credacc_actualbalance_314A',
   'last_credacc_credlmt_575A',
   'last_credacc_maxhisbal_375A',
   'last_cred

In [99]:
derive_list = []
for kvs in tmp_dict:
    for item in kvs[1]:
        if item.startswith('max') and (item.replace('max', 'min') in kvs[1]):
            derive_list.append((item.replace('max', 'min'), item))
        elif item.startswith('last') and (item.replace('last', 'first') in kvs[1]):
            derive_list.append((item.replace('last', 'first'), item))
        else:
            pass

derive_list

[('min_actualdpd_943P', 'max_actualdpd_943P'),
 ('first_actualdpd_943P', 'last_actualdpd_943P'),
 ('min_annuity_853A', 'max_annuity_853A'),
 ('first_annuity_853A', 'last_annuity_853A'),
 ('min_credacc_actualbalance_314A', 'max_credacc_actualbalance_314A'),
 ('min_credacc_credlmt_575A', 'max_credacc_credlmt_575A'),
 ('min_credacc_minhisbal_90A', 'max_credacc_minhisbal_90A'),
 ('first_credacc_actualbalance_314A', 'last_credacc_actualbalance_314A'),
 ('first_credacc_credlmt_575A', 'last_credacc_credlmt_575A'),
 ('first_credacc_maxhisbal_375A', 'last_credacc_maxhisbal_375A'),
 ('first_credacc_minhisbal_90A', 'last_credacc_minhisbal_90A'),
 ('min_credamount_590A', 'max_credamount_590A'),
 ('first_credamount_590A', 'last_credamount_590A'),
 ('min_currdebt_94A', 'max_currdebt_94A'),
 ('first_currdebt_94A', 'last_currdebt_94A'),
 ('min_downpmt_134A', 'max_downpmt_134A'),
 ('first_downpmt_134A', 'last_downpmt_134A'),
 ('min_mainoccupationinc_437A', 'max_mainoccupationinc_437A'),
 ('first_mainoc

In [100]:
def derived_features_between_cols(df, cols_list):
    for item in cols_list:
        print(item)
        col1 = item[0]
        col2 = item[1]

        df = df.with_columns((pl.col(col2) - pl.col(col1)).alias(col2 + '_minus_' + col1))
        df = df.with_columns(operator.truediv(pl.col(col2), pl.col(col1)).alias(col2 + '_div_' + col1))

    return df

data = derived_features_between_cols(data, derive_list)

('min_actualdpd_943P', 'max_actualdpd_943P')
('first_actualdpd_943P', 'last_actualdpd_943P')
('min_annuity_853A', 'max_annuity_853A')
('first_annuity_853A', 'last_annuity_853A')
('min_credacc_actualbalance_314A', 'max_credacc_actualbalance_314A')
('min_credacc_credlmt_575A', 'max_credacc_credlmt_575A')
('min_credacc_minhisbal_90A', 'max_credacc_minhisbal_90A')
('first_credacc_actualbalance_314A', 'last_credacc_actualbalance_314A')
('first_credacc_credlmt_575A', 'last_credacc_credlmt_575A')
('first_credacc_maxhisbal_375A', 'last_credacc_maxhisbal_375A')
('first_credacc_minhisbal_90A', 'last_credacc_minhisbal_90A')
('min_credamount_590A', 'max_credamount_590A')
('first_credamount_590A', 'last_credamount_590A')
('min_currdebt_94A', 'max_currdebt_94A')
('first_currdebt_94A', 'last_currdebt_94A')
('min_downpmt_134A', 'max_downpmt_134A')
('first_downpmt_134A', 'last_downpmt_134A')
('min_mainoccupationinc_437A', 'max_mainoccupationinc_437A')
('first_mainoccupationinc_437A', 'last_mainoccupati

In [101]:
data.shape

(1526669, 246)

In [106]:
def handle_date(df):
    for col in df.columns:
        if col[-1] in ("D",) and df[col].dtype == pl.Date:
            print(f'col [{col}] is date...')
            df = df.with_columns(pl.col(col) - pl.col("date_decision"))
            df = df.with_columns(pl.col(col).dt.total_days())

    # df = df.drop("date_decision", "MONTH")
    return df


def handle_category(df):
    dt_train = df.filter(pl.col('IS_TRAIN') == 1)
    cols_list = df.columns
    for col in cols_list:
        if ((col[-1] in ("M",)) and ('WEEK_NUM' not in col) and (not col.startswith('n_unique'))) \
                or (col[-1] in ("L", "P") and (df[col].dtype == pl.String)):
            freq = dt_train[col].n_unique()
            if freq <= 10:
                print(f'col [{col}] one hot encoded...')
                encoder = OneHotEncoder(cols=[col], handle_unknown='indicator', handle_missing='indicator', use_cat_names=True)
                encoder.fit(dt_train[col].to_pandas())
                res_df = pl.from_pandas(encoder.transform(df[col].to_pandas()))
                df = pl.concat([df, res_df], how="horizontal")
                df = df.drop(col)
            else:
                print(f'col [{col}] target encoded...')
                encoder = TargetEncoder(cols=[col], handle_unknown='value', handle_missing='value') \
                            .fit(dt_train[col].to_pandas(), dt_train['target'].to_pandas())
                df = df.with_columns(pl.from_pandas(encoder.transform(df[col].to_pandas())).to_series().alias(col + '_target_encoder'))
                df = df.drop(col)

    return df

In [103]:
data = data.pipe(handle_category)

col [last_cancelreason_3545846M] target encoded...
col [last_district_544M] target encoded...
col [last_education_1138M] one hot encoded...
col [last_postype_4733339M] one hot encoded...
col [last_profession_152M] target encoded...
col [last_rejectreason_755M] target encoded...
col [last_rejectreasonclient_4145042M] target encoded...
col [first_cancelreason_3545846M] target encoded...
col [first_district_544M] target encoded...
col [first_education_1138M] one hot encoded...
col [first_postype_4733339M] one hot encoded...
col [first_profession_152M] target encoded...
col [first_rejectreason_755M] target encoded...
col [first_rejectreasonclient_4145042M] target encoded...
col [max_credacc_status_367L] one hot encoded...
col [max_credtype_587L] one hot encoded...
col [max_familystate_726L] one hot encoded...
col [max_inittransactioncode_279L] one hot encoded...
col [max_status_219L] target encoded...
col [min_credacc_status_367L] one hot encoded...
col [min_credtype_587L] one hot encoded.

In [107]:
data = data.pipe(handle_date)

col [max_approvaldate_319D] is date...
col [max_creationdate_885D] is date...
col [max_dateactivated_425D] is date...
col [max_dtlastpmt_581D] is date...
col [max_dtlastpmtallstes_3545839D] is date...
col [max_employedfrom_700D] is date...
col [max_firstnonzeroinstldate_307D] is date...
col [min_approvaldate_319D] is date...
col [min_creationdate_885D] is date...
col [min_dateactivated_425D] is date...
col [min_dtlastpmt_581D] is date...
col [min_dtlastpmtallstes_3545839D] is date...
col [min_employedfrom_700D] is date...
col [min_firstnonzeroinstldate_307D] is date...


In [None]:
# data.select('max_familystate_726L',).filter(pl.col('max_familystate_726L').is_not_null())

In [112]:
data.head(10)

case_id,IS_TRAIN,max_actualdpd_943P,max_annuity_853A,max_credacc_actualbalance_314A,max_credacc_credlmt_575A,max_credacc_maxhisbal_375A,max_credacc_minhisbal_90A,max_credamount_590A,max_currdebt_94A,max_downpmt_134A,max_mainoccupationinc_437A,max_maxdpdtolerance_577P,max_outstandingdebt_522A,max_revolvingaccount_394A,min_actualdpd_943P,min_annuity_853A,min_credacc_actualbalance_314A,min_credacc_credlmt_575A,min_credacc_maxhisbal_375A,min_credacc_minhisbal_90A,min_credamount_590A,min_currdebt_94A,min_downpmt_134A,min_mainoccupationinc_437A,min_maxdpdtolerance_577P,min_outstandingdebt_522A,min_revolvingaccount_394A,mean_actualdpd_943P,mean_annuity_853A,mean_credacc_actualbalance_314A,mean_credacc_credlmt_575A,mean_credacc_maxhisbal_375A,mean_credacc_minhisbal_90A,mean_credamount_590A,mean_currdebt_94A,mean_downpmt_134A,…,last_familystate_726L_MARRIED,last_familystate_726L_DIVORCED,last_familystate_726L_WIDOWED,last_familystate_726L_LIVING_WITH_PARTNER,last_familystate_726L_-1,last_inittransactioncode_279L_nan,last_inittransactioncode_279L_CASH,last_inittransactioncode_279L_POS,last_inittransactioncode_279L_NDF,last_inittransactioncode_279L_-1,last_status_219L_target_encoder,first_credacc_status_367L_nan,first_credacc_status_367L_CL,first_credacc_status_367L_AC,first_credacc_status_367L_CA,first_credacc_status_367L_CR,first_credacc_status_367L_PO,first_credacc_status_367L_PCL,first_credacc_status_367L_-1,first_credtype_587L_nan,first_credtype_587L_CAL,first_credtype_587L_COL,first_credtype_587L_REL,first_credtype_587L_-1,first_familystate_726L_nan,first_familystate_726L_SINGLE,first_familystate_726L_MARRIED,first_familystate_726L_WIDOWED,first_familystate_726L_DIVORCED,first_familystate_726L_LIVING_WITH_PARTNER,first_familystate_726L_-1,first_inittransactioncode_279L_nan,first_inittransactioncode_279L_CASH,first_inittransactioncode_279L_POS,first_inittransactioncode_279L_NDF,first_inittransactioncode_279L_-1,first_status_219L_target_encoder
i64,i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64
0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,0,0,0,0,0,1,0,0,0,0,0.022015,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0.022013
1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,0,0,0,0,0,1,0,0,0,0,0.022015,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0.022013
2,1,0.0,1682.4,,0.0,,,16000.0,,0.0,8200.0,,,,0.0,640.2,,0.0,,,10000.0,,0.0,8200.0,,,,0.0,1161.3,,0.0,,,13000.0,,0.0,…,0,0,0,0,0,0,1,0,0,0,0.045342,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0.051011
3,1,0.0,6140.0,,0.0,,,59999.8,,0.0,11000.0,,,,0.0,6140.0,,0.0,,,59999.8,,0.0,11000.0,,,,0.0,6140.0,,0.0,,,59999.8,,0.0,…,1,0,0,0,0,0,1,0,0,0,0.045342,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0.051011
4,1,0.0,2556.6,,0.0,,,40000.0,,0.0,16000.0,,,,0.0,2556.6,,0.0,,,40000.0,,0.0,16000.0,,,,0.0,2556.6,,0.0,,,40000.0,,0.0,…,0,0,0,0,0,0,1,0,0,0,0.029584,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0.028031
5,1,0.0,,,,,,,,,62000.0,,,,0.0,,,,,,,,,62000.0,,,,0.0,,,,,,,,,…,0,0,0,0,0,1,0,0,0,0,0.029584,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0.028031
6,1,0.0,4189.6,,0.0,,,32000.0,0.0,0.0,37000.0,,0.0,,0.0,1110.4,,0.0,,,15980.0,0.0,0.0,14000.0,,0.0,,0.0,2357.933333,,0.0,,,21786.666667,0.0,0.0,…,0,0,0,0,0,0,0,1,0,0,0.045342,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0.051011
7,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,0,0,0,0,0,1,0,0,0,0,0.022015,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0.022013
8,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,0,0,0,0,0,1,0,0,0,0,0.022015,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0.022013
9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,0,0,0,0,0,1,0,0,0,0,0.022015,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0.022013


In [110]:
data = data.drop('date_decision', 'MONTH', 'WEEK_NUM', 'target')

In [111]:
data.shape

(1526669, 370)

In [113]:
preprocess_dir_path = "/Users/sophie/workspace/data/kaggle/home-credit-credit-risk-model-stability/preprocess"
preprocess_file = 'applprev_1.parquet'
preprocess_file_path = Path(os.path.join(preprocess_dir_path, preprocess_file))
data.write_parquet(preprocess_file_path)