In [1]:
import warnings
import os
from glob import glob
from pathlib import Path
import operator

import pandas as pd
import polars as pl
import polars.selectors as cs
from sklearn import preprocessing
from category_encoders import OneHotEncoder
from category_encoders.target_encoder import TargetEncoder

pd.set_option('display.float_format',lambda x : '%.6f' % x)
warnings.filterwarnings('ignore')

In [2]:
def set_table_dtypes(df):
    """Cast columns to canonical dtypes chosen from the column name.

    Rules, checked in this order for every column:
      * key/bookkeeping columns (case_id, WEEK_NUM, num_group1, num_group2,
        target) -> Int64
      * date_decision -> Date
      * names ending in "P" or "A" -> Float64
      * names ending in "M" -> String
      * names ending in "D" -> Date
    Columns matching none of the rules keep the dtype they were read with.

    Args:
        df: polars DataFrame.

    Returns:
        A DataFrame with the same columns, re-typed as described above.
    """
    integer_keys = {"case_id", "WEEK_NUM", "num_group1", "num_group2", "target"}
    dtype_by_suffix = {
        "P": pl.Float64,
        "A": pl.Float64,
        "M": pl.String,
        "D": pl.Date,
    }

    casts = []
    for name in df.columns:
        # Explicit names win over the suffix rule (WEEK_NUM ends in "M").
        if name in integer_keys:
            wanted = pl.Int64
        elif name == "date_decision":
            wanted = pl.Date
        else:
            wanted = dtype_by_suffix.get(name[-1])

        if wanted is not None:
            casts.append(pl.col(name).cast(wanted))

    if not casts:
        return df
    return df.with_columns(casts)

In [3]:
def read_file(path, depth=None):
    """Load one parquet file and normalise its column dtypes.

    Args:
        path: Location of the parquet file.
        depth: Not used by this function; accepted for signature compatibility.

    Returns:
        The file contents as a polars DataFrame after `set_table_dtypes`.
    """
    return pl.read_parquet(path).pipe(set_table_dtypes)

def read_files(regex_path, depth=None):
    """Load every parquet file matching a glob pattern into one DataFrame.

    Args:
        regex_path: Glob pattern (str or Path). Despite the name it is passed
            to `glob`, not to a regex engine.
        depth: Not used by this function; accepted for signature compatibility.

    Returns:
        The matched files, each passed through `read_file`, stacked vertically
        with `how="vertical_relaxed"`.

    Raises:
        ValueError: if the pattern matches no file.
    """
    # glob gives no ordering guarantee; sort so the stacked row order is the
    # same on every run and every machine.
    paths = sorted(glob(str(regex_path)))
    if not paths:
        # Fail with the offending pattern instead of an opaque concat error.
        raise ValueError(f"no parquet files match pattern: {regex_path}")

    chunks = [read_file(path) for path in paths]
    return pl.concat(chunks, how="vertical_relaxed")

In [4]:
# Root folder of the competition data. Set HOME_CREDIT_DATA_DIR to run the
# notebook on another machine; the default is the original local location.
data_root = os.environ.get(
    "HOME_CREDIT_DATA_DIR",
    "/Users/sophie/workspace/data/kaggle/home-credit-credit-risk-model-stability",
)
train_dir_path = os.path.join(data_root, "parquet_files", "train")
test_dir_path = os.path.join(data_root, "parquet_files", "test")

# Detail table processed by this notebook.
train_file = 'train_applprev_2.parquet'
test_file = 'test_applprev_2.parquet'
train_file_path = Path(train_dir_path) / train_file
test_file_path = Path(test_dir_path) / test_file

# Base tables (loaded below as the left side of the joins).
target_file = 'train_base.parquet'
target_file_path = Path(train_dir_path) / target_file
target_file_test = 'test_base.parquet'
target_file_test_path = Path(test_dir_path) / target_file_test

In [5]:
# Base table for the training split, flagged with IS_TRAIN = 1.
df_base_train = read_file(target_file_path).with_columns(IS_TRAIN=pl.lit(1))
# Left join keeps every base case, including those without detail rows.
df_train = df_base_train.join(read_file(train_file_path), on="case_id", how="left")

In [6]:
# Base table for the test split: add an all-null target column and flag the
# rows with IS_TRAIN = 0 so the frame lines up with the training one.
df_base_test = (
    read_file(target_file_test_path)
    .with_columns(target=None)
    .with_columns(IS_TRAIN=pl.lit(0))
)
# Left join keeps every base case, including those without detail rows.
df_test = df_base_test.join(read_file(test_file_path), on="case_id", how="left")

In [7]:
# Stack the joined train and test frames; "vertical_relaxed" lets differing
# column dtypes be reconciled instead of raising.
data = pl.concat(
    [df_train, df_test],
    how="vertical_relaxed",
)

In [8]:
# Stack the train and test base tables; this frame is the left side of the
# later join with the aggregated features.
data_base = pl.concat(
    [df_base_train, df_base_test],
    how="vertical_relaxed",
)

In [28]:
# Drop the per-split joined frames; their rows were already concatenated into `data`.
del df_train, df_test

In [9]:
# (rows, columns) of the combined train + test frame after the left joins.
data.shape

(14380640, 11)

In [17]:
# Preview the first rows of the combined frame.
data.head(100)

case_id,date_decision,MONTH,WEEK_NUM,target,IS_TRAIN,cacccardblochreas_147M,conts_type_509L,credacc_cards_status_52L,num_group1,num_group2
i64,date,i64,i64,i64,i32,str,str,str,i64,i64
0,2019-01-03,201901,0,0,1,,,,,
1,2019-01-03,201901,0,0,1,,,,,
2,2019-01-04,201901,0,0,1,,"""PRIMARY_MOBILE…",,0,0
2,2019-01-04,201901,0,0,1,,"""EMPLOYMENT_PHO…",,0,1
2,2019-01-04,201901,0,0,1,,"""PRIMARY_MOBILE…",,1,0
2,2019-01-04,201901,0,0,1,,"""EMPLOYMENT_PHO…",,1,1
3,2019-01-03,201901,0,0,1,,"""PHONE""",,0,0
3,2019-01-03,201901,0,0,1,,"""PRIMARY_MOBILE…",,0,1
3,2019-01-03,201901,0,0,1,,"""PRIMARY_EMAIL""",,0,2
4,2019-01-04,201901,0,1,1,,"""PRIMARY_MOBILE…",,0,0


In [11]:
# Number of distinct case_id values in the combined frame.
data['case_id'].n_unique()

1526669

In [15]:
# Number of distinct values in cacccardblochreas_147M across the combined frame.
data['cacccardblochreas_147M'].n_unique()

10

In [16]:
# Order rows by case and group indices before aggregating; presumably so that
# the first/last aggregations below follow num_group1/num_group2 order.
data = data.sort(["case_id", "num_group1", "num_group2"])

In [18]:
class Aggregator:
    """Builds the polars aggregation expressions passed to `group_by(...).agg(...)`.

    Columns are routed to an expression family by the last character of their
    name ("P"/"A", "D", "M", "T"/"L") or, for group counters, by containing
    "num_group". Every method takes the frame only to inspect its column names
    (and dtypes in `other_expr`) and returns a list of aliased expressions.
    """

    @staticmethod
    def num_expr(df):
        """Variance, last and first value for columns ending in "P" or "A"."""
        cols = [col for col in df.columns if col[-1] in ("P", "A")]

        expr_num = []
        # max / min / mean / count aggregations are currently switched off.
        expr_num.extend([pl.var(col).alias(f"var_{col}") for col in cols])
        expr_num.extend([pl.last(col).alias(f"last_{col}") for col in cols])
        expr_num.extend([pl.first(col).alias(f"first_{col}") for col in cols])

        return expr_num

    @staticmethod
    def date_expr(df):
        """Max and min for columns ending in "D"."""
        cols = [col for col in df.columns if col[-1] in ("D",)]

        expr_date = []
        expr_date.extend([pl.max(col).alias(f"max_{col}") for col in cols])
        expr_date.extend([pl.min(col).alias(f"min_{col}") for col in cols])

        return expr_date

    @staticmethod
    def str_expr(df):
        """Last, first and distinct-count for columns ending in "M".

        WEEK_NUM also ends in "M" but is a bookkeeping column, so it is
        excluded. It is filtered out rather than `remove`d so frames without a
        WEEK_NUM column do not raise ValueError.
        """
        cols = [col for col in df.columns if col[-1] in ("M",) and col != 'WEEK_NUM']

        expr_str = []
        expr_str.extend([pl.last(col).alias(f"last_{col}") for col in cols])
        expr_str.extend([pl.first(col).alias(f"first_{col}") for col in cols])
        # In this notebook's output an all-null group yields n_unique == 1.
        expr_str.extend([pl.n_unique(col).alias(f"n_unique_{col}") for col in cols])

        return expr_str

    @staticmethod
    def other_expr(df):
        """Dtype-dependent aggregations for columns ending in "T" or "L".

        String/Boolean columns get last, first and distinct-count; Int64/Float64
        columns get mean, variance, last and first. Columns of any other dtype
        produce no expression.
        """
        cols = [col for col in df.columns if col[-1] in ("T", "L")]
        expr_other = []
        for col in cols:
            dtype = df[col].dtype
            if dtype == pl.String or dtype == pl.Boolean:
                expr_other.append(pl.last(col).alias(f"last_{col}"))
                expr_other.append(pl.first(col).alias(f"first_{col}"))
                expr_other.append(pl.n_unique(col).alias(f"n_unique_{col}"))
            elif dtype == pl.Int64 or dtype == pl.Float64:
                expr_other.append(pl.mean(col).alias(f"mean_{col}"))
                expr_other.append(pl.var(col).alias(f"var_{col}"))
                expr_other.append(pl.last(col).alias(f"last_{col}"))
                expr_other.append(pl.first(col).alias(f"first_{col}"))

        return expr_other

    @staticmethod
    def count_expr(df):
        """Max of every column whose name contains "num_group"."""
        cols = [col for col in df.columns if "num_group" in col]

        expr_count = [pl.max(col).alias(f"max_{col}") for col in cols]

        return expr_count

    @staticmethod
    def get_exprs(df):
        """All expressions, ordered: numeric, date, string, other, group counters."""
        exprs = Aggregator.num_expr(df) + \
                Aggregator.date_expr(df) + \
                Aggregator.str_expr(df) + \
                Aggregator.other_expr(df) + \
                Aggregator.count_expr(df)

        return exprs

In [19]:
# Collapse the detail rows to one feature row per case_id.
fea_agg = (
    data
    .group_by("case_id")
    .agg(Aggregator.get_exprs(data))
)

In [20]:
# (rows, columns) of the aggregated features; one row per case_id.
fea_agg.shape

(1526669, 12)

In [21]:
# Preview the aggregated features.
fea_agg.head(20)

case_id,last_cacccardblochreas_147M,first_cacccardblochreas_147M,n_unique_cacccardblochreas_147M,last_conts_type_509L,first_conts_type_509L,n_unique_conts_type_509L,last_credacc_cards_status_52L,first_credacc_cards_status_52L,n_unique_credacc_cards_status_52L,max_num_group1,max_num_group2
i64,str,str,u32,str,str,u32,str,str,u32,i64,i64
0,,,1,,,1,,,1,,
1,,,1,,,1,,,1,,
2,,,1,"""EMPLOYMENT_PHO…","""PRIMARY_MOBILE…",2,,,1,1.0,1.0
3,,,1,"""PRIMARY_EMAIL""","""PHONE""",3,,,1,0.0,2.0
4,,,1,"""HOME_PHONE""","""PRIMARY_MOBILE…",2,,,1,0.0,1.0
5,,,1,"""PRIMARY_MOBILE…","""PRIMARY_MOBILE…",1,,,1,0.0,0.0
6,,,1,"""EMPLOYMENT_PHO…","""PRIMARY_MOBILE…",3,,,1,2.0,1.0
7,,,1,,,1,,,1,,
8,,,1,,,1,,,1,,
9,,,1,,,1,,,1,,


In [22]:
# Attach the per-case features to the base rows. Note that `data` is rebound
# here: from this point on it holds one row per case, not the detail rows.
data = data_base.join(fea_agg, on="case_id", how="left")

In [23]:
# Preview the base rows with the aggregated features attached.
data.head(10)

case_id,date_decision,MONTH,WEEK_NUM,target,IS_TRAIN,last_cacccardblochreas_147M,first_cacccardblochreas_147M,n_unique_cacccardblochreas_147M,last_conts_type_509L,first_conts_type_509L,n_unique_conts_type_509L,last_credacc_cards_status_52L,first_credacc_cards_status_52L,n_unique_credacc_cards_status_52L,max_num_group1,max_num_group2
i64,date,i64,i64,i64,i32,str,str,u32,str,str,u32,str,str,u32,i64,i64
0,2019-01-03,201901,0,0,1,,,1,,,1,,,1,,
1,2019-01-03,201901,0,0,1,,,1,,,1,,,1,,
2,2019-01-04,201901,0,0,1,,,1,"""EMPLOYMENT_PHO…","""PRIMARY_MOBILE…",2,,,1,1.0,1.0
3,2019-01-03,201901,0,0,1,,,1,"""PRIMARY_EMAIL""","""PHONE""",3,,,1,0.0,2.0
4,2019-01-04,201901,0,1,1,,,1,"""HOME_PHONE""","""PRIMARY_MOBILE…",2,,,1,0.0,1.0
5,2019-01-02,201901,0,0,1,,,1,"""PRIMARY_MOBILE…","""PRIMARY_MOBILE…",1,,,1,0.0,0.0
6,2019-01-03,201901,0,0,1,,,1,"""EMPLOYMENT_PHO…","""PRIMARY_MOBILE…",3,,,1,2.0,1.0
7,2019-01-03,201901,0,0,1,,,1,,,1,,,1,,
8,2019-01-03,201901,0,0,1,,,1,,,1,,,1,,
9,2019-01-03,201901,0,0,1,,,1,,,1,,,1,,


In [None]:
# def derived_features_between_cols(df, cols_list):
#     for item in cols_list:
#         print(item)
#         col1 = item[0]
#         col2 = item[1]
#
#         df = df.with_columns((pl.col(col2) - pl.col(col1)).alias(col2 + '_minus_' + col1))
#         df = df.with_columns(operator.truediv(pl.col(col2), pl.col(col1)).alias(col2 + '_div_' + col1))
#
#     return df
#
# data = derived_features_between_cols(data, derive_list)

In [24]:
def handle_date(df):
    """Replace Date columns ending in "D" by their offset from date_decision.

    Each qualifying column is overwritten, under the same name, with the
    number of whole days between its value and `date_decision`
    (column minus date_decision).

    Args:
        df: polars DataFrame containing a `date_decision` column.

    Returns:
        The DataFrame with the qualifying columns converted to day offsets.
        `date_decision` itself is kept.
    """
    for name in df.columns:
        if name[-1] != "D":
            continue
        if df[name].dtype != pl.Date:
            continue

        print(f'col [{name}] is date...')
        offset_days = (pl.col(name) - pl.col("date_decision")).dt.total_days()
        df = df.with_columns(offset_days.alias(name))

    # df = df.drop("date_decision", "MONTH")
    return df


def handle_category(df):
    """Encode categorical columns, fitting the encoders on training rows only.

    A column is encoded when any of the following holds:
      * its name ends in "M", does not contain 'WEEK_NUM' and does not start
        with 'n_unique';
      * its name ends in "L" or "P" and its dtype is String;
      * its dtype is Boolean (regardless of the name).

    Columns with at most 10 distinct values among the training rows are
    one-hot encoded (the indicator columns are appended and the source column
    dropped); all others are target encoded into `<col>_target_encoder` and
    the source column dropped.

    Args:
        df: polars DataFrame with `IS_TRAIN` and `target` columns.

    Returns:
        The DataFrame with the selected columns replaced by their encodings.
    """
    train_rows = df.filter(pl.col('IS_TRAIN') == 1)

    # `df.columns` is evaluated once, so columns added by the encoders below
    # are never revisited.
    for name in df.columns:
        is_m_code = (
            name[-1] == "M"
            and 'WEEK_NUM' not in name
            and not name.startswith('n_unique')
        )
        if not is_m_code:
            dtype = df[name].dtype
            is_text_lp = name[-1] in ("L", "P") and dtype == pl.String
            # NOTE(review): the original condition was `A and B or C` without
            # parentheses, so the Boolean test applies to every column, not
            # only to L/P ones. That evaluation is kept here; confirm intent.
            if not (is_text_lp or dtype == pl.Boolean):
                continue

        cardinality = train_rows[name].n_unique()
        if cardinality > 10:
            print(f'col [{name}] target encoded...')
            encoder = TargetEncoder(cols=[name], handle_unknown='value', handle_missing='value')
            encoder.fit(train_rows[name].to_pandas(), train_rows['target'].to_pandas())
            encoded = pl.from_pandas(encoder.transform(df[name].to_pandas())).to_series()
            df = df.with_columns(encoded.alias(name + '_target_encoder'))
        else:
            print(f'col [{name}] one hot encoded...')
            encoder = OneHotEncoder(cols=[name], handle_unknown='indicator', handle_missing='indicator', use_cat_names=True)
            encoder.fit(train_rows[name].to_pandas())
            indicators = pl.from_pandas(encoder.transform(df[name].to_pandas()))
            df = pl.concat([df, indicators], how="horizontal")

        df = df.drop(name)

    return df

In [25]:
# Encode the categorical feature columns (encoders are fit on training rows).
data = handle_category(data)

col [last_cacccardblochreas_147M] one hot encoded...
col [first_cacccardblochreas_147M] one hot encoded...
col [last_conts_type_509L] one hot encoded...
col [first_conts_type_509L] one hot encoded...
col [last_credacc_cards_status_52L] one hot encoded...
col [first_credacc_cards_status_52L] one hot encoded...


In [26]:
# Convert Date feature columns to day offsets from date_decision.
data = handle_date(data)

In [27]:
# (rows, columns) after categorical and date handling.
data.shape

(1526669, 67)

In [28]:
def filter_cols(df, null_threshold=0.95, min_positive=1000):
    """Drop feature columns that carry too little information in the training rows.

    Two passes are made over the non-key columns:
      1. A column whose null share among training rows exceeds
         `null_threshold` is dropped when fewer than `min_positive` training
         rows are both non-null in that column and have target == 1.
      2. A remaining String column with a single distinct value among the
         training rows is dropped.

    Args:
        df: polars DataFrame with `IS_TRAIN` and `target` columns.
        null_threshold: Null share above which a column becomes a drop candidate.
        min_positive: Minimum number of non-null positive training rows needed
            to keep such a candidate.

    Returns:
        The DataFrame without the dropped columns.
    """
    key_cols = ["case_id", "WEEK_NUM", "date_decision", "MONTH", "IS_TRAIN", "target"]
    dt_train = df.filter(pl.col('IS_TRAIN') == 1)

    for col in df.columns:
        if col in key_cols:
            continue

        # mean() of an empty series is None; treat that as "nothing to judge".
        isnull = dt_train[col].is_null().mean()
        if isnull is None or not isnull > null_threshold:
            continue

        print(f'col [{col}] to be null dropped...')
        # `&` binds tighter than `==`, so the comparison must be parenthesised;
        # without it the predicate is parsed as (is_not_null & target) == 1.
        is_positive = pl.col(col).is_not_null() & (pl.col("target") == 1)
        num_positive = dt_train.filter(is_positive).shape[0]
        if num_positive < min_positive:
            df = df.drop(col)
            print(f'col [{col}] null dropped...')

    for col in df.columns:
        if col not in key_cols and df[col].dtype == pl.String:
            # dt_train was taken before any drop, so it still holds `col`.
            freq = dt_train[col].n_unique()

            if freq == 1:
                df = df.drop(col)
                print(f'col [{col}] freq dropped...')

    return df

In [29]:
# Drop mostly-null and constant feature columns.
data = filter_cols(data)

In [30]:
# Preview the encoded and filtered feature table.
data.head(10)

case_id,date_decision,MONTH,WEEK_NUM,target,IS_TRAIN,n_unique_cacccardblochreas_147M,n_unique_conts_type_509L,n_unique_credacc_cards_status_52L,max_num_group1,max_num_group2,last_cacccardblochreas_147M_nan,last_cacccardblochreas_147M_a55475b1,last_cacccardblochreas_147M_P33_145_161,last_cacccardblochreas_147M_P201_63_60,last_cacccardblochreas_147M_P19_60_110,last_cacccardblochreas_147M_P133_119_56,last_cacccardblochreas_147M_P17_56_144,last_cacccardblochreas_147M_P41_107_150,last_cacccardblochreas_147M_-1,first_cacccardblochreas_147M_nan,first_cacccardblochreas_147M_a55475b1,first_cacccardblochreas_147M_P201_63_60,first_cacccardblochreas_147M_P19_60_110,first_cacccardblochreas_147M_P33_145_161,first_cacccardblochreas_147M_P133_119_56,first_cacccardblochreas_147M_P23_105_103,first_cacccardblochreas_147M_P17_56_144,first_cacccardblochreas_147M_P41_107_150,first_cacccardblochreas_147M_P127_74_114,first_cacccardblochreas_147M_-1,last_conts_type_509L_nan,last_conts_type_509L_EMPLOYMENT_PHONE,last_conts_type_509L_PRIMARY_EMAIL,last_conts_type_509L_HOME_PHONE,last_conts_type_509L_PRIMARY_MOBILE,last_conts_type_509L_PHONE,last_conts_type_509L_ALTERNATIVE_PHONE,last_conts_type_509L_WHATSAPP,last_conts_type_509L_SECONDARY_MOBILE,last_conts_type_509L_-1,first_conts_type_509L_nan,first_conts_type_509L_PRIMARY_MOBILE,first_conts_type_509L_PHONE,first_conts_type_509L_EMPLOYMENT_PHONE,first_conts_type_509L_HOME_PHONE,first_conts_type_509L_SECONDARY_MOBILE,first_conts_type_509L_PRIMARY_EMAIL,first_conts_type_509L_ALTERNATIVE_PHONE,first_conts_type_509L_WHATSAPP,first_conts_type_509L_-1,last_credacc_cards_status_52L_nan,last_credacc_cards_status_52L_CANCELLED,last_credacc_cards_status_52L_INACTIVE,last_credacc_cards_status_52L_ACTIVE,last_credacc_cards_status_52L_RENEWED,last_credacc_cards_status_52L_BLOCKED,last_credacc_cards_status_52L_UNCONFIRMED,last_credacc_cards_status_52L_-1,first_credacc_cards_status_52L_nan,first_credacc_cards_status_52L_CANCELLED,first_credacc_cards_status
_52L_ACTIVE,first_credacc_cards_status_52L_INACTIVE,first_credacc_cards_status_52L_BLOCKED,first_credacc_cards_status_52L_RENEWED,first_credacc_cards_status_52L_UNCONFIRMED,first_credacc_cards_status_52L_-1
i64,date,i64,i64,i64,i32,u32,u32,u32,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
0,2019-01-03,201901,0,0,1,1,1,1,,,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,2019-01-03,201901,0,0,1,1,1,1,,,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,2019-01-04,201901,0,0,1,1,2,1,1.0,1.0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,2019-01-03,201901,0,0,1,1,3,1,0.0,2.0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,2019-01-04,201901,0,1,1,1,2,1,0.0,1.0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
5,2019-01-02,201901,0,0,1,1,1,1,0.0,0.0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
6,2019-01-03,201901,0,0,1,1,3,1,2.0,1.0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
7,2019-01-03,201901,0,0,1,1,1,1,,,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
8,2019-01-03,201901,0,0,1,1,1,1,,,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
9,2019-01-03,201901,0,0,1,1,1,1,,,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [31]:
# Remove the base-table columns that are not features of this table;
# case_id and IS_TRAIN are kept.
data = data.drop(['date_decision', 'MONTH', 'WEEK_NUM', 'target'])

In [32]:
# (rows, columns) of the final feature table that is written out below.
data.shape

(1526669, 63)

In [33]:
# Output location. Set HOME_CREDIT_DATA_DIR to write somewhere other than the
# original local data folder.
preprocess_dir_path = os.path.join(
    os.environ.get(
        "HOME_CREDIT_DATA_DIR",
        "/Users/sophie/workspace/data/kaggle/home-credit-credit-risk-model-stability",
    ),
    "preprocess",
)
preprocess_file = 'applprev_2.parquet'
preprocess_file_path = Path(preprocess_dir_path) / preprocess_file
# write_parquet does not create missing folders, so make sure the target exists.
preprocess_file_path.parent.mkdir(parents=True, exist_ok=True)
data.write_parquet(preprocess_file_path)