In [1]:
import warnings
import os
from glob import glob
from pathlib import Path
import operator

import pandas as pd
import polars as pl
import polars.selectors as cs
from sklearn import preprocessing
from category_encoders import OneHotEncoder
from category_encoders.target_encoder import TargetEncoder

# Render pandas floats with six fixed decimals instead of scientific notation.
pd.set_option('display.float_format',lambda x : '%.6f' % x)
# NOTE(review): this silences every warning raised for the rest of the session,
# including deprecation warnings from the imported libraries.
warnings.filterwarnings('ignore')

In [2]:
def set_table_dtypes(df):
    """Cast columns to a dtype chosen from the column name.

    Rules, checked in this order for every column:
      * ``case_id``, ``WEEK_NUM``, ``num_group1``, ``num_group2``, ``target`` -> Int64
      * ``date_decision`` -> Date
      * name ending in ``P`` or ``A`` -> Float64
      * name ending in ``M`` -> String
      * name ending in ``D`` -> Date

    Columns matched by no rule keep their current dtype.

    Args:
        df: polars frame to convert.

    Returns:
        A frame with the same columns, cast as described above.
    """
    integer_names = ("case_id", "WEEK_NUM", "num_group1", "num_group2", "target")
    dtype_by_suffix = {
        "P": pl.Float64,
        "A": pl.Float64,
        "M": pl.String,
        "D": pl.Date,
    }

    casts = []
    for name in df.columns:
        if name in integer_names:
            wanted = pl.Int64
        elif name == "date_decision":
            wanted = pl.Date
        else:
            # Only the last character of the name decides the dtype here.
            wanted = dtype_by_suffix.get(name[-1])

        if wanted is not None:
            casts.append(pl.col(name).cast(wanted))

    # Each cast replaces the column of the same name, so applying them
    # together gives the same frame as applying them one at a time.
    return df.with_columns(casts)

In [3]:
def read_file(path, depth=None):
    """Read one parquet file and normalise its column dtypes.

    Args:
        path: location of the parquet file.
        depth: accepted but not used by this function.

    Returns:
        The file contents as a polars DataFrame passed through
        ``set_table_dtypes``.
    """
    return set_table_dtypes(pl.read_parquet(path))

def read_files(regex_path, depth=None):
    """Read every parquet file matching a glob pattern and stack them vertically.

    Args:
        regex_path: despite the name, a ``glob`` pattern (not a regular
            expression); it is converted with ``str()`` before matching.
        depth: accepted but not used by this function.

    Returns:
        One polars DataFrame holding the rows of all matching files, each
        read through ``read_file`` so dtypes are normalised the same way.

    Raises:
        ValueError: if no file matches the pattern.
    """
    # glob returns matches in arbitrary order; sorting makes the row order of
    # the result reproducible from run to run.
    paths = sorted(glob(str(regex_path)))
    if not paths:
        # pl.concat would also fail on an empty list, but without saying
        # which pattern matched nothing.
        raise ValueError(f"no parquet files match pattern: {regex_path}")

    chunks = [read_file(path) for path in paths]
    # "vertical_relaxed" lets columns whose dtypes differ between chunks be
    # coerced to a common supertype instead of raising.
    return pl.concat(chunks, how="vertical_relaxed")

In [4]:
# Root folder of the competition data. Set the HOME_CREDIT_DATA_DIR environment
# variable to run the notebook on another machine; the default keeps the
# original location.
data_root = os.environ.get(
    "HOME_CREDIT_DATA_DIR",
    "/Users/sophie/workspace/data/kaggle/home-credit-credit-risk-model-stability",
)
train_dir_path = os.path.join(data_root, "parquet_files", "train")
test_dir_path = os.path.join(data_root, "parquet_files", "test")

# Person (depth-1) tables processed by this notebook.
train_file = 'train_person_1.parquet'
test_file = 'test_person_1.parquet'
train_file_path = Path(os.path.join(train_dir_path, train_file))
test_file_path = Path(os.path.join(test_dir_path, test_file))

# Base tables, read below and used as the left side of the joins.
target_file = 'train_base.parquet'
target_file_path = Path(os.path.join(train_dir_path, target_file))
target_file_test = 'test_base.parquet'
target_file_test_path = Path(os.path.join(test_dir_path, target_file_test))

In [5]:
# Training base table, flagged with IS_TRAIN = 1 so the rows can be told apart
# after train and test are stacked.
df_base_train = read_file(target_file_path).with_columns(pl.lit(1).alias("IS_TRAIN"))
# Left join: every base case is kept, with one output row per matching person row.
df_train = df_base_train.join(read_file(train_file_path), on="case_id", how="left")

In [6]:
# Test base table: add an all-null `target` and IS_TRAIN = 0 so its columns
# line up with the training base table.
df_base_test = read_file(target_file_test_path).with_columns(
    target=None,
    IS_TRAIN=pl.lit(0),
)
# Left join: every base case is kept, with one output row per matching person row.
df_test = df_base_test.join(read_file(test_file_path), on="case_id", how="left")

In [7]:
# Stack the joined train and test rows; "vertical_relaxed" coerces differing
# column dtypes (e.g. the all-null test `target`) to a common supertype.
data = pl.concat([df_train, df_test], how="vertical_relaxed")

In [8]:
# Base-only frame (no person columns); it is the left side of the later join
# with the aggregated features.
data_base = pl.concat([df_base_train, df_base_test], how="vertical_relaxed")

In [28]:
# Drop the per-split joined frames; the cells below only use `data` and `data_base`.
del df_train, df_test

In [9]:
# (rows, columns) of the stacked row-level table.
data.shape

(2974008, 42)

In [14]:
# Preview the row-level table: one case_id spans several rows, numbered by num_group1.
data.head(100)

case_id,date_decision,MONTH,WEEK_NUM,target,IS_TRAIN,birth_259D,birthdate_87D,childnum_185L,contaddr_district_15M,contaddr_matchlist_1032L,contaddr_smempladdr_334L,contaddr_zipcode_807M,education_927M,empl_employedfrom_271D,empl_employedtotal_800L,empl_industry_691L,empladdr_district_926M,empladdr_zipcode_114M,familystate_447L,gender_992L,housetype_905L,housingtype_772L,incometype_1044T,isreference_387L,language1_981M,mainoccupationinc_384A,maritalst_703L,num_group1,personindex_1023L,persontype_1072L,persontype_792L,registaddr_district_1083M,registaddr_zipcode_184M,relationshiptoclient_415T,relationshiptoclient_642T,remitter_829L,role_1084L,role_993L,safeguarantyflag_411L,sex_738L,type_25L
i64,date,i64,i64,i64,i32,date,date,f64,str,bool,bool,str,str,date,str,str,str,str,str,str,str,str,str,bool,str,f64,str,i64,f64,f64,f64,str,str,str,str,bool,str,str,bool,str,str
0,2019-01-03,201901,0,0,1,1986-07-01,,,"""P88_18_84""",false,false,"""P167_100_165""","""P97_36_170""",2017-09-15,"""MORE_FIVE""","""OTHER""","""P142_57_166""","""P167_100_165""","""MARRIED""",,,,"""SALARIED_GOVT""",,"""P10_39_147""",10800.0,,0,0.0,1.0,1.0,"""P88_18_84""","""P167_100_165""",,,,"""CL""",,true,"""F""","""PRIMARY_MOBILE…"
0,2019-01-03,201901,0,0,1,,,,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,"""a55475b1""","""a55475b1""",,,,,,,"""a55475b1""",,,1,1.0,1.0,4.0,"""a55475b1""","""a55475b1""","""SPOUSE""",,false,"""EM""",,,,"""PHONE"""
0,2019-01-03,201901,0,0,1,,,,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,"""a55475b1""","""a55475b1""",,,,,,,"""a55475b1""",,,2,2.0,4.0,5.0,"""a55475b1""","""a55475b1""","""COLLEAGUE""","""SPOUSE""",false,"""PE""",,,,"""PHONE"""
0,2019-01-03,201901,0,0,1,,,,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,"""a55475b1""","""a55475b1""",,,,,,,"""a55475b1""",,,3,,5.0,,"""a55475b1""","""a55475b1""",,"""COLLEAGUE""",,"""PE""",,,,"""PHONE"""
1,2019-01-03,201901,0,0,1,1957-08-01,,,"""P103_93_94""",false,false,"""P176_37_166""","""P97_36_170""",2008-10-29,"""MORE_FIVE""","""OTHER""","""P49_46_174""","""P160_59_140""","""DIVORCED""",,,,"""SALARIED_GOVT""",,"""P10_39_147""",10000.0,,0,0.0,1.0,1.0,"""P103_93_94""","""P176_37_166""",,,,"""CL""",,true,"""M""","""PRIMARY_MOBILE…"
1,2019-01-03,201901,0,0,1,,,,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,"""a55475b1""","""a55475b1""",,,,,,,"""a55475b1""",,,1,1.0,1.0,5.0,"""a55475b1""","""a55475b1""","""SIBLING""",,false,"""CL""",,,,"""PRIMARY_EMAIL"""
1,2019-01-03,201901,0,0,1,,,,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,"""a55475b1""","""a55475b1""",,,,,,,"""a55475b1""",,,2,2.0,1.0,5.0,"""a55475b1""","""a55475b1""","""OTHER_RELATIVE…",,false,"""EM""",,,,"""PHONE"""
1,2019-01-03,201901,0,0,1,,,,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,"""a55475b1""","""a55475b1""",,,,,,,"""a55475b1""",,,3,,5.0,,"""a55475b1""","""a55475b1""",,"""SIBLING""",,"""PE""",,,,"""PHONE"""
1,2019-01-03,201901,0,0,1,,,,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,"""a55475b1""","""a55475b1""",,,,,,,"""a55475b1""",,,4,,5.0,,"""a55475b1""","""a55475b1""",,"""OTHER_RELATIVE…",,"""PE""",,,,"""PHONE"""
2,2019-01-04,201901,0,0,1,1974-12-01,,,"""P91_125_184""",false,false,"""P161_5_97""","""P97_36_170""",2010-02-15,"""MORE_FIVE""","""OTHER""","""P63_14_24""","""P161_5_97""","""MARRIED""",,,,"""EMPLOYED""",,"""P10_39_147""",14000.0,,0,0.0,1.0,1.0,"""P91_125_184""","""P161_5_97""",,,,"""EM""",,true,"""F""","""PRIMARY_MOBILE…"


In [11]:
# Number of distinct cases across train and test.
data['case_id'].n_unique()

1526669

In [13]:
# Count the rows whose `num_group1` is null — presumably base cases the left
# join could not match to any person row (TODO confirm).
data.filter(pl.col('num_group1').is_null()).shape

(7, 42)

In [15]:
# Order rows by case and then by num_group1, so the first/last aggregations
# below pick values in num_group1 order.
data = data.sort('case_id', 'num_group1')

In [18]:
class Aggregator:
    """Builds the aggregation expressions used to collapse a table to one row per case.

    Each method looks at ``df.columns`` (``other_expr`` also looks at dtypes)
    and returns a list of polars expressions intended for
    ``df.group_by("case_id").agg(...)``. Columns are routed by the last
    character of their name:

      * ``P`` / ``A`` -> ``num_expr``
      * ``D``         -> ``date_expr``
      * ``M``         -> ``str_expr``
      * ``T`` / ``L`` -> ``other_expr``
      * any name containing ``num_group`` -> ``count_expr``

    ``first`` / ``last`` depend on the row order of the frame being
    aggregated, so sort it before aggregating. They return whatever sits in
    the first / last row of the group, including null.
    """

    @staticmethod
    def num_expr(df):
        """Return var / last / first expressions for columns ending in ``P`` or ``A``."""
        cols = [col for col in df.columns if col[-1] in ("P", "A")]

        # max / min / mean / count aggregations are currently disabled here.
        expr_num = []
        expr_num.extend([pl.var(col).alias(f"var_{col}") for col in cols])
        expr_num.extend([pl.last(col).alias(f"last_{col}") for col in cols])
        expr_num.extend([pl.first(col).alias(f"first_{col}") for col in cols])

        return expr_num

    @staticmethod
    def date_expr(df):
        """Return max / min expressions for columns ending in ``D``."""
        cols = [col for col in df.columns if col[-1] in ("D",)]

        expr_date = []
        expr_date.extend([pl.max(col).alias(f"max_{col}") for col in cols])
        expr_date.extend([pl.min(col).alias(f"min_{col}") for col in cols])

        return expr_date

    @staticmethod
    def str_expr(df):
        """Return last / first / n_unique expressions for columns ending in ``M``.

        ``WEEK_NUM`` also ends in ``M`` but is not a feature, so it is
        excluded. Excluding it in the filter (instead of ``list.remove``)
        keeps this working on frames that have no ``WEEK_NUM`` column.
        """
        cols = [
            col for col in df.columns
            if col[-1] in ("M",) and col != 'WEEK_NUM'
        ]

        expr_str = []
        expr_str.extend([pl.last(col).alias(f"last_{col}") for col in cols])
        expr_str.extend([pl.first(col).alias(f"first_{col}") for col in cols])
        expr_str.extend([pl.n_unique(col).alias(f"n_unique_{col}") for col in cols])

        return expr_str

    @staticmethod
    def other_expr(df):
        """Return dtype-dependent expressions for columns ending in ``T`` or ``L``.

        String / Boolean columns get last, first and n_unique; Int64 / Float64
        columns get mean, var, last and first. Columns of any other dtype
        produce no expression.
        """
        cols = [col for col in df.columns if col[-1] in ("T", "L")]
        expr_other = []
        for col in cols:
            dtype = df[col].dtype
            if dtype == pl.String or dtype == pl.Boolean:
                expr_other.append(pl.last(col).alias(f"last_{col}"))
                expr_other.append(pl.first(col).alias(f"first_{col}"))
                expr_other.append(pl.n_unique(col).alias(f"n_unique_{col}"))
            elif dtype == pl.Int64 or dtype == pl.Float64:
                expr_other.append(pl.mean(col).alias(f"mean_{col}"))
                expr_other.append(pl.var(col).alias(f"var_{col}"))
                expr_other.append(pl.last(col).alias(f"last_{col}"))
                expr_other.append(pl.first(col).alias(f"first_{col}"))

        return expr_other

    @staticmethod
    def count_expr(df):
        """Return a max expression for every column whose name contains ``num_group``."""
        cols = [col for col in df.columns if "num_group" in col]

        expr_count = [pl.max(col).alias(f"max_{col}") for col in cols]

        return expr_count

    @staticmethod
    def get_exprs(df):
        """Return all expressions: numeric, date, string, other, count — in that order.

        The order of the returned list determines the column order of the
        aggregated frame.
        """
        exprs = (
            Aggregator.num_expr(df)
            + Aggregator.date_expr(df)
            + Aggregator.str_expr(df)
            + Aggregator.other_expr(df)
            + Aggregator.count_expr(df)
        )

        return exprs

In [19]:
# Collapse the row-level table to one row per case_id using the expressions
# built from the table's own column names and dtypes.
fea_agg = data.group_by("case_id").agg(Aggregator.get_exprs(data))

In [20]:
# (rows, columns) of the aggregated table; rows should equal the number of distinct case_id values.
fea_agg.shape

(1526669, 108)

In [21]:
# Preview the aggregated features.
fea_agg.head(20)

case_id,var_mainoccupationinc_384A,last_mainoccupationinc_384A,first_mainoccupationinc_384A,max_birth_259D,max_birthdate_87D,max_empl_employedfrom_271D,min_birth_259D,min_birthdate_87D,min_empl_employedfrom_271D,last_contaddr_district_15M,last_contaddr_zipcode_807M,last_education_927M,last_empladdr_district_926M,last_empladdr_zipcode_114M,last_language1_981M,last_registaddr_district_1083M,last_registaddr_zipcode_184M,first_contaddr_district_15M,first_contaddr_zipcode_807M,first_education_927M,first_empladdr_district_926M,first_empladdr_zipcode_114M,first_language1_981M,first_registaddr_district_1083M,first_registaddr_zipcode_184M,n_unique_contaddr_district_15M,n_unique_contaddr_zipcode_807M,n_unique_education_927M,n_unique_empladdr_district_926M,n_unique_empladdr_zipcode_114M,n_unique_language1_981M,n_unique_registaddr_district_1083M,n_unique_registaddr_zipcode_184M,mean_childnum_185L,var_childnum_185L,last_childnum_185L,…,mean_personindex_1023L,var_personindex_1023L,last_personindex_1023L,first_personindex_1023L,mean_persontype_1072L,var_persontype_1072L,last_persontype_1072L,first_persontype_1072L,mean_persontype_792L,var_persontype_792L,last_persontype_792L,first_persontype_792L,last_relationshiptoclient_415T,first_relationshiptoclient_415T,n_unique_relationshiptoclient_415T,last_relationshiptoclient_642T,first_relationshiptoclient_642T,n_unique_relationshiptoclient_642T,last_remitter_829L,first_remitter_829L,n_unique_remitter_829L,last_role_1084L,first_role_1084L,n_unique_role_1084L,last_role_993L,first_role_993L,n_unique_role_993L,last_safeguarantyflag_411L,first_safeguarantyflag_411L,n_unique_safeguarantyflag_411L,last_sex_738L,first_sex_738L,n_unique_sex_738L,last_type_25L,first_type_25L,n_unique_type_25L,max_num_group1
i64,f64,f64,f64,date,date,date,date,date,date,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,u32,u32,u32,u32,u32,u32,u32,u32,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,u32,str,str,u32,bool,bool,u32,str,str,u32,str,str,u32,bool,bool,u32,str,str,u32,str,str,u32,i64
0,,,10800.0,1986-07-01,,2017-09-15,1986-07-01,,2017-09-15,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""P88_18_84""","""P167_100_165""","""P97_36_170""","""P142_57_166""","""P167_100_165""","""P10_39_147""","""P88_18_84""","""P167_100_165""",2,2,2,2,2,2,2,2,,,,…,1.0,1.0,,0.0,2.75,4.25,5.0,1.0,3.333333,4.333333,,1.0,,,3,"""COLLEAGUE""",,3,,,2,"""PE""","""CL""",3,,,1,,True,2,,"""F""",2,"""PHONE""","""PRIMARY_MOBILE…",2,3
1,,,10000.0,1957-08-01,,2008-10-29,1957-08-01,,2008-10-29,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""P103_93_94""","""P176_37_166""","""P97_36_170""","""P49_46_174""","""P160_59_140""","""P10_39_147""","""P103_93_94""","""P176_37_166""",2,2,2,2,2,2,2,2,,,,…,1.0,1.0,,0.0,2.6,4.8,5.0,1.0,3.666667,5.333333,,1.0,,,3,"""OTHER_RELATIVE…",,3,,,2,"""PE""","""CL""",3,,,1,,True,2,,"""M""",2,"""PHONE""","""PRIMARY_MOBILE…",3,4
2,,,14000.0,1974-12-01,,2010-02-15,1974-12-01,,2010-02-15,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""P91_125_184""","""P161_5_97""","""P97_36_170""","""P63_14_24""","""P161_5_97""","""P10_39_147""","""P91_125_184""","""P161_5_97""",2,2,2,2,2,2,2,2,,,,…,1.0,1.0,,0.0,2.4,3.8,4.0,1.0,3.333333,4.333333,,1.0,,,3,"""SPOUSE""",,3,,,2,"""PE""","""EM""",3,,,1,,True,2,,"""F""",2,"""PHONE""","""PRIMARY_MOBILE…",3,4
3,,,10000.0,1993-08-01,,2018-05-15,1993-08-01,,2018-05-15,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""P155_139_77""","""P161_14_174""","""P33_146_175""","""P131_33_167""","""P161_14_174""","""P10_39_147""","""P155_139_77""","""P161_14_174""",2,2,2,2,2,2,2,2,,,,…,0.5,0.5,,0.0,2.0,3.0,4.0,1.0,2.5,4.5,,1.0,,,2,"""SPOUSE""",,2,,,2,"""PE""","""CL""",3,,,1,,True,2,,"""F""",2,"""PHONE""","""PRIMARY_MOBILE…",2,2
4,,,24000.0,1994-01-01,,2014-12-15,1994-01-01,,2014-12-15,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""P194_82_174""","""P8_88_79""","""P33_146_175""","""P62_144_102""","""P8_88_79""","""P10_39_147""","""P194_82_174""","""P8_88_79""",2,2,2,2,2,2,2,2,,,,…,1.0,1.0,,0.0,3.0,5.333333,5.0,1.0,3.666667,5.333333,,1.0,,,3,"""SIBLING""",,3,,,2,"""PE""","""CL""",3,,,1,,True,2,,"""F""",2,"""PHONE""","""PRIMARY_MOBILE…",2,3
5,,,64000.0,1979-10-01,,2016-01-15,1979-10-01,,2016-01-15,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""P54_133_26""","""P66_157_171""","""P97_36_170""","""P54_133_26""","""P45_57_146""","""P209_127_106""","""P54_133_26""","""P66_157_171""",2,2,2,2,2,2,2,2,,,,…,0.5,0.5,,0.0,2.333333,5.333333,5.0,1.0,3.0,8.0,,1.0,,,2,"""FRIEND""",,2,,,2,"""PE""","""CL""",3,,,1,,True,2,,"""F""",2,"""PHONE""","""PRIMARY_MOBILE…",2,2
6,,,20000.0,1991-01-01,1991-01-01,2013-09-15,1991-01-01,1991-01-01,2013-09-15,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""P82_154_182""","""P164_28_170""","""P97_36_170""","""P82_154_182""","""P164_28_170""","""P10_39_147""","""P112_89_137""","""P10_68_40""",2,2,2,2,2,2,2,2,0.0,,,…,1.0,1.0,,0.0,3.0,5.333333,5.0,1.0,3.666667,5.333333,,1.0,,,2,"""SIBLING""",,2,,,2,"""PE""","""CL""",3,,"""FULL""",2,,True,2,,"""F""",2,"""PHONE""","""PRIMARY_MOBILE…",2,3
7,,,46000.0,1993-09-01,,2018-09-15,1993-09-01,,2018-09-15,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""P170_34_169""","""P44_33_176""","""P33_146_175""","""P54_133_26""","""P45_57_146""","""P209_127_106""","""P170_34_169""","""P44_33_176""",2,2,2,2,2,2,2,2,,,,…,1.0,1.0,,0.0,3.0,5.333333,5.0,1.0,3.666667,5.333333,,1.0,,,2,"""FRIEND""",,2,,,2,"""PE""","""EM""",3,,,1,,True,2,,"""F""",2,"""PHONE""","""PRIMARY_MOBILE…",2,3
8,,,90000.0,1982-11-01,,2016-05-15,1982-11-01,,2016-05-15,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""P93_57_167""","""P182_156_59""","""P97_36_170""","""P109_162_152""","""P45_25_38""","""P10_39_147""","""P93_57_167""","""P182_156_59""",2,2,2,2,2,2,2,2,,,,…,1.0,1.0,,0.0,3.0,5.333333,5.0,1.0,3.666667,5.333333,,1.0,,,2,"""FRIEND""",,2,,,2,"""PE""","""EM""",3,,,1,,True,2,,"""M""",2,"""PHONE""","""PRIMARY_MOBILE…",2,3
9,,,100000.0,1949-10-01,,,1949-10-01,,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""P181_116_184""","""P42_117_149""","""P33_146_175""","""a55475b1""","""a55475b1""","""P209_127_106""","""P181_116_184""","""P42_117_149""",2,2,2,1,1,2,2,2,,,,…,0.5,0.5,1.0,0.0,3.0,8.0,5.0,1.0,3.0,8.0,5.0,1.0,"""CHILD""",,2,"""CHILD""",,2,False,,2,"""PE""","""CL""",2,,,1,,True,2,,"""M""",2,"""PHONE""","""PRIMARY_MOBILE…",2,1


In [22]:
# Attach the aggregated features to the base table.
# NOTE: this rebinds `data` from the row-level table to the one-row-per-case
# feature table, so the sort/aggregate cells above cannot be re-run after this
# cell without rebuilding `data` first.
data = data_base.join(fea_agg, how="left", on="case_id")

In [23]:
# Preview the base table with the aggregated features attached.
data.head(10)

case_id,date_decision,MONTH,WEEK_NUM,target,IS_TRAIN,var_mainoccupationinc_384A,last_mainoccupationinc_384A,first_mainoccupationinc_384A,max_birth_259D,max_birthdate_87D,max_empl_employedfrom_271D,min_birth_259D,min_birthdate_87D,min_empl_employedfrom_271D,last_contaddr_district_15M,last_contaddr_zipcode_807M,last_education_927M,last_empladdr_district_926M,last_empladdr_zipcode_114M,last_language1_981M,last_registaddr_district_1083M,last_registaddr_zipcode_184M,first_contaddr_district_15M,first_contaddr_zipcode_807M,first_education_927M,first_empladdr_district_926M,first_empladdr_zipcode_114M,first_language1_981M,first_registaddr_district_1083M,first_registaddr_zipcode_184M,n_unique_contaddr_district_15M,n_unique_contaddr_zipcode_807M,n_unique_education_927M,n_unique_empladdr_district_926M,n_unique_empladdr_zipcode_114M,n_unique_language1_981M,…,mean_personindex_1023L,var_personindex_1023L,last_personindex_1023L,first_personindex_1023L,mean_persontype_1072L,var_persontype_1072L,last_persontype_1072L,first_persontype_1072L,mean_persontype_792L,var_persontype_792L,last_persontype_792L,first_persontype_792L,last_relationshiptoclient_415T,first_relationshiptoclient_415T,n_unique_relationshiptoclient_415T,last_relationshiptoclient_642T,first_relationshiptoclient_642T,n_unique_relationshiptoclient_642T,last_remitter_829L,first_remitter_829L,n_unique_remitter_829L,last_role_1084L,first_role_1084L,n_unique_role_1084L,last_role_993L,first_role_993L,n_unique_role_993L,last_safeguarantyflag_411L,first_safeguarantyflag_411L,n_unique_safeguarantyflag_411L,last_sex_738L,first_sex_738L,n_unique_sex_738L,last_type_25L,first_type_25L,n_unique_type_25L,max_num_group1
i64,date,i64,i64,i64,i32,f64,f64,f64,date,date,date,date,date,date,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,u32,u32,u32,u32,u32,u32,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,u32,str,str,u32,bool,bool,u32,str,str,u32,str,str,u32,bool,bool,u32,str,str,u32,str,str,u32,i64
0,2019-01-03,201901,0,0,1,,,10800.0,1986-07-01,,2017-09-15,1986-07-01,,2017-09-15,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""P88_18_84""","""P167_100_165""","""P97_36_170""","""P142_57_166""","""P167_100_165""","""P10_39_147""","""P88_18_84""","""P167_100_165""",2,2,2,2,2,2,…,1.0,1.0,,0.0,2.75,4.25,5.0,1.0,3.333333,4.333333,,1.0,,,3,"""COLLEAGUE""",,3,,,2,"""PE""","""CL""",3,,,1,,True,2,,"""F""",2,"""PHONE""","""PRIMARY_MOBILE…",2,3
1,2019-01-03,201901,0,0,1,,,10000.0,1957-08-01,,2008-10-29,1957-08-01,,2008-10-29,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""P103_93_94""","""P176_37_166""","""P97_36_170""","""P49_46_174""","""P160_59_140""","""P10_39_147""","""P103_93_94""","""P176_37_166""",2,2,2,2,2,2,…,1.0,1.0,,0.0,2.6,4.8,5.0,1.0,3.666667,5.333333,,1.0,,,3,"""OTHER_RELATIVE…",,3,,,2,"""PE""","""CL""",3,,,1,,True,2,,"""M""",2,"""PHONE""","""PRIMARY_MOBILE…",3,4
2,2019-01-04,201901,0,0,1,,,14000.0,1974-12-01,,2010-02-15,1974-12-01,,2010-02-15,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""P91_125_184""","""P161_5_97""","""P97_36_170""","""P63_14_24""","""P161_5_97""","""P10_39_147""","""P91_125_184""","""P161_5_97""",2,2,2,2,2,2,…,1.0,1.0,,0.0,2.4,3.8,4.0,1.0,3.333333,4.333333,,1.0,,,3,"""SPOUSE""",,3,,,2,"""PE""","""EM""",3,,,1,,True,2,,"""F""",2,"""PHONE""","""PRIMARY_MOBILE…",3,4
3,2019-01-03,201901,0,0,1,,,10000.0,1993-08-01,,2018-05-15,1993-08-01,,2018-05-15,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""P155_139_77""","""P161_14_174""","""P33_146_175""","""P131_33_167""","""P161_14_174""","""P10_39_147""","""P155_139_77""","""P161_14_174""",2,2,2,2,2,2,…,0.5,0.5,,0.0,2.0,3.0,4.0,1.0,2.5,4.5,,1.0,,,2,"""SPOUSE""",,2,,,2,"""PE""","""CL""",3,,,1,,True,2,,"""F""",2,"""PHONE""","""PRIMARY_MOBILE…",2,2
4,2019-01-04,201901,0,1,1,,,24000.0,1994-01-01,,2014-12-15,1994-01-01,,2014-12-15,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""P194_82_174""","""P8_88_79""","""P33_146_175""","""P62_144_102""","""P8_88_79""","""P10_39_147""","""P194_82_174""","""P8_88_79""",2,2,2,2,2,2,…,1.0,1.0,,0.0,3.0,5.333333,5.0,1.0,3.666667,5.333333,,1.0,,,3,"""SIBLING""",,3,,,2,"""PE""","""CL""",3,,,1,,True,2,,"""F""",2,"""PHONE""","""PRIMARY_MOBILE…",2,3
5,2019-01-02,201901,0,0,1,,,64000.0,1979-10-01,,2016-01-15,1979-10-01,,2016-01-15,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""P54_133_26""","""P66_157_171""","""P97_36_170""","""P54_133_26""","""P45_57_146""","""P209_127_106""","""P54_133_26""","""P66_157_171""",2,2,2,2,2,2,…,0.5,0.5,,0.0,2.333333,5.333333,5.0,1.0,3.0,8.0,,1.0,,,2,"""FRIEND""",,2,,,2,"""PE""","""CL""",3,,,1,,True,2,,"""F""",2,"""PHONE""","""PRIMARY_MOBILE…",2,2
6,2019-01-03,201901,0,0,1,,,20000.0,1991-01-01,1991-01-01,2013-09-15,1991-01-01,1991-01-01,2013-09-15,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""P82_154_182""","""P164_28_170""","""P97_36_170""","""P82_154_182""","""P164_28_170""","""P10_39_147""","""P112_89_137""","""P10_68_40""",2,2,2,2,2,2,…,1.0,1.0,,0.0,3.0,5.333333,5.0,1.0,3.666667,5.333333,,1.0,,,2,"""SIBLING""",,2,,,2,"""PE""","""CL""",3,,"""FULL""",2,,True,2,,"""F""",2,"""PHONE""","""PRIMARY_MOBILE…",2,3
7,2019-01-03,201901,0,0,1,,,46000.0,1993-09-01,,2018-09-15,1993-09-01,,2018-09-15,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""P170_34_169""","""P44_33_176""","""P33_146_175""","""P54_133_26""","""P45_57_146""","""P209_127_106""","""P170_34_169""","""P44_33_176""",2,2,2,2,2,2,…,1.0,1.0,,0.0,3.0,5.333333,5.0,1.0,3.666667,5.333333,,1.0,,,2,"""FRIEND""",,2,,,2,"""PE""","""EM""",3,,,1,,True,2,,"""F""",2,"""PHONE""","""PRIMARY_MOBILE…",2,3
8,2019-01-03,201901,0,0,1,,,90000.0,1982-11-01,,2016-05-15,1982-11-01,,2016-05-15,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""P93_57_167""","""P182_156_59""","""P97_36_170""","""P109_162_152""","""P45_25_38""","""P10_39_147""","""P93_57_167""","""P182_156_59""",2,2,2,2,2,2,…,1.0,1.0,,0.0,3.0,5.333333,5.0,1.0,3.666667,5.333333,,1.0,,,2,"""FRIEND""",,2,,,2,"""PE""","""EM""",3,,,1,,True,2,,"""M""",2,"""PHONE""","""PRIMARY_MOBILE…",2,3
9,2019-01-03,201901,0,0,1,,,100000.0,1949-10-01,,,1949-10-01,,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""P181_116_184""","""P42_117_149""","""P33_146_175""","""a55475b1""","""a55475b1""","""P209_127_106""","""P181_116_184""","""P42_117_149""",2,2,2,1,1,2,…,0.5,0.5,1.0,0.0,3.0,8.0,5.0,1.0,3.0,8.0,5.0,1.0,"""CHILD""",,2,"""CHILD""",,2,False,,2,"""PE""","""CL""",2,,,1,,True,2,,"""M""",2,"""PHONE""","""PRIMARY_MOBILE…",2,1


In [None]:
# def derived_features_between_cols(df, cols_list):
#     for item in cols_list:
#         print(item)
#         col1 = item[0]
#         col2 = item[1]
#
#         df = df.with_columns((pl.col(col2) - pl.col(col1)).alias(col2 + '_minus_' + col1))
#         df = df.with_columns(operator.truediv(pl.col(col2), pl.col(col1)).alias(col2 + '_div_' + col1))
#
#     return df
#
# data = derived_features_between_cols(data, derive_list)

In [24]:
def handle_date(df):
    """Replace date features with their signed day offset from ``date_decision``.

    A column is converted when its name ends in ``D`` and its dtype is
    ``pl.Date``. The new value is ``column - date_decision`` in whole days, so
    dates before the decision become negative numbers. ``date_decision``
    itself is left in the frame.

    Args:
        df: polars frame containing a ``date_decision`` column.

    Returns:
        The frame with each matching date column replaced by an integer day count.
    """
    for name in df.columns:
        if name[-1] not in ("D",) or df[name].dtype != pl.Date:
            continue

        print(f'col [{name}] is date...')
        # The result keeps the name of the left-most column, so it overwrites `name`.
        days_from_decision = (pl.col(name) - pl.col("date_decision")).dt.total_days()
        df = df.with_columns(days_from_decision)

    return df


def handle_category(df):
    """Encode categorical feature columns with encoders fitted on training rows only.

    A column is treated as categorical when either
      * its name ends in ``M``, does not contain ``WEEK_NUM`` and does not
        start with ``n_unique``; or
      * its name ends in ``L`` or ``P`` and its dtype is String or Boolean.

    Categorical columns with at most 10 distinct values in the training rows
    are one-hot encoded (with indicator columns for unknown and missing
    values); the others are target encoded against ``target`` into a new
    ``<col>_target_encoder`` column. The original column is dropped in both
    cases.

    Args:
        df: polars frame containing ``IS_TRAIN`` and ``target`` columns.

    Returns:
        The frame with categorical columns replaced by their encodings.
    """
    dt_train = df.filter(pl.col('IS_TRAIN') == 1)
    # Snapshot of the names: df gains and loses columns inside the loop.
    cols_list = df.columns
    for col in cols_list:
        is_m_feature = (
            col[-1] in ("M",)
            and 'WEEK_NUM' not in col
            and not col.startswith('n_unique')
        )
        # The dtype test is parenthesised so it only applies together with the
        # suffix test. Previously `A and B or C` let any Boolean column through
        # regardless of its name.
        # NOTE(review): String columns ending in 'T' are not matched here and
        # stay unencoded; 'P' columns are cast to Float64 by set_table_dtypes,
        # so the 'P' entry may have been meant to be 'T' — confirm.
        is_typed_categorical = (
            col[-1] in ("L", "P")
            and (df[col].dtype == pl.String or df[col].dtype == pl.Boolean)
        )
        if not (is_m_feature or is_typed_categorical):
            continue

        # Cardinality is measured on training rows only.
        freq = dt_train[col].n_unique()
        if freq <= 10:
            print(f'col [{col}] one hot encoded...')
            encoder = OneHotEncoder(cols=[col], handle_unknown='indicator', handle_missing='indicator', use_cat_names=True)
            encoder.fit(dt_train[col].to_pandas())
            res_df = pl.from_pandas(encoder.transform(df[col].to_pandas()))
            df = pl.concat([df, res_df], how="horizontal")
            df = df.drop(col)
        else:
            print(f'col [{col}] target encoded...')
            encoder = TargetEncoder(cols=[col], handle_unknown='value', handle_missing='value') \
                        .fit(dt_train[col].to_pandas(), dt_train['target'].to_pandas())
            df = df.with_columns(pl.from_pandas(encoder.transform(df[col].to_pandas())).to_series().alias(col + '_target_encoder'))
            df = df.drop(col)

    return df

In [25]:
# Encode the categorical features (one-hot or target encoding, fitted on training rows).
data = data.pipe(handle_category)

col [last_contaddr_district_15M] target encoded...
col [last_contaddr_zipcode_807M] target encoded...
col [last_education_927M] one hot encoded...
col [last_empladdr_district_926M] target encoded...
col [last_empladdr_zipcode_114M] target encoded...
col [last_language1_981M] one hot encoded...
col [last_registaddr_district_1083M] target encoded...
col [last_registaddr_zipcode_184M] target encoded...
col [first_contaddr_district_15M] target encoded...
col [first_contaddr_zipcode_807M] target encoded...
col [first_education_927M] one hot encoded...
col [first_empladdr_district_926M] target encoded...
col [first_empladdr_zipcode_114M] target encoded...
col [first_language1_981M] one hot encoded...
col [first_registaddr_district_1083M] target encoded...
col [first_registaddr_zipcode_184M] target encoded...
col [last_contaddr_matchlist_1032L] one hot encoded...
col [first_contaddr_matchlist_1032L] one hot encoded...
col [last_contaddr_smempladdr_334L] one hot encoded...
col [first_contaddr_

In [26]:
# Turn the aggregated date features into day offsets from date_decision.
data = data.pipe(handle_date)

col [max_birth_259D] is date...
col [max_birthdate_87D] is date...
col [max_empl_employedfrom_271D] is date...
col [min_birth_259D] is date...
col [min_birthdate_87D] is date...
col [min_empl_employedfrom_271D] is date...


In [27]:
# (rows, columns) after encoding; one-hot encoding widens the frame.
data.shape

(1526669, 254)

In [28]:
def filter_cols(df):
    """Drop feature columns that carry almost no information in the training rows.

    Two passes, both using statistics computed on rows with ``IS_TRAIN == 1``:

    1. A column that is more than 95% null is dropped when fewer than 1000
       training rows have both a non-null value and ``target == 1``.
    2. A String column with a single distinct value is dropped.

    ``case_id``, ``WEEK_NUM``, ``date_decision``, ``MONTH``, ``IS_TRAIN`` and
    ``target`` are never dropped.

    Args:
        df: polars frame containing ``IS_TRAIN`` and ``target`` columns.

    Returns:
        The frame without the dropped columns.
    """
    meta_cols = ["case_id", "WEEK_NUM", "date_decision", "MONTH", "IS_TRAIN", "target"]
    # Snapshot taken before any drop, so it still holds every column.
    dt_train = df.filter(pl.col('IS_TRAIN') == 1)

    for col in df.columns:
        if col in meta_cols:
            continue

        isnull = dt_train[col].is_null().mean()
        # The mean is None when there are no training rows; skip rather than
        # fail on the comparison.
        if isnull is None or isnull <= 0.95:
            continue

        print(f'col [{col}] to be null dropped...')
        # `&` binds tighter than `==` in Python, so the comparison needs its
        # own parentheses to mean "non-null AND target == 1".
        num_positive = dt_train.filter(
            pl.col(col).is_not_null() & (pl.col("target") == 1)
        ).shape[0]
        if num_positive < 1000:
            df = df.drop(col)
            print(f'col [{col}] null dropped...')

    for col in df.columns:
        if col not in meta_cols and df[col].dtype == pl.String:
            freq = dt_train[col].n_unique()

            if freq == 1:
                df = df.drop(col)
                print(f'col [{col}] freq dropped...')

    return df

In [29]:
# Remove mostly-null and constant feature columns.
data = data.pipe(filter_cols)

col [max_birthdate_87D] to be null dropped...
col [max_birthdate_87D] null dropped...
col [min_birthdate_87D] to be null dropped...
col [min_birthdate_87D] null dropped...
col [mean_childnum_185L] to be null dropped...
col [mean_childnum_185L] null dropped...
col [last_childnum_185L] to be null dropped...
col [last_childnum_185L] null dropped...
col [first_childnum_185L] to be null dropped...
col [first_childnum_185L] null dropped...
col [first_relationshiptoclient_415T] to be null dropped...
col [first_relationshiptoclient_415T] null dropped...
col [first_relationshiptoclient_642T] to be null dropped...
col [first_relationshiptoclient_642T] null dropped...


In [33]:
# Preview the final feature table.
# NOTE(review): the saved output of this cell no longer shows date_decision /
# MONTH / WEEK_NUM / target, and its execution count (33) is higher than that
# of the drop cell below (31) — it was run out of order. Restart and run all
# to regenerate consistent outputs.
data.head(10)

case_id,IS_TRAIN,var_mainoccupationinc_384A,last_mainoccupationinc_384A,first_mainoccupationinc_384A,max_birth_259D,max_empl_employedfrom_271D,min_birth_259D,min_empl_employedfrom_271D,n_unique_contaddr_district_15M,n_unique_contaddr_zipcode_807M,n_unique_education_927M,n_unique_empladdr_district_926M,n_unique_empladdr_zipcode_114M,n_unique_language1_981M,n_unique_registaddr_district_1083M,n_unique_registaddr_zipcode_184M,var_childnum_185L,n_unique_contaddr_matchlist_1032L,n_unique_contaddr_smempladdr_334L,n_unique_empl_employedtotal_800L,n_unique_empl_industry_691L,n_unique_familystate_447L,n_unique_gender_992L,n_unique_housetype_905L,n_unique_housingtype_772L,last_incometype_1044T,first_incometype_1044T,n_unique_incometype_1044T,n_unique_isreference_387L,n_unique_maritalst_703L,mean_personindex_1023L,var_personindex_1023L,last_personindex_1023L,first_personindex_1023L,mean_persontype_1072L,var_persontype_1072L,…,last_role_993L_-1,first_role_993L_nan,first_role_993L_FULL,first_role_993L_-1,last_safeguarantyflag_411L_nan,last_safeguarantyflag_411L_True,last_safeguarantyflag_411L_False,last_safeguarantyflag_411L_-1,first_safeguarantyflag_411L_True,first_safeguarantyflag_411L_False,first_safeguarantyflag_411L_nan,first_safeguarantyflag_411L_-1,last_sex_738L_nan,last_sex_738L_F,last_sex_738L_M,last_sex_738L_-1,first_sex_738L_F,first_sex_738L_M,first_sex_738L_nan,first_sex_738L_-1,last_type_25L_PHONE,last_type_25L_PRIMARY_MOBILE,last_type_25L_HOME_PHONE,last_type_25L_nan,last_type_25L_ALTERNATIVE_PHONE,last_type_25L_SECONDARY_MOBILE,last_type_25L_PRIMARY_EMAIL,last_type_25L_TWITTER,last_type_25L_-1,first_type_25L_PRIMARY_MOBILE,first_type_25L_PHONE,first_type_25L_SECONDARY_MOBILE,first_type_25L_HOME_PHONE,first_type_25L_WHATSAPP,first_type_25L_PRIMARY_EMAIL,first_type_25L_nan,first_type_25L_-1
i64,i32,f64,f64,f64,i64,i64,i64,i64,u32,u32,u32,u32,u32,u32,u32,u32,f64,u32,u32,u32,u32,u32,u32,u32,u32,str,str,u32,u32,u32,f64,f64,f64,f64,f64,f64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
0,1,,,10800.0,-11874,-475.0,-11874,-475.0,2,2,2,2,2,2,2,2,,2,2,2,2,2,1,1,1,,"""SALARIED_GOVT""",2,1,1,1.0,1.0,,0.0,2.75,4.25,…,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,1,,,10000.0,-22435,-3718.0,-22435,-3718.0,2,2,2,2,2,2,2,2,,2,2,2,2,2,1,1,1,,"""SALARIED_GOVT""",2,1,1,1.0,1.0,,0.0,2.6,4.8,…,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,1,,,14000.0,-16105,-3245.0,-16105,-3245.0,2,2,2,2,2,2,2,2,,2,2,2,2,2,1,1,1,,"""EMPLOYED""",2,1,1,1.0,1.0,,0.0,2.4,3.8,…,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,1,,,10000.0,-9286,-233.0,-9286,-233.0,2,2,2,2,2,2,2,2,,2,2,2,2,2,1,1,1,,"""EMPLOYED""",2,1,1,0.5,0.5,,0.0,2.0,3.0,…,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,1,,,24000.0,-9134,-1481.0,-9134,-1481.0,2,2,2,2,2,2,2,2,,2,2,2,2,2,1,1,1,,"""EMPLOYED""",2,1,1,1.0,1.0,,0.0,3.0,5.333333,…,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
5,1,,,64000.0,-14338,-1083.0,-14338,-1083.0,2,2,2,2,2,2,2,2,,2,2,2,2,2,1,1,1,,"""PRIVATE_SECTOR…",2,1,1,0.5,0.5,,0.0,2.333333,5.333333,…,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
6,1,,,20000.0,-10229,-1936.0,-10229,-1936.0,2,2,2,2,2,2,2,2,,2,2,2,2,2,2,1,2,,"""EMPLOYED""",2,3,2,1.0,1.0,,0.0,3.0,5.333333,…,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
7,1,,,46000.0,-9255,-110.0,-9255,-110.0,2,2,2,2,2,2,2,2,,2,2,2,2,2,1,1,1,,"""SALARIED_GOVT""",2,1,1,1.0,1.0,,0.0,3.0,5.333333,…,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
8,1,,,90000.0,-13212,-963.0,-13212,-963.0,2,2,2,2,2,2,2,2,,2,2,2,2,2,1,1,1,,"""EMPLOYED""",2,1,1,1.0,1.0,,0.0,3.0,5.333333,…,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
9,1,,,100000.0,-25296,,-25296,,2,2,2,1,1,2,2,2,,2,2,1,1,2,1,1,1,,"""RETIRED_PENSIO…",2,1,1,0.5,0.5,1.0,0.0,3.0,8.0,…,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [31]:
# Drop the base-table metadata and the label; case_id and IS_TRAIN stay in the
# frame that is written out below.
data = data.drop('date_decision', 'MONTH', 'WEEK_NUM', 'target')

In [32]:
# (rows, columns) of the frame that is about to be written.
data.shape

(1526669, 243)

In [34]:
# Output location. Set the HOME_CREDIT_DATA_DIR environment variable to write
# under another data root; the default keeps the original location.
data_root = os.environ.get(
    "HOME_CREDIT_DATA_DIR",
    "/Users/sophie/workspace/data/kaggle/home-credit-credit-risk-model-stability",
)
preprocess_dir_path = os.path.join(data_root, "preprocess")
preprocess_file = 'person_1.parquet'
preprocess_file_path = Path(os.path.join(preprocess_dir_path, preprocess_file))
# Create the output folder on first use so the write does not fail on a fresh checkout.
preprocess_file_path.parent.mkdir(parents=True, exist_ok=True)
data.write_parquet(preprocess_file_path)