In [None]:
# https://blog.csdn.net/s09094031/article/details/92428209?app_version=6.3.1&csdn_share_tail=%7B%22type%22%3A%22blog%22%2C%22rType%22%3A%22article%22%2C%22rId%22%3A%2292428209%22%2C%22source%22%3A%22unlogin%22%7D&utm_source=app

In [1]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob

import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation and dtype warnings from pandas/polars/LightGBM.
warnings.filterwarnings('ignore')

# Root data directory. Set the CREDIT_RISK_DATA_DIR environment variable to run
# the notebook on another machine; the default is the original local path.
ROOT = Path(
    os.environ.get(
        "CREDIT_RISK_DATA_DIR",
        "/Users/dustinhayes/Desktop/GitHub/stable-credit-risk-modeling/Data",
    )
)

In [2]:
from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer

In [3]:
class Pipeline:
    """
    A namespace of stateless polars preprocessing steps.

    Every method takes a polars DataFrame and returns a DataFrame, so each can
    be used directly (`Pipeline.filter_cols(df)`) or through
    `df.pipe(Pipeline.filter_cols)`. The methods are static: they work whether
    they are reached through the class or through an instance.
    """

    # Identifier / label columns that filter_cols must never drop.
    _PROTECTED_COLS = ("target", "case_id", "WEEK_NUM")

    @staticmethod
    def set_table_dtypes(df):
        """
        Set data types.

        Used in the data loading pipeline to set data types
        for efficiency.

        "case_id", "WEEK_NUM", "num_group1", "num_group2" are special
        columns. case_id is the identifier for each case, WEEK_NUM indicates
        the week that an observation was taken, and num_group1 and num_group2
        are indexes used in depth=1 and depth=2 tables. Int is suitable for each.

        date_decision is likewise special: it denotes the date that the choice
        to either deny or issue the loan was made.

        Datatypes for remaining columns are selected based on the last character
        in the column name:

        P - Transform DPD (Days past due) - Float
        M - Masking categories - String
        A - Transform amount - Float
        D - Transform date - Date
        T - Unspecified Transform - Not handled
        L - Unspecified Transform - Not handled

        Returns the frame with the casts applied; if no column matches any
        rule the input frame is returned unchanged.
        """
        casts = []
        for col in df.columns:
            if col in ("case_id", "WEEK_NUM", "num_group1", "num_group2"):
                casts.append(pl.col(col).cast(pl.Int64))
            elif col == "date_decision":
                casts.append(pl.col(col).cast(pl.Date))
            elif col[-1] in ("P", "A"):
                casts.append(pl.col(col).cast(pl.Float64))
            elif col[-1] == "M":
                casts.append(pl.col(col).cast(pl.String))
            elif col[-1] == "D":
                casts.append(pl.col(col).cast(pl.Date))
        # Apply every cast in a single with_columns call instead of building
        # one intermediate frame per column.
        return df.with_columns(casts) if casts else df

    @staticmethod
    def handle_dates(df):
        """
        Convert date values to a # days difference from date decision.

        This function locates date columns (ending in D) and replaces each with
        the number of whole days between that date and "date_decision"
        (column minus date_decision). The "date_decision" and "MONTH" columns
        are dropped afterwards, so both must be present in the input.
        """
        date_cols = [col for col in df.columns if col[-1] == "D"]
        if date_cols:
            df = df.with_columns(
                [
                    # Date - Date gives a Duration; total_days() turns it into an int.
                    (pl.col(col) - pl.col("date_decision")).dt.total_days().alias(col)
                    for col in date_cols
                ]
            )
        return df.drop("date_decision", "MONTH")

    @staticmethod
    def filter_cols(df):
        """
        Filters columns based on # of null values and frequency of categorical values,
        with exceptions for specific columns.

        Logic:
            - Drop if greater than 70% of the column is null and not in "target", "case_id", "WEEK_NUM"
            - If categorical (String) and not in "target", "case_id", "WEEK_NUM":
                - Drop if column has only one unique value (not informative)
                - Drop if column has more than 200 unique values (high cardinality, expensive and
                may lead to overfitting)
        """
        protected = Pipeline._PROTECTED_COLS

        mostly_null = [
            col
            for col in df.columns
            if col not in protected and df[col].is_null().mean() > 0.7
        ]
        if mostly_null:
            df = df.drop(mostly_null)

        # The cardinality check only looks at columns that survived the null filter.
        uninformative = []
        for col in df.columns:
            if col in protected or df[col].dtype != pl.String:
                continue
            freq = df[col].n_unique()
            if freq == 1 or freq > 200:
                uninformative.append(col)
        if uninformative:
            df = df.drop(uninformative)

        return df

In [4]:

class Aggregator:
    """
    A namespace for a set of functions used to aggregate data with depth > 0.

    Data with depth > 0 has multiple values per case_id, for instance, temporal data
    measured week by week related to a particular case_id. This data must be aggregated
    before an ML algorithm can be run on it.

    Each public method returns a list of polars expressions for one family of
    columns. Output columns are named "<stat>_<column>" and are ordered
    statistic by statistic (all max_*, then all last_*, then all mean_*).
    """

    @staticmethod
    def _cols_with_suffix(df, suffixes):
        """Return the columns of df whose last character is one of `suffixes`."""
        return [col for col in df.columns if col[-1] in suffixes]

    @staticmethod
    def _build_exprs(cols, stats):
        """
        Build `pl.<stat>(col).alias("<stat>_<col>")` for every stat and column.

        `stats` are names of top-level polars aggregations ("max", "last",
        "mean"). The statistic is the outer loop so the result keeps the
        "all max, then all last, ..." ordering.
        """
        return [
            getattr(pl, stat)(col).alias(f"{stat}_{col}")
            for stat in stats
            for col in cols
        ]

    @staticmethod
    def num_expr(df):
        """
        Aggregate numerical data (columns ending in P or A).

        Returns expressions for the max value, the last value and the mean.
        """
        cols = Aggregator._cols_with_suffix(df, ("P", "A"))
        return Aggregator._build_exprs(cols, ("max", "last", "mean"))

    @staticmethod
    def date_expr(df):
        """
        Aggregate date data (columns ending in D).

        Returns expressions for the max date, the last date and the mean date.

        NOTE(review): "last" is the last row of the group, not the latest date,
        so it only equals "max" when the rows are ordered by that date - confirm
        the row order of the source tables.
        """
        cols = Aggregator._cols_with_suffix(df, ("D",))
        return Aggregator._build_exprs(cols, ("max", "last", "mean"))

    @staticmethod
    def str_expr(df):
        """
        Aggregate categorical data (columns ending in M).

        Returns expressions for the "max" string and the last string of each
        group.

        NOTE(review): the max of a string column is presumably the
        lexicographically greatest value, which has no obvious meaning for
        categories - confirm whether this feature is useful.
        """
        cols = Aggregator._cols_with_suffix(df, ("M",))
        return Aggregator._build_exprs(cols, ("max", "last"))

    @staticmethod
    def other_expr(df):
        """
        Aggregate other data (columns ending in T or L).

        Returns expressions for the max and last values of each column.
        """
        cols = Aggregator._cols_with_suffix(df, ("T", "L"))
        return Aggregator._build_exprs(cols, ("max", "last"))

    @staticmethod
    def count_expr(df):
        """
        Aggregate the group index columns (names containing "num_group").

        Returns expressions for the max and last index of each group.

        NOTE(review): if num_group indexes count records from 0, the max is a
        proxy for the number of records per case_id - confirm against the data
        description.
        """
        cols = [col for col in df.columns if "num_group" in col]
        return Aggregator._build_exprs(cols, ("max", "last"))

    @staticmethod
    def get_exprs(df):
        """Return every aggregation expression that applies to the columns of df."""
        return (
            Aggregator.num_expr(df)
            + Aggregator.date_expr(df)
            + Aggregator.str_expr(df)
            + Aggregator.other_expr(df)
            + Aggregator.count_expr(df)
        )

In [5]:
def read_file(path, depth=None):
    """
    Read one parquet file and run the standard preprocessing on it.

    The frame is first typed with Pipeline.set_table_dtypes. When depth is
    1 or 2 (tables that may hold several records per case_id) it is then
    grouped by "case_id" and reduced with the expressions produced by
    Aggregator.get_exprs. Any other depth returns the typed frame as is.
    """
    typed = Pipeline.set_table_dtypes(pl.read_parquet(path))
    if depth not in (1, 2):
        return typed
    return typed.group_by("case_id").agg(Aggregator.get_exprs(typed))

def read_files(regex_path, depth=None):
    """
    Read every parquet file matching a glob pattern and stack the results.

    Each matching file goes through read_file (typing, plus per-case_id
    aggregation when depth is 1 or 2). The per-file frames are concatenated
    and reduced to one row per case_id.

    Parameters:
        regex_path: a glob pattern (str or Path), e.g. ".../train_static_0_*.parquet".
        depth: table depth forwarded to read_file.

    Raises:
        FileNotFoundError: if no file matches the pattern.

    NOTE(review): aggregation happens per file, so this assumes a case_id never
    spans two files; if it does, unique() keeps one of the per-file aggregates
    rather than an aggregate over all of its rows - confirm against the data.
    """
    # glob returns paths in arbitrary order; sort for a reproducible load order.
    paths = sorted(glob(str(regex_path)))
    if not paths:
        raise FileNotFoundError(f"No files match pattern: {regex_path}")

    chunks = [read_file(path, depth) for path in paths]
    df = pl.concat(chunks, how="vertical_relaxed")
    return df.unique(subset=["case_id"])

In [6]:
def feature_eng(df_base, depth_0, depth_1, depth_2):
    """
    Add decision-date features to the base table and join every other table.

    Two integer columns are derived from "date_decision": month_decision and
    weekday_decision. The frames in depth_0, depth_1 and depth_2 (in that
    order) are then left-joined onto the base table on "case_id"; clashing
    column names from the i-th joined frame receive the suffix "_i".
    Finally Pipeline.handle_dates is applied to the joined frame.
    """
    decision = pl.col("date_decision")
    joined = df_base.with_columns(
        month_decision=decision.dt.month(),
        weekday_decision=decision.dt.weekday(),
    )

    tables = depth_0 + depth_1 + depth_2
    for position, table in enumerate(tables):
        joined = joined.join(table, how="left", on="case_id", suffix=f"_{position}")

    return Pipeline.handle_dates(joined)

In [7]:
def to_pandas(df_data, cat_cols=None):
    """
    Convert a polars frame to pandas and mark categorical columns.

    Parameters:
        df_data: object exposing .to_pandas() (a polars DataFrame).
        cat_cols: columns to cast to the "category" dtype. When None, every
            column whose pandas dtype is "object" is used.

    Returns:
        (pandas DataFrame, list of categorical column names). Pass the returned
        list back in for another frame to reuse the same set of columns.
    """
    frame = df_data.to_pandas()
    if cat_cols is None:
        cat_cols = frame.select_dtypes("object").columns.tolist()
    frame[cat_cols] = frame[cat_cols].astype("category")
    return frame, cat_cols

In [8]:
def reduce_mem_usage(df, use_float16=True):
    """
    Downcast numeric columns of a pandas DataFrame to the smallest dtype that
    holds their observed min and max.

    The frame is modified in place (columns are reassigned) and also returned.

    Rules:
        - signed ints   -> smallest of int8/int16/int32/int64 that fits
        - unsigned ints -> smallest of uint8/uint16/uint32/uint64 that fits
        - floats        -> float16 (only if use_float16), else float32, when
                           min and max lie strictly inside that type's range
        - everything else (category, object, bool, datetime, extension dtypes)
          is left untouched, as is any column whose min/max cannot be compared
          (e.g. an all-NaN column).

    Parameters:
        df: pandas DataFrame to shrink.
        use_float16: keep the original behaviour of allowing float16. float16
            has about 3 significant decimal digits and cannot represent every
            integer above 2048, so pass False when that precision loss matters.

    Memory usage before and after is printed.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain numpy int/uint/float columns are candidates; dtype.kind
        # separates them from bool ('b'), datetime ('M'), object ('O'), etc.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ("i", "u", "f"):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == "f":
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            candidates = signed_types if col_type.kind == "i" else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's min/max still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [9]:
# The train and test parquet files sit side by side under the same folder of ROOT.
TRAIN_DIR, TEST_DIR = (
    ROOT / "original_parquet_files" / split for split in ("train", "test")
)

In [13]:

# All training tables, organized by depth. Names containing "*" are tables
# split over several files and go through read_files; every other name is a
# single file read with read_file. The order of the tables is significant:
# feature_eng uses each table's position as its join suffix.
def _load_train(name, depth=None):
    """Read one training table from TRAIN_DIR with the matching reader."""
    reader = read_files if "*" in name else read_file
    return reader(TRAIN_DIR / name, depth)


data_store = {
    "df_base": _load_train("train_base.parquet"),
    "depth_0": [
        _load_train(name)
        for name in (
            "train_static_cb_0.parquet",
            "train_static_0_*.parquet",
        )
    ],
    "depth_1": [
        _load_train(name, 1)
        for name in (
            "train_applprev_1_*.parquet",
            "train_tax_registry_a_1.parquet",
            "train_tax_registry_b_1.parquet",
            "train_tax_registry_c_1.parquet",
            "train_credit_bureau_a_1_*.parquet",
            "train_credit_bureau_b_1.parquet",
            "train_other_1.parquet",
            "train_person_1.parquet",
            "train_deposit_1.parquet",
            "train_debitcard_1.parquet",
        )
    ],
    "depth_2": [
        _load_train(name, 2)
        for name in (
            "train_credit_bureau_b_2.parquet",
            "train_credit_bureau_a_2_*.parquet",
            "train_applprev_2.parquet",
            "train_person_2.parquet",
        )
    ],
}

We now initialize the training set using the feature_eng function, which accepts the data_store unpacked into keyword arguments corresponding to the base dataframe, the depth 0 dataframes, the depth 1 dataframes, and the depth 2 dataframes. feature_eng left-joins every dataframe in the data_store onto the base dataframe (on case_id) and then handles dates using Pipeline.handle_dates. We then delete the data_store and collect garbage to save memory.

Pipeline.filter_cols is then called on the training set to remove columns with many nulls as well as categorical columns with too few or too many unique values.

df_train is then converted to pandas. This is presumably done to take advantage of functionality that pandas possesses but polars does not.



In [16]:
df_train = feature_eng(**data_store)
print("train data shape:\t", df_train.shape)
# The per-table frames are no longer needed once joined. They must be deleted
# before collecting, otherwise gc.collect() has nothing to free.
del data_store
gc.collect()
df_train = df_train.pipe(Pipeline.filter_cols) # This removes about half of all cols
df_train, cat_cols = to_pandas(df_train) # Sets cat_cols to be a list of "object" cols in the set
df_train = reduce_mem_usage(df_train)
print("train data shape:\t", df_train.shape)
nums=df_train.select_dtypes(exclude='category').columns # Grab numerical cols
from itertools import combinations, permutations
# Group the numerical columns by their number of missing values:
# nans_groups maps a NaN count to the list of columns with exactly that many
# NaNs, with columns kept in their original order.
nans_groups = {}
for col, n_missing in df_train[nums].isna().sum().items():
    nans_groups.setdefault(n_missing, []).append(col)
x = gc.collect()

def reduce_group(grps, df=None):
    """
    Pick one representative column from each group of columns.

    For every group the column with the most distinct values
    (`Series.nunique()`) is kept; ties go to the column listed first.

    Parameters:
        grps: list of groups, each a non-empty list of column names.
        df: DataFrame the columns are looked up in. Defaults to the
            notebook-level `df_train`, which was the original behaviour.

    Returns:
        List with one column name per group, in group order. The list is also
        printed.
    """
    if df is None:
        df = df_train
    # max() returns the first maximal element, matching the original
    # "strictly greater replaces" loop, including the all-zero case.
    use = [max(g, key=lambda c: df[c].nunique()) for g in grps]
    print('Use these',use)
    return use

def group_columns_by_correlation(matrix, threshold=0.8):
    """
    Greedily partition the columns of `matrix` into correlation groups.

    The pairwise correlation matrix is computed once with `matrix.corr()`.
    The first ungrouped column then becomes an anchor and collects every
    remaining column whose correlation with it is >= threshold (signed, not
    absolute); the process repeats on whatever is left.

    Returns a list of groups, each a list of column names starting with its
    anchor.

    Note: columns are only compared with the anchor, never with each other,
    so two members of a group may be weakly correlated and the result depends
    on column order.
    """
    corr = matrix.corr()

    groups = []
    pending = list(matrix.columns)
    while pending:
        anchor, *others = pending
        grouped = [anchor] + [c for c in others if corr.loc[anchor, c] >= threshold]
        groups.append(grouped)
        # Whatever was not absorbed by this anchor is considered in the next round.
        pending = [c for c in others if c not in grouped]

    return groups

# Within each set of numerical columns that share the same NaN count, keep one
# representative per correlation group (threshold 0.8).
uses=[]
for k,v in nans_groups.items():
    if len(v)>1: # If more than one feature has a particular amount of nulls
        grps = group_columns_by_correlation(df_train[v], threshold=0.8)
        uses.extend(reduce_group(grps))
    else:
        uses.extend(v)
    print('####### NAN count =',k)
# The label, the case identifier and the CV grouping column are required by
# later cells, so restore any of them that correlation pruning discarded.
uses.extend(
    col
    for col in ("target", "case_id", "WEEK_NUM")
    if col in df_train.columns and col not in uses
)
print(uses)
print(len(uses))
# Categorical columns are not part of the correlation pruning; keep them all.
uses=uses+list(df_train.select_dtypes(include='category').columns)
print(len(uses))
df_train=df_train[uses]

train data shape:	 (1526659, 861)
Memory usage of dataframe is 4322.75 MB
Memory usage after optimization is: 1528.81 MB
Decreased by 64.6%
train data shape:	 (1526659, 472)


In [None]:
# Read the sample submission from the same data directory the submission cell
# uses, instead of a hard-coded Kaggle path that does not exist locally.
sample = pd.read_csv(ROOT / "sample_submission.csv")
device = 'gpu'
# A 10-row sample submission is treated as a dry run: train on CPU with only
# the first 50,000 rows.
# NOTE(review): presumably 10 rows is the size of the public placeholder test
# set - confirm.
DRY_RUN = sample.shape[0] == 10
if DRY_RUN:
    device = 'cpu'
    df_train = df_train.iloc[:50000]
print(device)

In [None]:
# All test tables, organized by depth, in the same order as the training
# tables. Names containing "*" are tables split over several files and go
# through read_files; every other name is a single file read with read_file.
def _load_test(name, depth=None):
    """Read one test table from TEST_DIR with the matching reader."""
    reader = read_files if "*" in name else read_file
    return reader(TEST_DIR / name, depth)


data_store = {
    "df_base": _load_test("test_base.parquet"),
    "depth_0": [
        _load_test(name)
        for name in (
            "test_static_cb_0.parquet",
            "test_static_0_*.parquet",
        )
    ],
    "depth_1": [
        _load_test(name, 1)
        for name in (
            "test_applprev_1_*.parquet",
            "test_tax_registry_a_1.parquet",
            "test_tax_registry_b_1.parquet",
            "test_tax_registry_c_1.parquet",
            "test_credit_bureau_a_1_*.parquet",
            "test_credit_bureau_b_1.parquet",
            "test_other_1.parquet",
            "test_person_1.parquet",
            "test_deposit_1.parquet",
            "test_debitcard_1.parquet",
        )
    ],
    "depth_2": [
        _load_test(name, 2)
        for name in (
            "test_credit_bureau_b_2.parquet",
            "test_credit_bureau_a_2_*.parquet",
            "test_applprev_2.parquet",
            "test_person_2.parquet",
        )
    ],
}

In [None]:
# Build the test frame the same way as the training frame, then free the
# per-table frames.
df_test = feature_eng(**data_store)
print("test data shape:\t", df_test.shape)
del data_store
gc.collect()

# Keep exactly the columns that survived selection on the training set
# (everything except the label).
_test_cols = [col for col in df_train.columns if col != "target"]
df_test = df_test.select(_test_cols)
print("train data shape:\t", df_train.shape)
print("test data shape:\t", df_test.shape)

# Reuse the training categorical column list so both frames agree on which
# columns are "category".
df_test, cat_cols = to_pandas(df_test, cat_cols)
df_test = reduce_mem_usage(df_test)

gc.collect()

### Feature Selection

In [None]:
# drop_list = ['max_empl_employedtotal_800L', 'monthsannuity_845L', 'lastactivateddate_801D', 
#              'max_numberofoverdueinstls_725L', 'requesttype_4525192L', 'max_pmts_year_507T', 
#              'lastrejectcommodtypec_5251769M', 'numinstpaidlate1d_3546852L', 'numinstmatpaidtearly2d_4499204L', 
#              'max_overdueamountmaxdateyear_2T', 'max_overdueamountmaxdateyear_994T', 'twobodfilling_608L', 
#              'maxdpdlast12m_727P', 'numinsttopaygrest_4493213L', 'currdebtcredtyperange_828A', 'maxdpdlast9m_1059P', 
#              'numinstpaid_4499208L', 'applicationscnt_867L', 'numinstlswithoutdpd_562L', 'fourthquarter_440L', 
#              'max_num_group1_6', 'max_safeguarantyflag_411L', 'max_dpdmaxdateyear_896T', 'numinstregularpaid_973L', 
#              'avgdbdtollast24m_4525197P', 'numinstpaidearly5dest_4493211L', 'numinstpaidearly5dobd_4499205L', 
#              'homephncnt_628L', 'max_role_1084L', 'max_remitter_829L', 'numrejects9m_859L', 
#              'numinstlallpaidearly3d_817L', 'numinstpaidearly3dest_4493216L', 'annuitynextmonth_57A', 
#              'numinstregularpaidest_4493210L', 'firstquarter_103L', 'clientscnt_533L', 'maxdpdlast3m_392P', 
#              'sellerplacescnt_216L', 'secondquarter_766L', 'max_periodicityofpmts_1102L', 'numinstlsallpaid_934L', 
#              'opencred_647L', 'numinstls_657L', 'numactivecredschannel_414L', 'numinstpaidearly3d_3546850L', 
#              'numinstpaidearlyest_4493214L', 'max_totaldebtoverduevalue_718A', 'paytype1st_925L', 
#              'max_inittransactioncode_279L', 'max_contractst_545M', 'max_cancelreason_3545846M', 
#              'max_rejectreason_755M', 'max_personindex_1023L', 'max_subjectroles_name_838M', 'maxdpdlast6m_474P', 
#              'max_subjectrole_182M', 'actualdpdtolerance_344P', 'max_num_group1_9', 'max_collaterals_typeofguarante_669M', 
#              'numinstpaidearly_338L', 'clientscnt_887L', 'maritalst_893M', 'max_subjectrole_93M', 'max_type_25L', 
#              'max_refreshdate_3813885D', 'numinstpaidearly5d_1087L', 'max_actualdpd_943P', 'max_description_351M', 
#              'education_88M', 'clientscnt_946L', 'clientscnt12m_3712952L', 'numactiverelcontr_750L', 
#              'max_education_927M', 'applicationscnt_1086L', 'sellerplacecnt_915L', 'max_purposeofcred_426M', 
#              'max_subjectroles_name_541M', 'clientscnt_1022L', 'clientscnt_360L', 'max_totaloutstanddebtvalue_668A', 
#              'applicationscnt_629L', 'max_outstandingamount_354A', 'clientscnt_1071L', 'numactivecreds_622L', 
#              'clientscnt_493L', 'paytype_783L', 'clientscnt6m_3712949L', 'clientscnt_304L', 'max_classificationofcontr_13M', 
#              'numnotactivated_1143L', 'commnoinclast6m_3546845L', 'max_numberofoutstandinstls_520L', 
#              'applicationscnt_464L', 'clientscnt_1130L', 'max_numberofoverdueinstls_834L', 'clientscnt3m_3712950L', 
#              'max_rejectreasonclient_4145042M', 'max_contaddr_smempladdr_334L', 'numpmtchanneldd_318L', 
#              'numcontrs3months_479L', 'max_overdueamount_31A', 'max_collaterals_typeofguarante_359M', 
#              'clientscnt_257L', 'clientscnt_157L', 'applications30d_658L', 'clientscnt_100L', 
#              'max_collater_typofvalofguarant_298M', 'max_pmts_month_706T', 'max_pmts_month_158T', 
#              'mastercontrexist_109L', 'max_collater_typofvalofguarant_407M', 'mastercontrelectronic_519L', 
#              'applicationcnt_361L', 'max_persontype_1072L', 'max_empladdr_district_926M', 'deferredmnthsnum_166L', 
#              'max_empladdr_zipcode_114M', 'max_persontype_792L', 'max_contaddr_matchlist_1032L']

In [None]:
# df_train = df_train.drop(drop_list)
# df_test = df_test.drop(drop_list)

### Handle categorical features (Ordinal encoding)

In [None]:
# cat_list = [col for col in df_train.columns if df_train[col].dtype.name == 'category']

# catfreq_dict = {}
# catcatfreq_dict = {}

# for col in cat_list:
#     catfreq_dict[col] = len(list(df_train[col].value_counts()))
#     catcatfreq_dict[col] = {}
#     for d in dict(df_train[col].value_counts()).items():
#         catcatfreq_dict[col][d[0]] = d[1]

# catfreq_df = pd.DataFrame.from_dict(catfreq_dict, orient='index', columns=['Categories'])
# display(catfreq_df.sort_values(by="Categories", ascending=False).head())
# display(catfreq_df.sort_values(by="Categories", ascending=True).head())

In [None]:
# ordinal_enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
# df_train[cat_list] = ordinal_enc.fit_transform(df_train[cat_list])
# df_test[cat_list] = ordinal_enc.transform(df_test[cat_list])
# df_train[cat_list].head()

### Handle NaN

In [None]:
# nan_list = []
# for col, boo in df_train.isnull().any().items():
#     if boo == True:
#         nan_list.append(col)

# print(f"Number of col contains Nan value: {len(nan_list)}")
# for i, v in df_train.isna().sum().items():
#     if v/len(df_train)>0.6:
#         print(f"{i} : \t {round((v/len(df_train))*100)}% Nan ")

In [None]:
# ### trial
# from sklearn.impute import SimpleImputer
# imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# df_train[nan_list] = imp.fit_transform(df_train[nan_list])
# df_test[nan_list] = imp.transform(df_test[nan_list])

In [None]:
# ## no work (too slow..) 
# ## require dimensionality reduction first & features with high feature impor. 
# imputer = KNNImputer()
# df_train = imputer.fit_transform(df_train)

In [None]:
# Pull the label and the CV grouping column out of the training frame, then
# drop them (and the identifier) so only features remain.
cv = StratifiedGroupKFold(n_splits=5, shuffle=False)
y, weeks = df_train["target"], df_train["WEEK_NUM"]
df_train = df_train.drop(columns=["target", "case_id", "WEEK_NUM"])

In [None]:

# LightGBM hyper-parameters shared by every fold. `device` is set by the
# dry-run cell ('gpu' or 'cpu').
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,
    "learning_rate": 0.05,
    "n_estimators": 2000,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "verbose": -1,
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees": True,
    "num_leaves": 64,
    "device": device,
}

fitted_models = []
cv_scores = []

# One model per fold. groups=weeks is passed to the splitter so that it can
# keep all rows of a WEEK_NUM in the same fold.
for idx_train, idx_valid in cv.split(df_train, y, groups=weeks):
    X_train, y_train = df_train.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = df_train.iloc[idx_valid], y.iloc[idx_valid]

    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        # Log every 200 rounds; stop once the validation AUC has not improved
        # for 100 rounds.
        callbacks=[lgb.log_evaluation(200), lgb.early_stopping(100)],
    )
    fitted_models.append(model)

    y_pred_valid = model.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores.append(auc_score)

print("CV AUC scores: ", cv_scores)
print("Maximum CV AUC score: ", max(cv_scores))

In [None]:
class VotingModel(BaseEstimator, RegressorMixin):
    """
    Averages the outputs of already-fitted estimators.

    `fit` is a no-op: the wrapped estimators are expected to be trained
    beforehand. `predict` and `predict_proba` return the element-wise mean of
    the corresponding outputs of every estimator.

    NOTE(review): the class derives from RegressorMixin although it is used
    through predict_proba - confirm this is intentional.
    """

    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators

    def fit(self, X, y=None):
        """Do nothing and return self; the estimators are already fitted."""
        return self

    def _mean_output(self, method_name, X):
        """Call `method_name` on every estimator and average the results."""
        outputs = [getattr(estimator, method_name)(X) for estimator in self.estimators]
        return np.mean(outputs, axis=0)

    def predict(self, X):
        """Return the mean of the estimators' `predict` outputs."""
        return self._mean_output("predict", X)

    def predict_proba(self, X):
        """Return the mean of the estimators' `predict_proba` outputs."""
        return self._mean_output("predict_proba", X)

model = VotingModel(fitted_models)

In [None]:
# Split-count feature importance of the third fold's model; the tall figure
# leaves one readable row per feature.
importance_ax = lgb.plot_importance(
    fitted_models[2],
    importance_type="split",
    figsize=(10, 50),
)
plt.show()

In [None]:
# Features whose importance in the third fold's model is below this value are
# reported as candidates for removal.
MIN_IMPORTANCE = 80

# Take the feature names from the training frame rather than from X_train,
# which only exists as a leftover of the last CV loop iteration.
features = df_train.columns
importances = fitted_models[2].feature_importances_
assert len(features) == len(importances), "feature names and importances are misaligned"

feature_importance = (
    pd.DataFrame({'importance': importances, 'features': features})
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
)

# Vectorized selection; the order follows the descending importance sort.
drop_list = feature_importance.loc[
    feature_importance['importance'] < MIN_IMPORTANCE, 'features'
].tolist()
print(f"Number of features which are not important: {len(drop_list)} ")

print(drop_list)

### Submission

In [None]:
# WEEK_NUM is not a model feature; case_id becomes the index so predictions
# can be matched to the submission file by case.
df_test = df_test.drop(columns=["WEEK_NUM"]).set_index("case_id")


In [None]:
# Probability of the positive class from the fold-averaged model, keyed by case_id.
positive_proba = model.predict_proba(df_test)[:, 1]
y_pred = pd.Series(positive_proba, index=df_test.index)

# Scores are assigned by index, so they align with the sample file's case_id rows.
df_subm = pd.read_csv(ROOT / "sample_submission.csv").set_index("case_id")
df_subm["score"] = y_pred
df_subm.to_csv("submission.csv")
df_subm

In [None]:
#X_test = df_test.drop(columns=["WEEK_NUM"])
#X_test = X_test.set_index("case_id")

#lgb_pred = pd.Series(model.predict_proba(X_test)[:, 1], index=X_test.index)

#df_subm = pd.read_csv(ROOT / "sample_submission.csv")
#df_subm = df_subm.set_index("case_id")

#df_subm["score"] = lgb_pred

#df_subm.head()

#df_subm.to_csv("submission.csv")

* handle nan value (both numeric & categorical)
* preprocess the minority categorical value (drop / keep)
* float to int after ordinal encode
* oversampling (SMOTE? — taking the date features into account)
* high dimension (pca) 
* create new features based on high fea_imp features
...
...

In [None]:
# X_resampled, y_resampled = SMOTE().fit_resample(X, y)