In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/home-credit-credit-risk-model-stability/sample_submission.csv
/kaggle/input/home-credit-credit-risk-model-stability/feature_definitions.csv
/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_deposit_1.parquet
/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_applprev_2.parquet
/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_static_cb_0.parquet
/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_static_0_0.parquet
/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_credit_bureau_a_1_3.parquet
/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_credit_bureau_a_1_2.parquet
/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_tax_registry_b_1.parquet
/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_static_0_2.parquet
/kaggle/input/home-credit-credit-risk-model-st

In [2]:
import gc
import lightgbm as lgb  # type: ignore
import numpy as np  # type: ignore
import pandas as pd  # type: ignore
import polars as pl  # type: ignore
import warnings

from catboost import CatBoostClassifier, Pool  # type: ignore
from glob import glob
from IPython.display import display  # type: ignore
from pathlib import Path
from sklearn.base import BaseEstimator, ClassifierMixin  # type: ignore
from sklearn.metrics import roc_auc_score  # type: ignore
from sklearn.model_selection import StratifiedGroupKFold  # type: ignore
from typing import Any
from scipy import stats
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

#!pip install category_encoders
import category_encoders as ce
from category_encoders import TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder


warnings.filterwarnings("ignore")

ROOT = Path("/kaggle/input/home-credit-credit-risk-model-stability")
TRAIN_DIR = ROOT / "parquet_files" / "train"
TEST_DIR = ROOT / "parquet_files" / "test"

In [3]:
class Utility:
    @staticmethod
    def get_feat_defs(ending_with: str) -> None:
        """
        Retrieves feature definitions from a CSV file based on the specified ending.

        Args:
        - ending_with (str): Ending to filter feature definitions.

        Returns:
        - pl.DataFrame: Filtered feature definitions.
        """
        feat_defs: pl.DataFrame = pl.read_csv(ROOT / "feature_definitions.csv")

        filtered_feats: pl.DataFrame = feat_defs.filter(
            pl.col("Variable").apply(lambda var: var.endswith(ending_with))
        )

        with pl.Config(fmt_str_lengths=200, tbl_rows=-1):
            print(filtered_feats)

        filtered_feats = None
        feat_defs = None

    @staticmethod
    def find_index(lst: list[Any], item: Any) -> int | None:
        """
        Finds the index of an item in a list.

        Args:
        - lst (list): List to search.
        - item (Any): Item to find in the list.

        Returns:
        - int | None: Index of the item if found, otherwise None.
        """
        try:
            return lst.index(item)
        except ValueError:
            return None

    @staticmethod
    def dtype_to_str(dtype: pl.DataType) -> str:
        """
        Converts Polars data type to string representation.

        Args:
        - dtype (pl.DataType): Polars data type.

        Returns:
        - str: String representation of the data type.
        """
        dtype_map = {
            pl.Decimal: "Decimal",
            pl.Float32: "Float32",
            pl.Float64: "Float64",
            pl.UInt8: "UInt8",
            pl.UInt16: "UInt16",
            pl.UInt32: "UInt32",
            pl.UInt64: "UInt64",
            pl.Int8: "Int8",
            pl.Int16: "Int16",
            pl.Int32: "Int32",
            pl.Int64: "Int64",
            pl.Date: "Date",
            pl.Datetime: "Datetime",
            pl.Duration: "Duration",
            pl.Time: "Time",
            pl.Array: "Array",
            pl.List: "List",
            pl.Struct: "Struct",
            pl.String: "String",
            pl.Categorical: "Categorical",
            pl.Enum: "Enum",
            pl.Utf8: "Utf8",
            pl.Binary: "Binary",
            pl.Boolean: "Boolean",
            pl.Null: "Null",
            pl.Object: "Object",
            pl.Unknown: "Unknown",
        }

        return dtype_map.get(dtype)

    @staticmethod
    def find_feat_occur(regex_path: str, ending_with: str) -> pl.DataFrame:
        """
        Finds occurrences of features ending with a specific string in Parquet files.

        Args:
        - regex_path (str): Regular expression to match Parquet file paths.
        - ending_with (str): Ending to filter feature names.

        Returns:
        - pl.DataFrame: DataFrame containing feature definitions, data types, and file locations.
        """
        feat_defs: pl.DataFrame = pl.read_csv(ROOT / "feature_definitions.csv").filter(
            pl.col("Variable").apply(lambda var: var.endswith(ending_with))
        )
        feat_defs.sort(by=["Variable"])

        feats: list[pl.String] = feat_defs["Variable"].to_list()
        feats.sort()

        occurrences: list[list] = [[set(), set()] for _ in range(feat_defs.height)]

        for path in glob(str(regex_path)):
            df_schema: dict = pl.read_parquet_schema(path)

            for feat, dtype in df_schema.items():
                index: int = Utility.find_index(feats, feat)
                if index != None:
                    occurrences[index][0].add(Utility.dtype_to_str(dtype))
                    occurrences[index][1].add(Path(path).stem)

        data_types: list[str] = [None] * feat_defs.height
        file_locs: list[str] = [None] * feat_defs.height

        for i, feat in enumerate(feats):
            data_types[i] = list(occurrences[i][0])
            file_locs[i] = list(occurrences[i][1])

        feat_defs = feat_defs.with_columns(pl.Series(data_types).alias("Data_Type(s)"))
        feat_defs = feat_defs.with_columns(pl.Series(file_locs).alias("File_Loc(s)"))

        return feat_defs

    def reduce_memory_usage(df, name) :
        """
        Reduces memory usage of a DataFrame by converting column types.

        Args:
        - df (pl.DataFrame): DataFrame to optimize.
        - name (str): Name of the DataFrame.

        Returns:
        - pl.DataFrame: Optimized DataFrame.
        """
        print(
            f"Memory usage of dataframe \"{name}\" is {round(df.estimated_size('mb'), 4)} MB."
        )

        int_types = [
            pl.Int8,
            pl.Int16,
            pl.Int32,
            pl.Int64,
            pl.UInt8,
            pl.UInt16,
            pl.UInt32,
            pl.UInt64,
        ]
        float_types = [pl.Float32, pl.Float64]

        for col in df.columns:
            col_type = df[col].dtype
            if col_type in int_types + float_types:
                c_min = df[col].min()
                c_max = df[col].max()

                if c_min is not None and c_max is not None:
                    if col_type in int_types:
                        if c_min >= 0:
                            if (
                                c_min >= np.iinfo(np.uint8).min
                                and c_max <= np.iinfo(np.uint8).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt8))
                            elif (
                                c_min >= np.iinfo(np.uint16).min
                                and c_max <= np.iinfo(np.uint16).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt16))
                            elif (
                                c_min >= np.iinfo(np.uint32).min
                                and c_max <= np.iinfo(np.uint32).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt32))
                            elif (
                                c_min >= np.iinfo(np.uint64).min
                                and c_max <= np.iinfo(np.uint64).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt64))
                        else:
                            if (
                                c_min >= np.iinfo(np.int8).min
                                and c_max <= np.iinfo(np.int8).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int8))
                            elif (
                                c_min >= np.iinfo(np.int16).min
                                and c_max <= np.iinfo(np.int16).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int16))
                            elif (
                                c_min >= np.iinfo(np.int32).min
                                and c_max <= np.iinfo(np.int32).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int32))
                            elif (
                                c_min >= np.iinfo(np.int64).min
                                and c_max <= np.iinfo(np.int64).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int64))
                    elif col_type in float_types:
                        if (
                            c_min > np.finfo(np.float32).min
                            and c_max < np.finfo(np.float32).max
                        ):
                            df = df.with_columns(df[col].cast(pl.Float32))

        print(
            f"Memory usage of dataframe \"{name}\" became {round(df.estimated_size('mb'), 4)} MB."
        )

        return df

    def to_pandas(df: pl.DataFrame, cat_cols: list[str] = None) -> (pd.DataFrame, list[str]):  # type: ignore
        """
        Converts a Polars DataFrame to a Pandas DataFrame.

        Args:
        - df (pl.DataFrame): Polars DataFrame to convert.
        - cat_cols (list[str]): List of categorical columns. Default is None.

        Returns:
        - (pd.DataFrame, list[str]): Tuple containing the converted Pandas DataFrame and categorical columns.
        """
        df: pd.DataFrame = df.to_pandas()

        if cat_cols is None:
            cat_cols = list(df.select_dtypes("object").columns)

        df[cat_cols] = df[cat_cols].astype("str")

        return df, cat_cols

In [4]:
class Aggregator:
    @staticmethod
    def max_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating maximum values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for maximum values.
        """
        cols: list[str] = [
            col
            for col in df.columns
            if (col[-1] in ("P", "M", "A", "D", "T", "L")) or ("num_group" in col)
        ]

        expr_max: list[pl.Series] = [
            pl.col(col).max().alias(f"max_{col}") for col in cols
        ]

        return expr_max

    @staticmethod
    def min_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating minimum values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for minimum values.
        """
        cols: list[str] = [
            col
            for col in df.columns
            if (col[-1] in ("P", "M", "A", "D", "T", "L")) or ("num_group" in col)
        ]

        expr_min: list[pl.Series] = [
            pl.col(col).min().alias(f"min_{col}") for col in cols
        ]

        return expr_min

    @staticmethod
    def mean_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating mean values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for mean values.
        """
        cols: list[str] = [col for col in df.columns if col.endswith(("P", "A", "D"))]

        expr_mean: list[pl.Series] = [
            pl.col(col).mean().alias(f"mean_{col}") for col in cols
        ]

        return expr_mean

    @staticmethod
    def var_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating variance for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for variance.
        """
        cols: list[str] = [col for col in df.columns if col.endswith(("P", "A", "D"))]

        expr_mean: list[pl.Series] = [
            pl.col(col).var().alias(f"var_{col}") for col in cols
        ]

        return expr_mean

    @staticmethod
    def mode_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating mode values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for mode values.
        """
        cols: list[str] = [col for col in df.columns if col.endswith("M")]

        expr_mode: list[pl.Series] = [
            pl.col(col).drop_nulls().mode().first().alias(f"mode_{col}") for col in cols
        ]

        return expr_mode

    @staticmethod
    def get_exprs(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Combines expressions for maximum, mean, and variance calculations.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of combined expressions.
        """
        exprs = (
            Aggregator.max_expr(df) + Aggregator.mean_expr(df) + Aggregator.var_expr(df)
        )

        return exprs

In [5]:
class SchemaGen:
    @staticmethod
    def change_dtypes(df: pl.LazyFrame) -> pl.LazyFrame:
        """
        Changes the data types of columns in the DataFrame.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - pl.LazyFrame: LazyFrame with modified data types.
        """
        for col in df.columns:
            if col == "case_id":
                df = df.with_columns(pl.col(col).cast(pl.UInt32).alias(col))
            elif col in ["WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.UInt16).alias(col))
            elif col == "date_decision" or col[-1] == "D":
                df = df.with_columns(pl.col(col).cast(pl.Date).alias(col))
            elif col[-1] in ["P", "A"]:
                df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))
            elif col[-1] in ("M",):
                df = df.with_columns(pl.col(col).cast(pl.String))
        return df

    @staticmethod
    def scan_files(glob_path: str, depth: int = None) -> pl.LazyFrame:
        """
        Scans Parquet files matching the glob pattern and combines them into a LazyFrame.

        Args:
        - glob_path (str): Glob pattern to match Parquet files.
        - depth (int, optional): Depth level for data aggregation. Defaults to None.

        Returns:
        - pl.LazyFrame: Combined LazyFrame.
        """
        chunks: list[pl.LazyFrame] = []
        for path in glob(str(glob_path)):
            df: pl.LazyFrame = pl.scan_parquet(
                path, low_memory=True, rechunk=True
            ).pipe(SchemaGen.change_dtypes)
            print(f"File {Path(path).stem} loaded into memory.")

            if depth in (1, 2):
                exprs: list[pl.Series] = Aggregator.get_exprs(df)
                df = df.group_by("case_id").agg(exprs)

                del exprs
                gc.collect()

            chunks.append(df)

        df = pl.concat(chunks, how="vertical_relaxed")

        del chunks
        gc.collect()

        df = df.unique(subset=["case_id"])

        return df

    @staticmethod
    def join_dataframes(
        df_base: pl.LazyFrame,
        depth_0: list[pl.LazyFrame],
        depth_1: list[pl.LazyFrame],
        depth_2: list[pl.LazyFrame],
    ) -> pl.DataFrame:
        """
        Joins multiple LazyFrames with a base LazyFrame.

        Args:
        - df_base (pl.LazyFrame): Base LazyFrame.
        - depth_0 (list[pl.LazyFrame]): List of LazyFrames for depth 0.
        - depth_1 (list[pl.LazyFrame]): List of LazyFrames for depth 1.
        - depth_2 (list[pl.LazyFrame]): List of LazyFrames for depth 2.

        Returns:
        - pl.DataFrame: Joined DataFrame.
        """
        for i, df in enumerate(depth_0 + depth_1 + depth_2):
            df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")
            
        return df_base.collect().pipe(Utility.reduce_memory_usage, "df_train")

In [6]:
def filter_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Filters columns in the DataFrame based on null percentage and unique values for string columns.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with filtered columns.
    """
    for col in df.columns:
        if col not in ["case_id", "year", "month", "week_num", "target"]:
            null_pct = df[col].is_null().mean()

            if null_pct > 0.95:
                df = df.drop(col)

    for col in df.columns:
        if (col not in ["case_id", "year", "month", "week_num", "target"]) & (
            df[col].dtype == pl.String
        ):
            freq = df[col].n_unique()

            if (freq > 200) | (freq == 1):
                df = df.drop(col)

    return df


def transform_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Transforms columns in the DataFrame according to predefined rules.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with transformed columns.
    """
    if "riskassesment_302T" in df.columns:
        if df["riskassesment_302T"].dtype == pl.Null:
            df = df.with_columns(
                [
                    pl.Series(
                        "riskassesment_302T_rng", df["riskassesment_302T"], pl.UInt8
                    ),
                    pl.Series(
                        "riskassesment_302T_mean", df["riskassesment_302T"], pl.UInt8
                    ),
                ]
            )
        else:
            pct_low: pl.Series = (
                df["riskassesment_302T"]
                .str.split(" - ")
                .apply(lambda x: x[0].replace("%", ""))
                .cast(pl.UInt8)
            )
            pct_high: pl.Series = (
                df["riskassesment_302T"]
                .str.split(" - ")
                .apply(lambda x: x[1].replace("%", ""))
                .cast(pl.UInt8)
            )

            diff: pl.Series = pct_high - pct_low
            avg: pl.Series = ((pct_low + pct_high) / 2).cast(pl.Float32)

            del pct_high, pct_low
            gc.collect()

            df = df.with_columns(
                [
                    diff.alias("riskassesment_302T_rng"),
                    avg.alias("riskassesment_302T_mean"),
                ]
            )

        df.drop("riskassesment_302T")

    return df


def handle_dates(df: pl.DataFrame) -> pl.DataFrame:
    """
    Handles date columns in the DataFrame.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with transformed date columns.
    """
    for col in df.columns:
        if col.endswith("D"):
            df = df.with_columns(pl.col(col) - pl.col("date_decision"))
            df = df.with_columns(pl.col(col).dt.total_days().cast(pl.Int32))

    df = df.rename(
        {
            "MONTH": "month",
            "WEEK_NUM": "week_num"
        }
    )
            
    df = df.with_columns(
        [
            pl.col("date_decision").dt.year().alias("year").cast(pl.Int16),
            pl.col("date_decision").dt.day().alias("day").cast(pl.UInt8),
        ]
    )

    return df.drop("date_decision")

In [7]:
data_store: dict = {
    "df_base": SchemaGen.scan_files(TRAIN_DIR / "train_base.parquet"),
    "depth_0": [
        SchemaGen.scan_files(TRAIN_DIR / "train_static_cb_0.parquet"),
        SchemaGen.scan_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        SchemaGen.scan_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_other_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_person_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_deposit_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
    ],
}

df_train: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(transform_cols)
    .pipe(handle_dates)
    .pipe(Utility.reduce_memory_usage, "df_train")
)

del data_store
gc.collect()

print(f"Train data shape: {df_train.shape}")
display(df_train.head(10))

File train_base loaded into memory.
File train_static_cb_0 loaded into memory.
File train_static_0_0 loaded into memory.
File train_static_0_1 loaded into memory.
File train_applprev_1_1 loaded into memory.
File train_applprev_1_0 loaded into memory.
File train_tax_registry_a_1 loaded into memory.
File train_tax_registry_b_1 loaded into memory.
File train_tax_registry_c_1 loaded into memory.
File train_credit_bureau_a_1_3 loaded into memory.
File train_credit_bureau_a_1_2 loaded into memory.
File train_credit_bureau_a_1_0 loaded into memory.
File train_credit_bureau_a_1_1 loaded into memory.
File train_credit_bureau_b_1 loaded into memory.
File train_other_1 loaded into memory.
File train_person_1 loaded into memory.
File train_deposit_1 loaded into memory.
File train_debitcard_1 loaded into memory.
File train_credit_bureau_a_2_6 loaded into memory.
File train_credit_bureau_a_2_1 loaded into memory.
File train_credit_bureau_a_2_0 loaded into memory.
File train_credit_bureau_a_2_7 loade

case_id,month,week_num,target,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,dateofbirth_342D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,for3years_128L,for3years_504L,for3years_584L,formonth_118L,formonth_206L,formonth_535L,forquarter_1017L,forquarter_462L,forquarter_634L,fortoday_1092L,forweek_1077L,forweek_528L,forweek_601L,foryear_618L,foryear_818L,foryear_850L,fourthquarter_440L,…,max_collaterals_typeofguarante_359M,max_collaterals_typeofguarante_669M,max_num_group1_12,max_num_group2,max_pmts_dpd_1073P,max_pmts_dpd_303P,max_pmts_month_158T,max_pmts_month_706T,max_pmts_overdue_1140A,max_pmts_overdue_1152A,max_pmts_year_1139T,max_pmts_year_507T,max_subjectroles_name_541M,max_subjectroles_name_838M,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,mean_pmts_overdue_1140A,mean_pmts_overdue_1152A,var_pmts_dpd_1073P,var_pmts_dpd_303P,var_pmts_overdue_1140A,var_pmts_overdue_1152A,max_num_group1_13,max_num_group2_13,max_pmts_date_1107D,max_pmts_dpdvalue_108P,max_pmts_pmtsoverdue_635A,mean_pmts_date_1107D,mean_pmts_dpdvalue_108P,mean_pmts_pmtsoverdue_635A,var_pmts_date_1107D,var_pmts_dpdvalue_108P,var_pmts_pmtsoverdue_635A,riskassesment_302T_rng,riskassesment_302T_mean,year,day
u32,u32,u8,u8,i16,u8,i16,i16,f32,i32,i32,f32,f32,f32,f32,f32,str,str,str,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,str,str,u16,u8,f32,f32,f32,f32,f32,f32,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,f32,u8,u8,i16,f32,f32,i16,f32,f32,i32,f32,f32,u8,f32,u16,u8
1416641,201906,24,0,,,,-10613.0,,-10613.0,,1.0,1.0,1.0,3.0,1.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",1.0,,,,,,,,,,,,,,,,,4.0,…,"""c7a5ad39""","""a55475b1""",3.0,23.0,,0.0,,12.0,,0.0,,2019.0,"""ab3c25cf""","""a55475b1""",,0.0,,0.0,,0.0,,0.0,,,,,,,,,,,,,,2019,22
1708174,201912,51,0,,,,,,-10160.0,,2.0,2.0,1.0,2.0,2.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,,,,,,,,,,,,,,,,,2.0,…,"""c7a5ad39""","""c7a5ad39""",1.0,35.0,0.0,144.0,12.0,12.0,0.0,10896.366211,2020.0,2018.0,"""ab3c25cf""","""ab3c25cf""",0.0,22.625,0.0,1905.939697,0.0,1994.505493,0.0,12090471.0,,,,,,,,,,,,,,2019,25
970423,202003,62,0,,14.0,,,,-21711.0,,1.0,2.0,1.0,3.0,1.0,"""a55475b1""","""a55475b1""","""a55475b1""",1.0,,,,,,,,,,,,,,,,,6.0,…,"""c7a5ad39""","""c7a5ad39""",1.0,35.0,0.0,0.0,12.0,12.0,0.0,0.0,2021.0,2014.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,2020,11
689314,201905,18,0,,,,-16022.0,,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019,13
955894,202002,58,0,,,,,,-9568.0,,0.0,0.0,0.0,4.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",5.0,,,,,,,,,,,,,,,,,1.0,…,"""c7a5ad39""","""c7a5ad39""",5.0,35.0,0.0,0.0,12.0,12.0,3306.221924,0.0,2021.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.0,47.91626,0.0,0.0,0.0,158421.796875,0.0,,,,,,,,,,,,,,2020,11
1911603,202008,86,0,,,,,897859.8125,-21488.0,,1.0,1.0,1.0,8.0,1.0,"""2fc785b2""","""a55475b1""","""a55475b1""",10.0,,,,,,,,,,,,,,,,,8.0,…,"""c7a5ad39""","""c7a5ad39""",8.0,35.0,0.0,24.0,12.0,12.0,0.0,3095.506104,2021.0,2021.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.57265,0.0,58.508923,0.0,10.160625,0.0,164950.53125,,,,,,,,,,,,,,2020,31
1925790,202009,88,0,,,,,242773.984375,-11882.0,,1.0,2.0,0.0,3.0,0.0,"""2fc785b2""","""6b2ae0fa""","""a55475b1""",3.0,,,,,,,,,,,,,,,,,2.0,…,"""c7a5ad39""","""c7a5ad39""",5.0,35.0,1.0,59.0,12.0,12.0,750.194031,10472.0,2021.0,2021.0,"""ab3c25cf""","""ab3c25cf""",0.017544,10.83721,13.161299,2661.704834,0.017544,236.537888,9873.527344,8445391.0,,,,,,,,,,,,,,2020,11
1470884,201908,30,0,,,,-22979.0,,-22979.0,,2.0,2.0,2.0,6.0,2.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",6.0,,,,,,,,,,,,,,,,,3.0,…,"""c7a5ad39""","""c7a5ad39""",9.0,35.0,0.0,27.0,12.0,12.0,0.0,250.680008,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.571429,0.0,7.162286,0.0,12.683229,0.0,1769.421021,,,,,,,,,,,,,,2019,1
2582487,201906,22,0,-604.0,,,-21802.0,,-21802.0,,0.0,0.0,0.0,2.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,,,,,,,,,,,,,,,,,5.0,…,"""c7a5ad39""","""c7a5ad39""",5.0,35.0,0.0,19.0,12.0,12.0,0.0,4690.399902,2020.0,2019.0,"""ab3c25cf""","""ab3c25cf""",0.0,1.373626,0.0,137.630768,0.0,18.703297,0.0,576478.75,,,,,,,,,,,,,,2019,10
1765805,202001,56,0,,,,,,-14637.0,,1.0,2.0,0.0,2.0,1.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",1.0,,,,,,,,,,,,,,,,,1.0,…,"""c7a5ad39""","""c7a5ad39""",5.0,35.0,0.0,0.0,12.0,12.0,0.0,0.0,2021.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,2020,28


In [8]:
data_store: dict = {
    "df_base": SchemaGen.scan_files(TEST_DIR / "test_base.parquet"),
    "depth_0": [
        SchemaGen.scan_files(TEST_DIR / "test_static_cb_0.parquet"),
        SchemaGen.scan_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        SchemaGen.scan_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_c_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_other_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_person_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_deposit_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_a_2_*.parquet", 2),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
    ],
}

df_test: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(transform_cols)
    .pipe(handle_dates)
    .pipe(Utility.reduce_memory_usage, "df_test")
)
    

del data_store
gc.collect()

print(f"Test data shape: {df_test.shape}")

File test_base loaded into memory.
File test_static_cb_0 loaded into memory.
File test_static_0_0 loaded into memory.
File test_static_0_2 loaded into memory.
File test_static_0_1 loaded into memory.
File test_applprev_1_2 loaded into memory.
File test_applprev_1_0 loaded into memory.
File test_applprev_1_1 loaded into memory.
File test_tax_registry_a_1 loaded into memory.
File test_tax_registry_b_1 loaded into memory.
File test_tax_registry_c_1 loaded into memory.
File test_credit_bureau_a_1_3 loaded into memory.
File test_credit_bureau_a_1_2 loaded into memory.
File test_credit_bureau_a_1_1 loaded into memory.
File test_credit_bureau_a_1_4 loaded into memory.
File test_credit_bureau_a_1_0 loaded into memory.
File test_credit_bureau_b_1 loaded into memory.
File test_other_1 loaded into memory.
File test_person_1 loaded into memory.
File test_deposit_1 loaded into memory.
File test_debitcard_1 loaded into memory.
File test_credit_bureau_a_2_3 loaded into memory.
File test_credit_bureau

In [9]:
df_train, cat_cols = Utility.to_pandas(df_train)
df_test, cat_cols = Utility.to_pandas(df_test)

In [10]:
class featureEng :
    
    @staticmethod
    def ordinal_encode(df):
        # Initialize the OrdinalEncoder
        ordinal_encoder = OrdinalEncoder()
    
        # Select columns of type object (categorical)
        categorical_columns = df.select_dtypes(include=['object']).columns
    
        # Fit and transform the categorical columns
        df[categorical_columns] = ordinal_encoder.fit_transform(df[categorical_columns])
    
        return df
    
    
    def normalization(df1):
        # Initialiser les scalers pour la normalisation et la standardisation
        scaler1 = MinMaxScaler()
        scaler2 = StandardScaler()
        df = df1.copy()
        # Initialiser une liste pour stocker les colonnes de type "objet"
        obj_cols = []
    
        # Parcourir les colonnes du DataFrame
        for col in df.columns:
            # Vérifier si la colonne n'est pas de type "objet"
            if df[col].dtype != 'object':
                # Effectuer le test de Shapiro-Wilk pour la normalité
                stat, p_value = stats.shapiro(df[col])
                # Si la p-value est supérieure à 0.05, normaliser la colonne
                if p_value > 0.05:
                    df[col] = scaler1.fit_transform(df[[col]])
                # Sinon, standardiser la colonne
                else:
                    df[col] = scaler2.fit_transform(df[[col]])
            else:
                # Stocker les colonnes de type "objet"
                obj_cols.append(col)
    
        # Concaténer les colonnes normalisées, standardisées et les colonnes de type "objet"
        df = pd.concat([df, df[obj_cols]], axis=1)
        df.columns = df1.columns
        del df1
        return df
    def data_imputation(df):
        #for col in df.columns:
            #if df[col].dtype == 'object':
        
        # Imputation
        my_imputer = SimpleImputer(missing_values = NaN)
        imputed_df = pd.DataFrame(my_imputer.fit_transform(df))
        #imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

        # Imputation removed column names; put them back
        imputed_df.columns = df.columns
        #imputed_X_valid.columns = X_valid.columns
        return df
    
    def reduce_mem_usage(df, verbose=True):
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        start_mem = df.memory_usage().sum() / 1024**2
        for col in df.columns:
            col_type = df[col].dtypes
            if col_type in numerics:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)

        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

        return df
    
    @staticmethod
    def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
        for col in df.columns:  
            if df[col].dtype.name in ['object', 'string']:
                df[col] = df[col].astype("string").astype('category')
                current_categories = df[col].cat.categories
                new_categories = current_categories.to_list() + ["Unknown"]
                new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
                df[col] = df[col].astype(new_dtype)
        return df

In [11]:
n_rows_percent = int(len(df_train) * 0.01)
df_train = df_train.iloc[:n_rows_percent]
gc.collect()

0

In [12]:
# Supprimer les lignes avec des valeurs nulles dans la colonne cible (target)
df_train = df_train.dropna(subset=['target'])

# Réinitialiser les index après la suppression des lignes
df_train = df_train.reset_index(drop=True)
X = df_train.drop(columns=["target", "case_id", "week_num"])
y = df_train["target"]
y = y.iloc[:n_rows_percent]
df_train = []
del df_train
gc.collect()

0

In [13]:
X = featureEng.ordinal_encode(X)
df_test = featureEng.ordinal_encode(df_test)
X.head()

Unnamed: 0,month,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,dateofbirth_342D,days120_123L,days180_256L,...,mean_pmts_date_1107D,mean_pmts_dpdvalue_108P,mean_pmts_pmtsoverdue_635A,var_pmts_date_1107D,var_pmts_dpdvalue_108P,var_pmts_pmtsoverdue_635A,riskassesment_302T_rng,riskassesment_302T_mean,year,day
0,201906,,,,-10613.0,,-10613.0,,1.0,1.0,...,,,,,,,,,2019,22
1,201912,,,,,,-10160.0,,2.0,2.0,...,,,,,,,,,2019,25
2,202003,,14.0,,,,-21711.0,,1.0,2.0,...,,,,,,,,,2020,11
3,201905,,,,-16022.0,,,,,,...,,,,,,,,,2019,13
4,202002,,,,,,-9568.0,,0.0,0.0,...,,,,,,,,,2020,11


In [14]:
X = X.fillna(X.mode().iloc[0])
df_test = df_test.fillna(df_test.mode().iloc[0])

In [15]:
X = featureEng.reduce_mem_usage(X)

Memory usage after optimization is: 30.09 MB
Decreased by 44.5%


In [16]:
df_test = df_test.drop(columns=["case_id", "week_num"])

In [17]:
X = featureEng.normalization(X)
df_test = featureEng.normalization(df_test)
X.head()

Unnamed: 0,month,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,dateofbirth_342D,days120_123L,days180_256L,...,mean_pmts_date_1107D,mean_pmts_dpdvalue_108P,mean_pmts_pmtsoverdue_635A,var_pmts_date_1107D,var_pmts_dpdvalue_108P,var_pmts_pmtsoverdue_635A,riskassesment_302T_rng,riskassesment_302T_mean,year,day
0,-0.685584,0.217651,0.034882,-0.181519,1.976562,-0.182862,1.150391,-0.1427,-0.235352,-0.421143,...,-0.113647,-0.032667,-0.032867,,-0.020254,-0.026031,-0.081848,-0.099976,-0.660499,0.695332
1,-0.552082,0.217651,0.034882,-0.181519,-0.65918,-0.182862,1.245117,-0.1427,0.259033,-0.068237,...,-0.113647,-0.032667,-0.032867,,-0.020254,-0.026031,-0.081848,-0.099976,-0.660499,1.039858
2,1.472696,0.217651,0.034882,-0.181519,-0.65918,-0.182862,-1.145508,-0.1427,-0.235352,-0.068237,...,-0.113647,-0.032667,-0.032867,,-0.020254,-0.026031,-0.081848,-0.099976,1.514006,-0.567928
3,-0.707834,0.217651,0.034882,-0.181519,1.003906,-0.182862,0.204956,-0.1427,-0.72998,-0.774414,...,-0.113647,-0.032667,-0.032867,,-0.020254,-0.026031,-0.081848,-0.099976,-0.660499,-0.338244
4,1.450446,0.217651,0.034882,-0.181519,-0.65918,-0.182862,1.367188,-0.1427,-0.72998,-0.774414,...,-0.113647,-0.032667,-0.032867,,-0.020254,-0.026031,-0.081848,-0.099976,1.514006,-0.567928


In [18]:
#from sklearn.ensemble import HistGradientBoostingClassifier

# Entraîner le modèle HistGradientBoostingClassifier
#hgb_classifier = HistGradientBoostingClassifier()
#hgb_classifier.fit(X, y)

# Faire des prédictions
#y_pred = hgb_classifier.predict(df_test)

In [19]:
from sklearn.experimental import enable_hist_gradient_boosting  # Activer l'expérimentation
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Définir les hyperparamètres à rechercher
param_grid = {
    'learning_rate': [0.05, 0.1, 0.2],
    'max_iter': [20, 30, 40],
    'max_leaf_nodes': [31, 63, 127],
    'max_depth': [3, 5, 7],
    'min_samples_leaf': [10, 20, 30],
    'l2_regularization': [0.0, 1.0, 10.0],
    'max_bins': [255, 510, 1020]
}

# Initialiser le modèle
hgb_classifier = HistGradientBoostingClassifier()

# Initialiser GridSearchCV
grid_search = GridSearchCV(estimator=hgb_classifier, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Exécuter la recherche des hyperparamètres
grid_search.fit(X, y)

# Afficher les meilleurs paramètres
print("Best parameters found: ", grid_search.best_params_)

# Utiliser le meilleur modèle pour faire des prédictions
best_model = grid_search.best_estimator_
y_pred = best_model.predict(df_test)

Fitting 5 folds for each of 2187 candidates, totalling 10935 fits
[CV] END l2_regularization=0.0, learning_rate=0.05, max_bins=255, max_depth=3, max_iter=20, max_leaf_nodes=31, min_samples_leaf=10; total time=   3.3s
[CV] END l2_regularization=0.0, learning_rate=0.05, max_bins=255, max_depth=3, max_iter=20, max_leaf_nodes=31, min_samples_leaf=20; total time=   3.2s
[CV] END l2_regularization=0.0, learning_rate=0.05, max_bins=255, max_depth=3, max_iter=20, max_leaf_nodes=31, min_samples_leaf=30; total time=   3.3s
[CV] END l2_regularization=0.0, learning_rate=0.05, max_bins=255, max_depth=3, max_iter=20, max_leaf_nodes=63, min_samples_leaf=10; total time=   3.2s
[CV] END l2_regularization=0.0, learning_rate=0.05, max_bins=255, max_depth=3, max_iter=20, max_leaf_nodes=63, min_samples_leaf=10; total time=   3.0s
[CV] END l2_regularization=0.0, learning_rate=0.05, max_bins=255, max_depth=3, max_iter=20, max_leaf_nodes=63, min_samples_leaf=20; total time=   3.0s
[CV] END l2_regularization=0

In [20]:
df_subm = pd.read_csv(ROOT / "sample_submission.csv")
df_subm = df_subm.set_index("case_id")

df_subm["score"] = y_pred

print("Check null: ", df_subm["score"].isnull().any())
df_subm.head()

Check null:  False


Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0
57549,0
57551,0
57552,0
57569,1


In [21]:
df_subm.to_csv("submission.csv")