In [1]:
import polars as pl
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score 

# Root directory of the parquet files. Every read below builds its path by
# concatenating a relative file name onto this prefix, so the trailing slash matters.
dataPath = "/kaggle/input/contest-data/parquet_files/"

In [2]:
def set_table_dtypes(df):
    """Cast columns to a dtype chosen from the column name.

    Rules, checked in this order for every column:
      * ``case_id``, ``WEEK_NUM``, ``num_group1``, ``num_group2`` -> ``Int64``
      * ``date_decision`` -> ``Date``
      * name ends in ``P`` or ``A`` -> ``Float64``
      * name ends in ``M`` -> ``String``
      * name ends in ``D`` -> ``Date``
    Columns matching none of the rules are left as they are.

    Args:
        df: polars DataFrame whose columns should be cast.

    Returns:
        The DataFrame with the matching columns cast.
    """
    key_columns = ("case_id", "WEEK_NUM", "num_group1", "num_group2")
    dtype_by_suffix = {
        "P": pl.Float64,
        "A": pl.Float64,
        "M": pl.String,
        "D": pl.Date,
    }

    casts = []
    for name in df.columns:
        if name in key_columns:
            target = pl.Int64
        elif name == "date_decision":
            target = pl.Date
        else:
            # No entry for the suffix means the column is not touched.
            target = dtype_by_suffix.get(name[-1])
        if target is not None:
            casts.append(pl.col(name).cast(target))

    # Each cast targets a different column, so they are applied in one call.
    return df.with_columns(casts) if casts else df

In [3]:
# Load the train base table, then split it into the label series and the remaining
# columns.
base_with_target = set_table_dtypes(pl.read_parquet(dataPath + "train_base_table.parquet"))

train_target = base_with_target.get_column("target")
train_basetable = base_with_target.drop("target")

# The combined frame is not used again.
del base_with_target

In [4]:
# static_0 is stored in two parquet parts; read both and stack them. The parts are
# created inside the list, so no per-part variable is left behind.
train_static_0 = pl.concat(
    [
        set_table_dtypes(pl.read_parquet(dataPath + "train/train_static_0_0.parquet")),
        set_table_dtypes(pl.read_parquet(dataPath + "train/train_static_0_1.parquet")),
    ],
    how="vertical_relaxed",
)

In [5]:
# static_cb_0 table, read from a single parquet file and cast by column-name rules.
train_static_cb_0 = set_table_dtypes(
    pl.read_parquet(dataPath + "train/train_static_cb_0.parquet")
)

In [6]:
# Base table for the test cases, cast with the same column-name rules as the train data.
test_basetable = set_table_dtypes(
    pl.read_parquet(dataPath + "test_base_table.parquet")
)

In [7]:
# Attach both static tables to each base table with left joins on case_id.
# NOTE(review): test is joined against the train_* static tables — presumably those
# files also contain the rows for the test case_ids; confirm.
train = (
    train_basetable
    .join(train_static_0, on="case_id", how="left")
    .join(train_static_cb_0, on="case_id", how="left")
)
test = (
    test_basetable
    .join(train_static_0, on="case_id", how="left")
    .join(train_static_cb_0, on="case_id", how="left")
)

# The inputs of the joins are not used again.
del train_static_0, train_static_cb_0
del train_basetable, test_basetable

In [8]:
def handle_dates(df):
    """Turn every column whose name ends in ``D`` into an offset from ``date_decision``.

    Each such column is replaced by ``(column - date_decision)`` cast to ``Int64``.
    ``date_decision`` itself is dropped from the result.

    Args:
        df: polars DataFrame containing a ``date_decision`` column.

    Returns:
        The DataFrame with the converted columns and without ``date_decision``.
    """
    decision = pl.col("date_decision")
    offsets = [
        (pl.col(name) - decision).cast(pl.Int64)
        for name in df.columns
        if name[-1] == "D"
    ]
    if offsets:
        df = df.with_columns(offsets)
    return df.drop("date_decision")

### Первый бейзлайн побит моделью, обученной только на данных глубины 0 (я шёл последовательно, затем удалил промежуточные ячейки). Теперь добавим агрегированные признаки из таблиц глубины 1 и 2.

In [9]:
def num_expr(df):
    """Build aggregation expressions for columns whose name ends in ``P`` or ``A``.

    For every such column four expressions are produced (min, max, last, mean), each
    aliased ``<stat>_<column>``. The list is grouped by statistic: all mins first,
    then all maxes, lasts and means.

    Args:
        df: object exposing ``columns`` (the frame that will be aggregated).

    Returns:
        List of polars expressions.
    """
    numeric_cols = [name for name in df.columns if name[-1] == "P" or name[-1] == "A"]
    aggregators = (
        ("min", pl.min),
        ("max", pl.max),
        ("last", pl.last),
        ("mean", pl.mean),
    )
    return [
        aggregate(name).alias(f"{stat}_{name}")
        for stat, aggregate in aggregators
        for name in numeric_cols
    ]
    
def date_expr(df):
    """Build aggregation expressions for columns whose name ends in ``D``.

    For every such column three expressions are produced (min, max, last), each
    aliased ``<stat>_<column>``. The list holds all mins first, then all maxes, then
    all lasts.

    Args:
        df: object exposing ``columns`` (the frame that will be aggregated).

    Returns:
        List of polars expressions.
    """
    date_cols = [name for name in df.columns if name[-1] == "D"]
    exprs = []
    for stat, aggregate in (("min", pl.min), ("max", pl.max), ("last", pl.last)):
        exprs.extend(aggregate(name).alias(f"{stat}_{name}") for name in date_cols)
    return exprs
    
def str_expr(df):
    """Build aggregation expressions for columns whose name ends in ``M``, ``T`` or ``L``.

    For every such column four expressions are produced (min, max, last, count), each
    aliased ``<stat>_<column>``. The list is grouped by statistic in that order.

    Args:
        df: object exposing ``columns`` (the frame that will be aggregated).

    Returns:
        List of polars expressions.
    """
    text_cols = [name for name in df.columns if name[-1] in {"M", "T", "L"}]
    # Insertion order of this mapping fixes the order of the returned groups.
    aggregators = {
        "min": pl.min,
        "max": pl.max,
        "last": pl.last,
        "count": pl.count,
    }
    return [
        aggregate(name).alias(f"{stat}_{name}")
        for stat, aggregate in aggregators.items()
        for name in text_cols
    ]
    
def count_expr(df):
    """Build aggregation expressions for columns whose name contains ``num_group``.

    For every such column two expressions are produced (last, count), each aliased
    ``<stat>_<column>``. All lasts come first, then all counts.

    Args:
        df: object exposing ``columns`` (the frame that will be aggregated).

    Returns:
        List of polars expressions.
    """
    group_cols = [name for name in df.columns if "num_group" in name]
    exprs = []
    for stat, aggregate in (("last", pl.last), ("count", pl.count)):
        for name in group_cols:
            exprs.append(aggregate(name).alias(f"{stat}_{name}"))
    return exprs
    
def get_exprs(df):
    """Collect every aggregation expression for ``df``.

    Concatenates, in this order, the outputs of ``num_expr``, ``date_expr``,
    ``str_expr`` and ``count_expr``.

    Args:
        df: frame whose columns decide which expressions are built.

    Returns:
        A single list of expressions, suitable for ``group_by(...).agg(...)``.
    """
    exprs = []
    for build in (num_expr, date_expr, str_expr, count_expr):
        exprs.extend(build(df))
    return exprs

In [10]:
# Running index of the aggregated tables joined so far. Each join cell below
# increments it and passes str(counter) as the `suffix` argument of its joins.
counter = 0

In [11]:
# applprev_1 is stored in two parquet parts: stack them, aggregate to one row per
# case_id, then left-join the aggregates onto train and test.
applprev_1_rows = pl.concat(
    [
        set_table_dtypes(pl.read_parquet(dataPath + "train/train_applprev_1_0.parquet")),
        set_table_dtypes(pl.read_parquet(dataPath + "train/train_applprev_1_1.parquet")),
    ],
    how="vertical_relaxed",
)
applprev_1_agg = applprev_1_rows.group_by("case_id").agg(get_exprs(applprev_1_rows))
del applprev_1_rows

counter += 1  # table index, passed as the join suffix

train = train.join(applprev_1_agg, on="case_id", how="left", suffix=str(counter))
test = test.join(applprev_1_agg, on="case_id", how="left", suffix=str(counter))
del applprev_1_agg

In [12]:
# other_1: aggregate to one row per case_id, then left-join the aggregates onto
# train and test.
other_1_rows = set_table_dtypes(pl.read_parquet(dataPath + "train/train_other_1.parquet"))
other_1_agg = other_1_rows.group_by("case_id").agg(get_exprs(other_1_rows))
del other_1_rows

counter += 1  # table index, passed as the join suffix

train = train.join(other_1_agg, on="case_id", how="left", suffix=str(counter))
test = test.join(other_1_agg, on="case_id", how="left", suffix=str(counter))
del other_1_agg

In [13]:
# tax_registry_a_1: aggregate to one row per case_id, then left-join the aggregates
# onto train and test.
tax_registry_a_1_rows = set_table_dtypes(
    pl.read_parquet(dataPath + "train/train_tax_registry_a_1.parquet")
)
tax_registry_a_1_agg = tax_registry_a_1_rows.group_by("case_id").agg(
    get_exprs(tax_registry_a_1_rows)
)
del tax_registry_a_1_rows

counter += 1  # table index, passed as the join suffix

train = train.join(tax_registry_a_1_agg, on="case_id", how="left", suffix=str(counter))
test = test.join(tax_registry_a_1_agg, on="case_id", how="left", suffix=str(counter))
del tax_registry_a_1_agg

In [14]:
# tax_registry_b_1: aggregate to one row per case_id, then left-join the aggregates
# onto train and test.
tax_registry_b_1_rows = set_table_dtypes(
    pl.read_parquet(dataPath + "train/train_tax_registry_b_1.parquet")
)
tax_registry_b_1_agg = tax_registry_b_1_rows.group_by("case_id").agg(
    get_exprs(tax_registry_b_1_rows)
)
del tax_registry_b_1_rows

counter += 1  # table index, passed as the join suffix

train = train.join(tax_registry_b_1_agg, on="case_id", how="left", suffix=str(counter))
test = test.join(tax_registry_b_1_agg, on="case_id", how="left", suffix=str(counter))
del tax_registry_b_1_agg

In [15]:
# tax_registry_c_1: aggregate to one row per case_id, then left-join the aggregates
# onto train and test.
tax_registry_c_1_rows = set_table_dtypes(
    pl.read_parquet(dataPath + "train/train_tax_registry_c_1.parquet")
)
tax_registry_c_1_agg = tax_registry_c_1_rows.group_by("case_id").agg(
    get_exprs(tax_registry_c_1_rows)
)
del tax_registry_c_1_rows

counter += 1  # table index, passed as the join suffix

train = train.join(tax_registry_c_1_agg, on="case_id", how="left", suffix=str(counter))
test = test.join(tax_registry_c_1_agg, on="case_id", how="left", suffix=str(counter))
del tax_registry_c_1_agg

In [16]:
# credit_bureau_a_1 is stored in four parquet parts: stack them, aggregate to one
# row per case_id, then left-join the aggregates onto train and test.
credit_bureau_a_1_rows = pl.concat(
    [
        set_table_dtypes(pl.read_parquet(dataPath + "train/train_credit_bureau_a_1_0.parquet")),
        set_table_dtypes(pl.read_parquet(dataPath + "train/train_credit_bureau_a_1_1.parquet")),
        set_table_dtypes(pl.read_parquet(dataPath + "train/train_credit_bureau_a_1_2.parquet")),
        set_table_dtypes(pl.read_parquet(dataPath + "train/train_credit_bureau_a_1_3.parquet")),
    ],
    how="vertical_relaxed",
)
credit_bureau_a_1_agg = credit_bureau_a_1_rows.group_by("case_id").agg(
    get_exprs(credit_bureau_a_1_rows)
)
del credit_bureau_a_1_rows

counter += 1  # table index, passed as the join suffix

train = train.join(credit_bureau_a_1_agg, on="case_id", how="left", suffix=str(counter))
test = test.join(credit_bureau_a_1_agg, on="case_id", how="left", suffix=str(counter))
del credit_bureau_a_1_agg

In [17]:
# credit_bureau_b_1: aggregate to one row per case_id, then left-join the aggregates
# onto train and test.
credit_bureau_b_1_rows = set_table_dtypes(
    pl.read_parquet(dataPath + "train/train_credit_bureau_b_1.parquet")
)
credit_bureau_b_1_agg = credit_bureau_b_1_rows.group_by("case_id").agg(
    get_exprs(credit_bureau_b_1_rows)
)
del credit_bureau_b_1_rows

counter += 1  # table index, passed as the join suffix

train = train.join(credit_bureau_b_1_agg, on="case_id", how="left", suffix=str(counter))
test = test.join(credit_bureau_b_1_agg, on="case_id", how="left", suffix=str(counter))
del credit_bureau_b_1_agg

In [18]:
# deposit_1: aggregate to one row per case_id, then left-join the aggregates onto
# train and test.
deposit_1_rows = set_table_dtypes(pl.read_parquet(dataPath + "train/train_deposit_1.parquet"))
deposit_1_agg = deposit_1_rows.group_by("case_id").agg(get_exprs(deposit_1_rows))
del deposit_1_rows

counter += 1  # table index, passed as the join suffix

train = train.join(deposit_1_agg, on="case_id", how="left", suffix=str(counter))
test = test.join(deposit_1_agg, on="case_id", how="left", suffix=str(counter))
del deposit_1_agg

In [19]:
# person_1: aggregate to one row per case_id, then left-join the aggregates onto
# train and test.
person_1_rows = set_table_dtypes(pl.read_parquet(dataPath + "train/train_person_1.parquet"))
person_1_agg = person_1_rows.group_by("case_id").agg(get_exprs(person_1_rows))
del person_1_rows

counter += 1  # table index, passed as the join suffix

train = train.join(person_1_agg, on="case_id", how="left", suffix=str(counter))
test = test.join(person_1_agg, on="case_id", how="left", suffix=str(counter))
del person_1_agg

In [20]:
# debitcard_1: aggregate to one row per case_id, then left-join the aggregates onto
# train and test.
debitcard_1_rows = set_table_dtypes(pl.read_parquet(dataPath + "train/train_debitcard_1.parquet"))
debitcard_1_agg = debitcard_1_rows.group_by("case_id").agg(get_exprs(debitcard_1_rows))
del debitcard_1_rows

counter += 1  # table index, passed as the join suffix

train = train.join(debitcard_1_agg, on="case_id", how="left", suffix=str(counter))
test = test.join(debitcard_1_agg, on="case_id", how="left", suffix=str(counter))
del debitcard_1_agg

In [21]:
# applprev_2: aggregate to one row per case_id, then left-join the aggregates onto
# train and test.
applprev_2_rows = set_table_dtypes(pl.read_parquet(dataPath + "train/train_applprev_2.parquet"))
applprev_2_agg = applprev_2_rows.group_by("case_id").agg(get_exprs(applprev_2_rows))
del applprev_2_rows

counter += 1  # table index, passed as the join suffix

train = train.join(applprev_2_agg, on="case_id", how="left", suffix=str(counter))
test = test.join(applprev_2_agg, on="case_id", how="left", suffix=str(counter))
del applprev_2_agg

In [22]:
# person_2: aggregate to one row per case_id, then left-join the aggregates onto
# train and test.
person_2_rows = set_table_dtypes(pl.read_parquet(dataPath + "train/train_person_2.parquet"))
person_2_agg = person_2_rows.group_by("case_id").agg(get_exprs(person_2_rows))
del person_2_rows

counter += 1  # table index, passed as the join suffix

train = train.join(person_2_agg, on="case_id", how="left", suffix=str(counter))
test = test.join(person_2_agg, on="case_id", how="left", suffix=str(counter))
del person_2_agg

In [None]:
# credit_bureau_a_2 is stored in 11 parquet parts (file suffixes _0 .. _10). Each
# part is read and cast inside the comprehension, so no per-part variable exists
# that would need deleting afterwards.
credit_bureau_a_2_rows = pl.concat(
    [
        pl.read_parquet(
            dataPath + f"train/train_credit_bureau_a_2_{part}.parquet"
        ).pipe(set_table_dtypes)
        for part in range(11)
    ],
    how="vertical_relaxed",
)

# Aggregate to one row per case_id; the stacked rows are not used again.
credit_bureau_a_2_agg = credit_bureau_a_2_rows.group_by("case_id").agg(
    get_exprs(credit_bureau_a_2_rows)
)
del credit_bureau_a_2_rows

counter += 1  # table index, passed as the join suffix

train = train.join(credit_bureau_a_2_agg, on="case_id", how="left", suffix=str(counter))
test = test.join(credit_bureau_a_2_agg, on="case_id", how="left", suffix=str(counter))
del credit_bureau_a_2_agg

In [None]:
# credit_bureau_b_2: aggregate to one row per case_id, then left-join the aggregates
# onto train and test.
credit_bureau_b_2_rows = set_table_dtypes(
    pl.read_parquet(dataPath + "train/train_credit_bureau_b_2.parquet")
)
credit_bureau_b_2_agg = credit_bureau_b_2_rows.group_by("case_id").agg(
    get_exprs(credit_bureau_b_2_rows)
)
del credit_bureau_b_2_rows

counter += 1  # table index, passed as the join suffix

train = train.join(credit_bureau_b_2_agg, on="case_id", how="left", suffix=str(counter))
test = test.join(credit_bureau_b_2_agg, on="case_id", how="left", suffix=str(counter))
del credit_bureau_b_2_agg

In [None]:
# In both frames, replace columns ending in "D" by their integer offset from
# date_decision and drop date_decision (see handle_dates).
train = train.pipe(handle_dates)
test = test.pipe(handle_dates)

In [None]:
# Hold out 25% of the rows for validation; random_state fixes the shuffle.
X_tr, X_val, y_tr, y_val = train_test_split(
    train,
    train_target,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Only the split frames are used from here on.
del train

In [None]:
# Convert the four split objects to pandas one at a time, dropping each polars
# original right after its conversion.
X_tr_pd = X_tr.to_pandas()
del X_tr

X_val_pd = X_val.to_pandas()
del X_val

y_tr_pd = y_tr.to_pandas()
del y_tr

y_val_pd = y_val.to_pandas()
del y_val

In [None]:
# Display the distinct dtypes present in the training features; the next cell
# converts the 'object' and 'uint32' ones.
X_tr_pd.dtypes.unique()

In [None]:
# 'object' columns become pandas 'category' in both splits. Columns are picked from
# the dtype mask directly, without slicing the data.
for col in X_tr_pd.columns[(X_tr_pd.dtypes == 'object').to_numpy()]:
    X_tr_pd[col] = X_tr_pd[col].astype('category')
for col in X_val_pd.columns[(X_val_pd.dtypes == 'object').to_numpy()]:
    X_val_pd[col] = X_val_pd[col].astype('category')

# 'uint32' columns are widened to 'int64' in both splits.
for col in X_tr_pd.columns[(X_tr_pd.dtypes == 'uint32').to_numpy()]:
    X_tr_pd[col] = X_tr_pd[col].astype('int64')
for col in X_val_pd.columns[(X_val_pd.dtypes == 'uint32').to_numpy()]:
    X_val_pd[col] = X_val_pd[col].astype('int64')

In [None]:
# Booster settings: binary objective, AUC as the evaluation metric.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'eta': 0.05,
    #'lambda': 1e-3,

    'seed': 911,
    'num_threads': 32,
    'verbosity': 1
}

# Wrap both splits as LightGBM datasets and drop the notebook-level names.
# NOTE(review): with free_raw_data=False the Dataset objects presumably keep the
# pandas frames referenced, so these dels may not release their memory — confirm.
lgb_tr = lgb.Dataset(X_tr_pd, label=y_tr_pd, free_raw_data=False)
del X_tr_pd, y_tr_pd

lgb_val = lgb.Dataset(X_val_pd, label=y_val_pd, free_raw_data=False)
del X_val_pd, y_val_pd

# At most 500 boosting rounds, with early stopping (stopping_rounds=10) evaluated on
# the validation set.
model = lgb.train(
    params=params,
    train_set=lgb_tr,
    num_boost_round=500,
    valid_sets=[lgb_val],
    valid_names=['валидация'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10, min_delta=0.),
        # Log the metric every 20 rounds so the values early_stopping works with are visible.
        lgb.log_evaluation(period=20),
    ],
)

In [None]:
# Convert the test features to pandas and drop the polars original.
test_pd = test.to_pandas()
del test

In [None]:
# Same dtype fix-ups as for the train/validation frames: 'object' -> 'category',
# then 'uint32' -> 'int64'.
for col in test_pd.columns[(test_pd.dtypes == 'object').to_numpy()]:
    test_pd[col] = test_pd[col].astype('category')
for col in test_pd.columns[(test_pd.dtypes == 'uint32').to_numpy()]:
    test_pd[col] = test_pd[col].astype('int64')

In [None]:
# Score the test rows using model.best_iteration, then write one score per case_id.
y_submission_pred = model.predict(test_pd, num_iteration=model.best_iteration)

# case_id is the index, so it is written as the first CSV column.
submission = pd.DataFrame(
    {"score": y_submission_pred},
    index=pd.Index(test_pd["case_id"].to_numpy(), name="case_id"),
)
submission.to_csv("./submission_late_1.csv")