In [None]:
import polars as pl
import polars.selectors as cs

In [None]:
# Root folder of the competition input files. The trailing slash matters:
# file paths are built by plain string concatenation (`dir + "name"`).
dir = '/kaggle/input/iatvoirotblat/'

# Components of the cut-off date read by the prepare_* filters.
year = 2023
month = 10
day = 1

In [None]:
import holidays


def calendar_preprocess(dir):
    """Build one row of calendar features per ``week``.

    Reads ``calendar_extended.csv`` from ``dir`` (expects at least a ``date``
    column of ``YYYY-MM-DD`` strings and a ``week`` column) and collapses the
    daily rows into weekly features.

    Parameters
    ----------
    dir : str
        Directory prefix, including the trailing separator.

    Returns
    -------
    pl.LazyFrame
        Columns ``week``, ``month``, ``year``, ``week_of_month``, ``quarter``,
        ``holidays_in_week`` and ``weekends_in_week``. ``month``, ``year``,
        ``week_of_month`` and ``quarter`` are the most frequent value among
        the days of the week; ties are resolved towards the smallest value.
    """
    # holidays.RU is the Russian public-holiday calendar.
    ru_holidays = holidays.RU(years=[2022, 2023, 2024, 2025, 2026])

    calendar = pl.scan_csv(dir + "calendar_extended.csv").with_columns(
        pl.col("date").str.strptime(pl.Date, format="%Y-%m-%d").alias("date_dt"),
        # The lookup runs on the raw date string; an explicit return dtype
        # keeps the lazy schema resolvable.
        pl.col("date")
        .map_elements(lambda x: int(x in ru_holidays), return_dtype=pl.Int64)
        .alias("is_holiday"),
    )

    calendar = calendar.with_columns(
        pl.col("week").cast(pl.Int16),
        # polars weekdays are 1 (Monday) .. 7 (Sunday).
        pl.col("date_dt").dt.weekday().is_in([6, 7]).cast(pl.Int8).alias("is_weekend"),
        pl.col("date_dt").dt.month().cast(pl.Int8).alias("month"),
        pl.col("date_dt").dt.year().alias("year"),
        # Days 1-7 -> 1, 8-14 -> 2, ...
        ((pl.col("date_dt").dt.day() - 1) // 7 + 1).cast(pl.Int8).alias("week_of_month"),
        pl.col("date_dt").dt.quarter().cast(pl.Int8).alias("quarter"),
    )

    # `mode()` may return several values in unspecified order when counts tie,
    # so `min()` (rather than `first()`) makes the chosen value deterministic.
    return calendar.group_by("week").agg(
        pl.col("month").mode().min().alias("month"),
        pl.col("year").mode().min().alias("year"),
        pl.col("week_of_month").mode().min().alias("week_of_month"),
        pl.col("quarter").mode().min().alias("quarter"),
        pl.col("is_holiday").sum().cast(pl.Int8).alias("holidays_in_week"),
        pl.col("is_weekend").sum().alias("weekends_in_week"),
    )


In [None]:
def prepare_train(dir, calendar):
    """Load the training targets, attach calendar features and apply the cut-off.

    Parameters
    ----------
    dir : str
        Directory prefix (with trailing separator) holding
        ``sample_submit_extended.csv`` and ``target_series (1).parquet``.
    calendar : pl.LazyFrame
        Per-week calendar features keyed by ``week``; must provide ``year``,
        ``month`` and ``week_of_month``.

    Returns
    -------
    pl.LazyFrame
        De-duplicated target rows for the ``inn_id`` values present in the
        submission file, restricted to weeks on or after the cut-off given by
        the module-level ``year``, ``month`` and ``day``.
    """
    inns = pl.read_csv(dir + "sample_submit_extended.csv", schema_overrides={"predict": pl.Float64})['inn_id'].unique()
    train = (
        pl.scan_parquet(dir + 'target_series (1).parquet')
        .with_columns(pl.col("week").cast(pl.Int16))
        .filter(pl.col('inn_id').is_in(inns))
        .join(calendar, on='week', how='left')
        .unique()
    )

    # Same week-of-month formula as the calendar: days 1-7 -> 1, 8-14 -> 2, ...
    start_week_of_month = (day - 1) // 7 + 1
    # Compare (year, month, week_of_month) lexicographically. Testing each
    # component independently would drop e.g. January-September of every year
    # after the cut-off year.
    on_or_after_cutoff = (
        (pl.col('year') > year)
        | ((pl.col('year') == year) & (pl.col('month') > month))
        | (
            (pl.col('year') == year)
            & (pl.col('month') == month)
            & (pl.col('week_of_month') >= start_week_of_month)
        )
    )
    return train.filter(on_or_after_cutoff)

In [None]:
def prepare_test(dir, calendar):
    """Return the submission rows enriched with per-week calendar features.

    Reads ``sample_submit_extended.csv`` from ``dir`` lazily, casts ``week`` to
    Int16 so it matches the calendar key, left-joins ``calendar`` on ``week``
    and removes duplicate rows.
    """
    submission = pl.scan_csv(
        dir + "sample_submit_extended.csv",
        schema_overrides={"predict": pl.Float64},
    )
    keyed = submission.with_columns(pl.col("week").cast(pl.Int16))
    return keyed.join(calendar, on='week', how='left').unique()

In [None]:
def prepare_val(dir, calendar):
    """Load the validation targets, attach calendar features and apply the cut-off.

    Parameters
    ----------
    dir : str
        Directory prefix (with trailing separator) holding
        ``sample_submit_extended.csv`` and ``target_series_extended.parquet``.
    calendar : pl.LazyFrame
        Per-week calendar features keyed by ``week``; must provide ``year``,
        ``month`` and ``week_of_month``.

    Returns
    -------
    pl.LazyFrame
        De-duplicated target rows for the ``inn_id`` values present in the
        submission file, restricted to weeks on or after the cut-off given by
        the module-level ``year``, ``month`` and ``day``.
    """
    inns = pl.read_csv(dir + "sample_submit_extended.csv", schema_overrides={"predict": pl.Float64})['inn_id'].unique()
    val = (
        pl.scan_parquet(dir + 'target_series_extended.parquet')
        .with_columns(pl.col("week").cast(pl.Int16))
        .filter(pl.col('inn_id').is_in(inns))
        .join(calendar, on='week', how='left')
        .unique()
    )

    # Same week-of-month formula as the calendar: days 1-7 -> 1, 8-14 -> 2, ...
    start_week_of_month = (day - 1) // 7 + 1
    # Compare (year, month, week_of_month) lexicographically. Testing each
    # component independently would drop e.g. January-September of every year
    # after the cut-off year.
    on_or_after_cutoff = (
        (pl.col('year') > year)
        | ((pl.col('year') == year) & (pl.col('month') > month))
        | (
            (pl.col('year') == year)
            & (pl.col('month') == month)
            & (pl.col('week_of_month') >= start_week_of_month)
        )
    )
    return val.filter(on_or_after_cutoff)

In [None]:
def prepare_all(dir, calendar):
    """Load both target files together, attach calendar features and apply the cut-off.

    Parameters
    ----------
    dir : str
        Directory prefix (with trailing separator) holding
        ``sample_submit_extended.csv``, ``target_series_extended.parquet``,
        ``target_series (1).parquet`` and ``innweekstartsum.parquet``.
    calendar : pl.LazyFrame
        Per-week calendar features keyed by ``week``; must provide ``year``,
        ``month`` and ``week_of_month``.

    Returns
    -------
    pl.LazyFrame
        De-duplicated rows of both target files for the ``inn_id`` values in
        the submission file, left-joined with ``innweekstartsum.parquet`` on
        ``(week, inn_id)`` and restricted to weeks on or after the cut-off
        given by the module-level ``year``, ``month`` and ``day``.
    """
    inns = pl.read_csv(dir + "sample_submit_extended.csv", schema_overrides={"predict": pl.Float64})['inn_id'].unique()
    targets = pl.concat([
        pl.scan_parquet(dir + 'target_series_extended.parquet'),
        pl.scan_parquet(dir + 'target_series (1).parquet'),
    ])
    full = (
        targets
        .with_columns(pl.col("week").cast(pl.Int16))
        .filter(pl.col('inn_id').is_in(inns))
        .join(calendar, on='week', how='left')
        # Rows present in both files collapse into one here.
        .unique()
    )
    # Presumably supplies the `cumsum` column seen in the displayed frame - TODO confirm.
    week_start_sums = pl.scan_parquet(dir + 'innweekstartsum.parquet').with_columns(pl.col("week").cast(pl.Int16))
    full = full.join(week_start_sums, on=['week', 'inn_id'], how='left')

    # Same week-of-month formula as the calendar: days 1-7 -> 1, 8-14 -> 2, ...
    start_week_of_month = (day - 1) // 7 + 1
    # Compare (year, month, week_of_month) lexicographically. Testing each
    # component independently would drop e.g. January-September of every year
    # after the cut-off year.
    on_or_after_cutoff = (
        (pl.col('year') > year)
        | ((pl.col('year') == year) & (pl.col('month') > month))
        | (
            (pl.col('year') == year)
            & (pl.col('month') == month)
            & (pl.col('week_of_month') >= start_week_of_month)
        )
    )
    return full.filter(on_or_after_cutoff)

In [None]:

import numpy as np

def pinf(dir, train):
    """Attach per-``inn_id`` profile features to ``train``.

    Reads ``profiles_extended.parquet`` from ``dir``, collapses the dated
    profile rows to one row per ``inn_id`` and left-joins the result onto
    ``train``.

    Parameters
    ----------
    dir : str
        Directory prefix (with trailing separator) of the profile file.
    train : pl.LazyFrame or pl.DataFrame
        Frame with an ``inn_id`` column.

    Returns
    -------
    Same kind of frame as ``train`` with the extra columns ``ipul``,
    ``id_region_f``, ``id_region_l``, ``id_region_u``, ``main_okved_group``,
    ``diff_datopen_report_date_flg``, ``diff_datopen_report_date_flg_min``
    and ``report_date_min``.
    """
    # Age-bucket label -> numeric value (looks like months - TODO confirm).
    age = {"1m": 1, "2_3m": 3, "3_6m": 6, "6_12m": 12, "1_2y": 24, "2_4y": 48, "4_8y": 96, "8_12y": 144, "more_12y": 192}
    ipul = {"ip": 0, "ul": 1}

    profiles = pl.scan_parquet(dir + "profiles_extended.parquet").with_columns(
        pl.col("report_date").str.to_date(),
        # `replace(..., default=...)` is deprecated in favour of
        # `replace_strict`; labels missing from `age` become NaN as before.
        # `np.nan` is used because the `np.NAN` alias no longer exists in NumPy 2.
        pl.col("diff_datopen_report_date_flg").replace_strict(age, default=np.nan, return_dtype=pl.Float64),
        pl.col("ipul").replace(ipul).cast(pl.Int8),
        pl.col("id_region").fill_null("-"),
        pl.col("main_okved_group").fill_null("-"),
    )

    # Sorting first makes `first()` / `last()` mean earliest / latest report.
    per_inn = profiles.sort("report_date").group_by("inn_id").agg(
        pl.col("ipul").first(),
        pl.col("id_region").first().alias("id_region_f"),
        pl.col("id_region").last().alias('id_region_l'),
        pl.col("id_region").n_unique().alias('id_region_u'),
        pl.col("main_okved_group").last(),
        pl.col("diff_datopen_report_date_flg").max(),
        pl.col("diff_datopen_report_date_flg").min().alias("diff_datopen_report_date_flg_min"),
        pl.col("report_date").min().alias("report_date_min"),
    )
    return train.join(per_inn, on='inn_id', how='left')

In [None]:
def groupper(out_vtb_full, groups, postfix, mode):
    """Aggregate transaction rows per ``groups`` into summary features.

    Parameters
    ----------
    out_vtb_full : pl.LazyFrame or pl.DataFrame
        Transaction rows with ``trns_count``, ``trns_amount``, ``date`` and
        ``trns_class_encoded`` columns plus the grouping columns.
    groups : list[str]
        Columns to group by.
    postfix : str
        Suffix appended to every produced column name.
    mode : str
        Prefix prepended to every produced column name (callers pass
        ``'out'`` or ``'in'``).

    Returns
    -------
    One row per group; feature columns are named ``{mode}_<stat>_{postfix}``.
    """
    amount = pl.col("trns_amount")
    count = pl.col("trns_count")
    trns_class = pl.col("trns_class_encoded")
    date = pl.col("date")

    def named(stem):
        # Column names are part of the downstream schema; keep them verbatim.
        return f"{mode}_{stem}_{postfix}"

    return out_vtb_full.group_by(groups).agg(
        count.sum().alias(named("vtb_trns_count")),
        amount.sum().alias(named("vtb_trns_amount")),
        # Amount per counted transaction (as opposed to the per-row mean below).
        (amount.sum() / count.sum()).alias(named("vtb_avg_amount")),

        amount.mean().alias(named("avg_amount")),
        amount.std().alias(named("std_amount")),
        amount.median().alias(named("median_amount")),
        amount.quantile(0.9).alias(named("p90_amount")),
        amount.quantile(0.75).alias(named("p75_amount")),
        # Historical name: "p5" is the 0.5 quantile, not the 5th percentile.
        amount.quantile(0.5).alias(named("p5_amount")),
        amount.quantile(0.25).alias(named("p25_amount")),
        amount.quantile(0.1).alias(named("p10_amount")),

        date.n_unique().alias(named("active_days")),

        trns_class.n_unique().alias(named("trns_types")),
        trns_class.mode().first().alias(named("common_trns_type")),

        (amount > 500_000).sum().alias(named("large_trns_count")),
        (amount < 2_000).sum().alias(named("small_trns_count")),

        trns_class.n_unique().alias(named("vtb_trns_class_encoded")),
        # UInt16 rather than UInt8: a strict cast to UInt8 raises as soon as a
        # group spans more than 255 distinct dates (possible for month or
        # quarter groups pooled over several years).
        date.n_unique().cast(pl.UInt16).alias(named("vtb_date")),
    )

In [None]:
def out_groupper(dir, train, calendar):
    """Join payer-side transaction aggregates onto ``train``.

    Transactions are read from ``{dir}transactions_?.parquet``, limited to rows
    whose ``doc_payer_inn`` appears in the submission file and whose
    ``doc_payer_bank_name_flag`` equals 1, and mapped to calendar features via
    ``calendar_extended.csv`` and ``calendar``. No date cut-off is applied to
    the transactions. For each of ``week_of_month``, ``month`` and ``quarter``
    the aggregates from ``groupper`` are computed per (payer, period) and per
    period overall, then left-joined onto ``train``.

    Parameters
    ----------
    dir : str
        Directory prefix (with trailing separator) of the input files.
    train : frame with ``inn_id``, ``week_of_month``, ``month``, ``quarter``.
    calendar : per-week calendar features keyed by ``week``.

    Returns
    -------
    ``train`` with the ``out_*`` feature columns appended.
    """
    inns = pl.read_csv(dir + "sample_submit_extended.csv", schema_overrides={"predict": pl.Float64})['inn_id'].unique()

    # Daily calendar: maps each transaction date to its week number.
    date_to_week = pl.scan_csv(dir + "calendar_extended.csv").with_columns(
        pl.col("week").cast(pl.Int16),
        pl.col("date").str.strptime(pl.Date, format="%Y-%m-%d"),
    )

    out_vtb_full = (
        pl.scan_parquet(f"{dir}transactions_?.parquet")
        .filter(pl.col("doc_payer_inn").is_in(inns))
        .filter(pl.col("doc_payer_bank_name_flag") == 1)
        .with_columns(pl.col("date").dt.date())
        .join(date_to_week, left_on=["date"], right_on=["date"], how="left")
        .join(calendar, left_on=["week"], right_on=["week"], how="left")
    )

    # The join order (week_of_month, month, quarter) fixes the column order of
    # the result, so it must not be changed.
    for period in ("week_of_month", "month", "quarter"):
        per_inn = groupper(out_vtb_full, ["doc_payer_inn", period], period, 'out')
        overall = groupper(out_vtb_full, [period], f"{period}_full", 'out')
        combined = per_inn.join(overall, on=[period], how='left')
        train = train.join(
            combined,
            left_on=['inn_id', period],
            right_on=['doc_payer_inn', period],
            how='left',
        )

    return train

In [None]:
def in_groupper(dir, train, calendar):
    """Join payee-side transaction aggregates onto ``train``.

    Transactions are read from ``{dir}transactions_?.parquet``, limited to rows
    whose ``doc_payee_inn`` appears in the submission file and whose
    ``doc_payee_bank_name_flag`` equals 1, and mapped to calendar features via
    ``calendar_extended.csv`` and ``calendar``. No date cut-off is applied to
    the transactions. For each of ``week_of_month``, ``month`` and ``quarter``
    the aggregates from ``groupper`` are computed per (payee, period) and per
    period overall, then left-joined onto ``train``.

    Parameters
    ----------
    dir : str
        Directory prefix (with trailing separator) of the input files.
    train : frame with ``inn_id``, ``week_of_month``, ``month``, ``quarter``.
    calendar : per-week calendar features keyed by ``week``.

    Returns
    -------
    ``train`` with the ``in_*`` feature columns appended.
    """
    inns = pl.read_csv(dir + "sample_submit_extended.csv", schema_overrides={"predict": pl.Float64})['inn_id'].unique()

    # Daily calendar: maps each transaction date to its week number.
    date_to_week = pl.scan_csv(dir + "calendar_extended.csv").with_columns(
        pl.col("week").cast(pl.Int16),
        pl.col("date").str.strptime(pl.Date, format="%Y-%m-%d"),
    )

    in_vtb_full = (
        pl.scan_parquet(f"{dir}transactions_?.parquet")
        .filter(pl.col("doc_payee_inn").is_in(inns))
        .filter(pl.col("doc_payee_bank_name_flag") == 1)
        .with_columns(pl.col("date").dt.date())
        .join(date_to_week, left_on=["date"], right_on=["date"], how="left")
        .join(calendar, left_on=["week"], right_on=["week"], how="left")
    )

    # The join order (week_of_month, month, quarter) fixes the column order of
    # the result, so it must not be changed.
    for period in ("week_of_month", "month", "quarter"):
        per_inn = groupper(in_vtb_full, ["doc_payee_inn", period], period, 'in')
        overall = groupper(in_vtb_full, [period], f"{period}_full", 'in')
        combined = per_inn.join(overall, on=[period], how='left')
        train = train.join(
            combined,
            left_on=['inn_id', period],
            right_on=['doc_payee_inn', period],
            how='left',
        )

    return train

In [None]:
def groupper2(train, groups, mode, postfix):
    """Aggregate the ``target`` and ``cumsum`` columns of ``train`` per ``groups``.

    Note the argument order differs from ``groupper``: here ``mode`` comes
    before ``postfix``.

    Parameters
    ----------
    train : pl.DataFrame or pl.LazyFrame
        Frame with ``target`` and ``cumsum`` columns plus the grouping columns.
    groups : list[str]
        Columns to group by.
    mode : str
        Prefix of every produced column name.
    postfix : str
        Suffix of every produced column name.

    Returns
    -------
    One row per group with columns ``{mode}_{source}_{stat}_{postfix}`` where
    ``source`` is ``target`` or ``cumsum`` and ``stat`` is one of ``avg``,
    ``std``, ``median``, ``p90``, ``p75``, ``p50``, ``p25``, ``p10``,
    ``large_count``, ``small_count``, ``large_share``, ``small_share``.
    """
    aggregations = []
    for source in ("target", "cumsum"):
        value = pl.col(source)
        # The source column is part of the name: polars rejects an aggregation
        # that emits the same output name twice.
        stem = f"{mode}_{source}"
        aggregations += [
            value.mean().alias(f"{stem}_avg_{postfix}"),
            value.std().alias(f"{stem}_std_{postfix}"),
            value.median().alias(f"{stem}_median_{postfix}"),
            value.quantile(0.9).alias(f"{stem}_p90_{postfix}"),
            value.quantile(0.75).alias(f"{stem}_p75_{postfix}"),
            value.quantile(0.5).alias(f"{stem}_p50_{postfix}"),
            value.quantile(0.25).alias(f"{stem}_p25_{postfix}"),
            value.quantile(0.1).alias(f"{stem}_p10_{postfix}"),
            # Counts and shares of rows above / below the size thresholds.
            (value > 500_000).sum().alias(f"{stem}_large_count_{postfix}"),
            (value < 2_000).sum().alias(f"{stem}_small_count_{postfix}"),
            (value > 500_000).mean().alias(f"{stem}_large_share_{postfix}"),
            (value < 2_000).mean().alias(f"{stem}_small_share_{postfix}"),
        ]
    # Aggregate the frame that was passed in, not a notebook-level global.
    return train.group_by(groups).agg(aggregations)

In [None]:
def target_cumsum_grouper(dir, train):
    """Join ``target`` / ``cumsum`` statistics per week-of-month onto ``train``.

    Statistics come from ``groupper2`` and are computed per
    (``inn_id``, ``week_of_month``) and per ``week_of_month`` overall.

    Parameters
    ----------
    dir : str
        Unused; kept so the signature stays unchanged.
    train : frame with ``inn_id``, ``week_of_month``, ``target`` and ``cumsum``.

    Returns
    -------
    ``train`` with the statistic columns appended.

    NOTE(review): the statistics are computed on the same rows they are joined
    to, so each row's features include its own target - confirm this is
    intended before using them for modelling.
    """
    period = "week_of_month"
    per_inn = groupper2(train, ["inn_id", period], 'hist', period)
    overall = groupper2(train, [period], 'hist', f"{period}_full")
    combined = per_inn.join(overall, on=[period], how='left')
    return train.join(combined, on=['inn_id', period], how='left')


In [None]:
def get_train(dir):
    """Assemble the training frame and return it as an in-memory DataFrame.

    Pipeline: calendar features -> ``prepare_train`` -> ``pinf`` ->
    ``out_groupper`` -> ``in_groupper``; ``report_date_min`` is dropped, the
    plan is collected and every column is shrunk to its smallest fitting dtype.
    """
    calendar = calendar_preprocess(dir).with_columns(pl.col("week").cast(pl.Int16))
    with_profiles = pinf(dir, prepare_train(dir, calendar))
    with_outgoing = out_groupper(dir, with_profiles, calendar)
    with_incoming = in_groupper(dir, with_outgoing, calendar)
    frame = with_incoming.drop('report_date_min').collect()
    # One expression over all columns replaces the per-column loop.
    return frame.with_columns(pl.all().shrink_dtype()).shrink_to_fit()

In [None]:
def get_val(dir):
    """Assemble the validation frame and return it as an in-memory DataFrame.

    Pipeline: calendar features -> ``prepare_val`` -> ``pinf`` ->
    ``out_groupper`` -> ``in_groupper``; ``report_date_min`` is dropped, the
    plan is collected and every column is shrunk to its smallest fitting dtype.
    """
    calendar = calendar_preprocess(dir).with_columns(pl.col("week").cast(pl.Int16))
    with_profiles = pinf(dir, prepare_val(dir, calendar))
    with_outgoing = out_groupper(dir, with_profiles, calendar)
    with_incoming = in_groupper(dir, with_outgoing, calendar)
    frame = with_incoming.drop('report_date_min').collect()
    # One expression over all columns replaces the per-column loop.
    return frame.with_columns(pl.all().shrink_dtype()).shrink_to_fit()

In [None]:
def get_test(dir):
    """Assemble the test (submission) frame and return it as an in-memory DataFrame.

    Pipeline: calendar features -> ``prepare_test`` -> ``pinf`` ->
    ``out_groupper`` -> ``in_groupper``; ``report_date_min`` is dropped, the
    plan is collected and every column is shrunk to its smallest fitting dtype.
    """
    calendar = calendar_preprocess(dir).with_columns(pl.col("week").cast(pl.Int16))
    with_profiles = pinf(dir, prepare_test(dir, calendar))
    with_outgoing = out_groupper(dir, with_profiles, calendar)
    with_incoming = in_groupper(dir, with_outgoing, calendar)
    frame = with_incoming.drop('report_date_min').collect()
    # One expression over all columns replaces the per-column loop.
    return frame.with_columns(pl.all().shrink_dtype()).shrink_to_fit()

In [None]:
def get_all(dir):
    """Assemble the combined target frame without transaction features.

    Pipeline: calendar features -> ``prepare_all`` -> ``pinf``;
    ``report_date_min`` is dropped and the plan is collected. Unlike the other
    ``get_*`` builders, the transaction aggregates and the dtype shrinking are
    not applied on this path.
    """
    calendar = calendar_preprocess(dir).with_columns(pl.col("week").cast(pl.Int16))
    with_profiles = pinf(dir, prepare_all(dir, calendar))
    return with_profiles.drop('report_date_min').collect()

In [None]:
# Repeats the configuration from the top of the notebook so the cells below
# can be run on their own.
dir = '/kaggle/input/iatvoirotblat/'  # trailing slash required: paths are concatenated
year = 2023
month = 10
day = 1

In [None]:
# Build the combined target frame (calendar + profile features, no transaction
# aggregates) and display it.
train = get_all(dir)
train

  pl.col("diff_datopen_report_date_flg").replace(age, default=np.NAN),


inn_id,week,target,month,year,week_of_month,quarter,holidays_in_week,weekends_in_week,cumsum,ipul,id_region_f,id_region_l,id_region_u,main_okved_group,diff_datopen_report_date_flg,diff_datopen_report_date_flg_min
str,i16,f64,i8,i32,i8,i8,i8,i64,f32,i8,str,str,u32,str,f64,f64
"""inn3991673""",73,1.2103e6,12,2023,3,4,0,2,1.12207432e8,1,"""40""","""40""",1,"""46""",192.0,192.0
"""inn3504856""",116,9.0018e6,10,2024,3,4,0,2,0.0,1,"""36""","""36""",1,"""10""",192.0,192.0
"""inn1675574""",116,1.2502e6,10,2024,3,4,0,2,8.0138944e7,1,"""40""","""40""",1,"""47""",192.0,192.0
"""inn3348229""",67,1.2196e6,11,2023,2,4,1,2,2.2323248e7,1,"""46""","""46""",1,"""22""",192.0,192.0
"""inn3995870""",74,220242.475586,12,2023,4,4,0,2,1.339508e7,1,"""20""","""20""",1,"""28""",24.0,3.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""inn2564856""",67,1.4735e7,11,2023,2,4,1,2,4.91811296e8,1,"""45""","""45""",1,"""58""",192.0,192.0
"""inn2346378""",74,439088.044434,12,2023,4,4,0,2,0.0,0,"""45""","""45""",1,"""74""",192.0,192.0
"""inn901113""",63,1.5958e6,10,2023,2,4,0,2,7.64368064e8,1,"""46""","""46""",1,"""47""",192.0,192.0
"""inn1388807""",62,344427.4021,10,2023,1,4,0,2,1.093919e7,0,"""52""","""52""",1,"""31""",192.0,192.0


In [None]:
# `groups`, `mode` and `postfix` were never defined in this notebook, so the
# call needs concrete arguments. The grouping below is an example choice -
# adjust as needed.
groupper2(train, ["inn_id", "week_of_month"], "hist", "week_of_month")

In [None]:
# Build the three splits with transaction features. `train` is rebound here and
# replaces the frame produced earlier by get_all().
train = get_train(dir)
test = get_test(dir)
val = get_val(dir)
# Stack the training and validation rows into a single frame.
train_val = pl.concat([train, val])

  pl.col("diff_datopen_report_date_flg").replace(age, default=np.NAN),
  pl.col("diff_datopen_report_date_flg").replace(age, default=np.NAN),
  pl.col("diff_datopen_report_date_flg").replace(age, default=np.NAN),


In [None]:
# List the column names of the training frame.
print(train.columns)

['inn_id', 'week', 'target', 'month', 'year', 'week_of_month', 'quarter', 'holidays_in_week', 'weekends_in_week', 'ipul', 'id_region_f', 'id_region_l', 'id_region_u', 'main_okved_group', 'diff_datopen_report_date_flg', 'diff_datopen_report_date_flg_min', 'out_vtb_trns_count_week_of_month', 'out_vtb_trns_amount_week_of_month', 'out_vtb_avg_amount_week_of_month', 'out_avg_amount_week_of_month', 'out_std_amount_week_of_month', 'out_median_amount_week_of_month', 'out_p90_amount_week_of_month', 'out_p75_amount_week_of_month', 'out_p5_amount_week_of_month', 'out_p25_amount_week_of_month', 'out_p10_amount_week_of_month', 'out_active_days_week_of_month', 'out_trns_types_week_of_month', 'out_common_trns_type_week_of_month', 'out_large_trns_count_week_of_month', 'out_small_trns_count_week_of_month', 'out_vtb_trns_class_encoded_week_of_month', 'out_vtb_date_week_of_month', 'out_vtb_trns_count_month', 'out_vtb_trns_amount_month', 'out_vtb_avg_amount_month', 'out_avg_amount_month', 'out_std_amount_

In [None]:
from catboost import CatBoostRegressor, Pool
# Columns passed to CatBoost as categorical features; each name is listed once.
cat_cols = ['inn_id', 'month', 'year', 'week_of_month', 'quarter', 'ipul', 'id_region_f', 'id_region_l', 'id_region_u', 'main_okved_group']

In [None]:
# Persist every split as Parquet; the relative paths resolve against the
# notebook's current working directory.
train.write_parquet('train.parquet')
test.write_parquet('test.parquet')
val.write_parquet('val.parquet')
train_val.write_parquet('train_val.parquet')

In [None]:
# CatBoost consumes pandas data here: split the training frame into the
# feature matrix and the label, then wrap both in a Pool.
train_df = train.to_pandas()
X_train = train_df.drop(columns=['target'])
y_train = train_df['target']
pool_train = Pool(
    data=X_train,
    label=y_train,
    cat_features=cat_cols,
)

In [None]:
# Same preparation for the validation split: feature matrix, label, Pool.
val_df = val.to_pandas()
X_val = val_df.drop(columns=['target'])
y_val = val_df['target']
pool_val = Pool(
    data=X_val,
    label=y_val,
    cat_features=cat_cols,
)

In [None]:
# Gradient-boosted regressor optimised and evaluated on RMSE, trained on GPU.
model = CatBoostRegressor(
    iterations=3000,
    learning_rate=0.01,
    loss_function='RMSE',
    eval_metric='RMSE',
    verbose=100,  # print a log line every 100 iterations
    task_type='GPU'
)

In [None]:
# Train on the training pool and track the metric on the validation pool.
# Early stopping is currently disabled (see the commented-out argument).
model.fit(
    pool_train,
    eval_set=pool_val,

    # early_stopping_rounds=100,
)

0:	learn: 10528731.1227668	test: 11239672.2276076	best: 11239672.2276076 (0)	total: 43.3ms	remaining: 2m 9s
100:	learn: 6673549.7936293	test: 7333505.4249604	best: 7333505.4249604 (100)	total: 4.13s	remaining: 1m 58s
200:	learn: 5326031.5326000	test: 6070290.9875108	best: 6070290.9875108 (200)	total: 8.31s	remaining: 1m 55s
300:	learn: 4765363.4777403	test: 5618224.3996920	best: 5618224.3996920 (300)	total: 12.6s	remaining: 1m 52s
400:	learn: 4452861.7784580	test: 5440589.0953420	best: 5440589.0953420 (400)	total: 16.9s	remaining: 1m 49s
500:	learn: 4262357.7742873	test: 5327549.8797680	best: 5327549.8797680 (500)	total: 21.3s	remaining: 1m 46s
600:	learn: 4125118.7920285	test: 5256481.1027630	best: 5256016.9017677 (596)	total: 25.7s	remaining: 1m 42s
700:	learn: 3994428.8857302	test: 5198260.9081082	best: 5198100.3753679 (699)	total: 30s	remaining: 1m 38s
800:	learn: 3886190.0928717	test: 5151921.3747288	best: 5151921.3747288 (800)	total: 34.4s	remaining: 1m 34s
900:	learn: 3803047.43

<catboost.core.CatBoostRegressor at 0x7aeb41b081c0>

In [None]:
# Per-feature importance scores of the fitted model.
# NOTE(review): the array is printed without feature names; presumably its
# order follows the columns of X_train - confirm before interpreting it.
feature_importance = model.get_feature_importance()
print("\nFeature importance:", feature_importance)


Feature importance: [4.85206865e-02 1.39966668e+00 1.63058799e-01 0.00000000e+00
 1.78185888e-01 0.00000000e+00 9.64081411e-02 0.00000000e+00
 2.94657610e-05 1.69688782e-01 2.80186923e-01 8.83589877e-02
 4.26485417e+00 3.50442739e-02 1.74366992e-02 1.53294518e+00
 3.88668571e+00 6.05382775e-01 7.12274234e-02 6.66052748e+00
 1.18472471e-01 5.23335632e-02 3.57841518e-02 1.70722729e-01
 2.55396688e-01 2.56657320e-01 3.09205632e-01 1.05897758e-01
 7.29936080e-02 6.49915644e+00 1.81148490e+00 1.13808353e-01
 4.74422846e-01 5.77793361e-02 5.76969965e+00 5.61516538e-01
 1.32443715e+00 1.79414506e+00 3.98618662e-02 1.10511512e-01
 1.87804930e-01 2.11247035e-01 1.32578211e-01 9.30184464e-02
 2.19352098e-01 1.93087199e-01 3.63348460e-03 1.77179887e+01
 1.27124843e+00 1.73533573e-01 1.94936254e-01 5.04470975e-03
 3.02876573e-03 9.63055302e-03 5.50470402e-04 5.40222616e-03
 4.75106748e-03 1.54719638e-03 1.91194712e-03 9.92101961e-03
 3.91881671e-02 1.08093517e-02 3.48993690e-03 5.78563717e-04
 0.

In [None]:
# mean_squared_log_error is the function actually called below; it was missing
# from the import, so this cell failed with NameError on a fresh kernel.
from sklearn.metrics import mean_squared_error, mean_squared_log_error

# Predict on the validation features.
y_pred = model.predict(X_val)

# mean_squared_log_error rejects negative values, so clip labels and
# predictions at zero first.
y_test_pos = np.maximum(0, y_val)
y_pred_pos = np.maximum(0, y_pred)

rmsle = np.sqrt(mean_squared_log_error(y_test_pos, y_pred_pos))
print(f"RMSLE: {rmsle:.4f}")

RMSLE: 3.4151
