# ДАННЫЕ

In [None]:
import polars as pl
import polars.selectors as cs
import numpy as np

In [None]:
import holidays


def calendar_preprocess(dir):
    """Build one row of calendar features per ``week`` from the daily calendar.

    Reads ``calendar_extended.csv`` from ``dir`` (``dir`` is concatenated with
    the file name, so it must end with a path separator) and returns a lazy
    frame with the columns, in this order:

    ``week`` (Int16), ``month`` (Int8), ``year``, ``week_of_month`` (Int8),
    ``quarter`` (Int8), ``holidays_in_week`` (Int8), ``weekends_in_week``,
    ``month_week`` (string ``"<month>-<week_of_month>"``).

    ``month``, ``year``, ``week_of_month`` and ``quarter`` are the most
    frequent daily value inside the week.  When several values are equally
    frequent the largest one is taken, so the result is reproducible between
    runs (``mode()`` alone does not promise an order for tied values).

    Parameters
    ----------
    dir : str
        Folder prefix that contains ``calendar_extended.csv``.

    Returns
    -------
    pl.LazyFrame
        Weekly calendar features, one row per distinct ``week``.
    """
    # Russian public holidays for the years listed here.
    ru_holidays = holidays.RU(years=[2022, 2023, 2024, 2025, 2026])

    date_dt = pl.col("date_dt")

    daily = (
        pl.scan_csv(dir + "calendar_extended.csv")
        .with_columns(
            pl.col("date").str.strptime(pl.Date, format="%Y-%m-%d").alias("date_dt"),
            # The raw date string is looked up in the holiday calendar; the
            # explicit return dtype avoids polars' dtype-inference warning.
            pl.col("date")
            .map_elements(lambda x: int(x in ru_holidays), return_dtype=pl.Int64)
            .alias("is_holiday"),
        )
        .with_columns(
            # polars weekdays are ISO numbered: 6 = Saturday, 7 = Sunday.
            date_dt.dt.weekday().is_in([6, 7]).cast(pl.Int8).alias("is_weekend"),
            date_dt.dt.month().cast(pl.Int8).alias("month"),
            date_dt.dt.year().alias("year"),
            # Days 1-7 -> 1, 8-14 -> 2, ... within the calendar month.
            ((date_dt.dt.day() - 1) // 7 + 1).cast(pl.Int8).alias("week_of_month"),
            date_dt.dt.quarter().cast(pl.Int8).alias("quarter"),
            pl.col("week").cast(pl.Int16),
        )
    )

    weekly = daily.group_by("week").agg(
        # max() makes the choice between tied modes deterministic.
        pl.col("month").mode().max().alias("month"),
        pl.col("year").mode().max().alias("year"),
        pl.col("week_of_month").mode().max().alias("week_of_month"),
        pl.col("quarter").mode().max().alias("quarter"),
        pl.col("is_holiday").sum().cast(pl.Int8).alias("holidays_in_week"),
        pl.col("is_weekend").sum().alias("weekends_in_week"),
    )

    return weekly.with_columns(
        pl.concat_str(
            pl.col("month"), pl.col("week_of_month"), separator="-"
        ).alias("month_week")
    )


In [None]:
# calendar_preprocess(dir).collect().filter(pl.col('week') >= 118).sort(['year', 'month', 'week_of_month'])

In [None]:
# calendar_preprocess(dir).collect().filter(pl.col('week') == 106).sort(['year', 'month', 'week_of_month'])

In [None]:
def out_preprocessor(dir):
    """Aggregate payer-side transaction statistics per ``doc_payer_inn``.

    Only transactions whose payer INN occurs in ``sample_submit_extended.csv``
    and whose ``doc_payer_bank_name_flag`` equals 1 are kept.  Returns a lazy
    frame with one row per ``doc_payer_inn`` and the ``out_vtb_*`` /
    ``outl_vtb_*`` aggregate columns.

    ``dir`` is concatenated with the file names, so it must end with a path
    separator.
    """
    submit = pl.read_csv(
        dir + "sample_submit_extended.csv",
        schema_overrides={"predict": pl.Float64},
    )
    inns = submit['inn_id'].unique()

    calendar_days = pl.scan_csv(dir + "calendar_extended.csv").with_columns(
        pl.col('date').str.strptime(pl.Date, format="%Y-%m-%d").alias("date_dt")
    )

    payer_rows = (
        pl.scan_parquet(f"{dir}transactions_?.parquet")
        .filter(pl.col("doc_payer_inn").is_in(inns))
        .filter(pl.col("doc_payer_bank_name_flag") == 1)
        .with_columns(pl.col("date").dt.date().alias("day"))
    )

    # NOTE(review): none of the aggregations below appears to read a column
    # contributed by the calendar -- confirm whether this join is still needed.
    payer_rows = payer_rows.join(
        calendar_days,
        left_on=["day"],
        right_on=["date_dt"],
        how="left",
    )

    aggregations = [
        pl.col("trns_count").sum().alias("out_vtb_trns_count_w"),
        pl.col("date").n_unique().alias("out_vtb_date_w"),
        pl.col("doc_payee_inn").n_unique().alias("out_vtb_cinn_w"),
        pl.col("trns_class_encoded").n_unique().alias("out_vtb_trns_class_encoded_w"),
        pl.col("doc_payer_bank_name_encoded").n_unique().alias("out_vtb_doc_payer_bank_name_encoded_w"),
        # NOTE(review): last() depends on the row order reaching the
        # aggregation, and no sort is applied before it.
        pl.col("trns_count").last().alias("outl_vtb_trns_count_w"),
    ]
    return payer_rows.group_by("doc_payer_inn").agg(aggregations)


In [None]:
def in_preprocessor(dir):
    """Aggregate payee-side transaction statistics per ``doc_payee_inn``.

    Only transactions whose payee INN occurs in ``sample_submit_extended.csv``
    and whose ``doc_payee_bank_name_flag`` equals 1 are kept.  Returns a lazy
    frame with one row per ``doc_payee_inn`` and the ``in_vtb_*`` /
    ``inl_vtb_*`` aggregate columns.

    ``dir`` is concatenated with the file names, so it must end with a path
    separator.
    """
    submit = pl.read_csv(
        dir + "sample_submit_extended.csv",
        schema_overrides={"predict": pl.Float64},
    )
    inns = submit['inn_id'].unique()

    is_known_payee = pl.col("doc_payee_inn").is_in(inns)
    is_flagged_bank = pl.col("doc_payee_bank_name_flag") == 1

    transactions = pl.scan_parquet(f"{dir}transactions_?.parquet")
    payee_rows = transactions.filter(is_known_payee).filter(is_flagged_bank)
    payee_rows = payee_rows.with_columns(pl.col("date").dt.date().alias("day"))

    calendar_days = pl.scan_csv(dir + "calendar_extended.csv").with_columns(
        pl.col('date').str.strptime(pl.Date, format="%Y-%m-%d").alias("date_dt")
    )
    # NOTE(review): none of the aggregations below appears to read a column
    # contributed by the calendar -- confirm whether this join is still needed.
    payee_rows = payee_rows.join(
        calendar_days, left_on=["day"], right_on=["date_dt"], how="left"
    )

    aggregations = [
        pl.col("trns_count").sum().alias("in_vtb_trns_count_w"),
        pl.col("date").n_unique().alias("in_vtb_date_w"),
        pl.col("doc_payer_inn").n_unique().alias("in_vtb_cinn_w"),
        pl.col("trns_class_encoded").n_unique().alias("in_vtb_trns_class_encoded_w"),
        pl.col("doc_payee_bank_name_encoded").n_unique().alias("in_vtb_doc_payee_bank_name_encoded_w"),
        # NOTE(review): last() depends on the row order reaching the
        # aggregation, and no sort is applied before it.
        pl.col("trns_count").last().alias("inl_vtb_trns_count_w"),
    ]
    return payee_rows.group_by("doc_payee_inn").agg(aggregations)


In [None]:
def prepare_all(dir, calendar):
    """Assemble the combined history + submission panel with calendar features.

    The two target-series parquet files and the sample submission are all read
    from ``dir`` (previously the parquet paths were hard-coded to one absolute
    folder and silently ignored ``dir``).  ``dir`` is concatenated with the
    file names, so it must end with a path separator.

    Steps:

    1. Stack both target-series files, cast ``week`` to Int16 and keep only the
       ``inn_id`` values present in ``sample_submit_extended.csv``.
    2. Left-join the weekly ``calendar`` on ``week`` and drop duplicate rows.
    3. Load the sample submission, rename ``predict`` to ``target``, join the
       same calendar and drop duplicate rows.
    4. Concatenate history and submission rows, sort by ``week`` and replace
       ``target`` with ``log1p(target)``.

    Parameters
    ----------
    dir : str
        Folder prefix containing the input files.
    calendar : pl.LazyFrame
        Weekly calendar features keyed by ``week``.

    Returns
    -------
    pl.LazyFrame
        The stacked panel; ``target`` is on the log1p scale, including the
        placeholder values that come from the submission file.
    """
    submit_path = dir + "sample_submit_extended.csv"
    submit_schema = {"predict": pl.Float64}

    inns = pl.read_csv(submit_path, schema_overrides=submit_schema)['inn_id'].unique()

    history = (
        pl.concat([
            pl.scan_parquet(dir + "target_series_extended.parquet"),
            pl.scan_parquet(dir + "target_series (1).parquet"),
        ])
        .with_columns(pl.col("week").cast(pl.Int16))
        .filter(pl.col("inn_id").is_in(inns))
        .join(calendar, on="week", how="left")
        .unique()
    )

    test = (
        pl.scan_csv(submit_path, schema_overrides=submit_schema)
        .with_columns(pl.col("week").cast(pl.Int16))
        .rename({"predict": "target"})
        .join(calendar, on="week", how="left")
        .unique()
    )

    full = pl.concat([history, test]).sort("week")
    return full.with_columns(pl.col("target").log1p())

In [None]:
def pinf(dir, train):
    """Attach per-INN profile features to ``train``.

    Reads ``profiles_extended.parquet`` from ``dir`` (``dir`` is concatenated
    with the file name), orders the profile rows by ``report_date`` and
    collapses them to one row per ``inn_id``:

    * ``ipul`` -- first value, with ``"ip"`` -> 0 and ``"ul"`` -> 1, as Int8
    * ``id_region_f`` / ``id_region_l`` / ``id_region_u`` -- first, last and
      number of distinct regions (nulls are replaced by ``"-"`` beforehand)
    * ``main_okved_group`` -- last value (nulls replaced by ``"-"``)
    * ``diff_datopen_report_date_flg`` / ``..._min`` -- max and min of the
      bucket label mapped through ``age``; labels missing from ``age``
      become NaN
    * ``report_date_min`` -- earliest ``report_date``

    The result is left-joined onto ``train`` by ``inn_id``.

    Parameters
    ----------
    dir : str
        Folder prefix containing ``profiles_extended.parquet``.
    train : pl.LazyFrame
        Frame with an ``inn_id`` column.

    Returns
    -------
    pl.LazyFrame
        ``train`` with the profile columns appended.
    """
    age = {"1m": 1, "2_3m": 3, "3_6m": 6, "6_12m": 12, "1_2y": 24,
           "2_4y": 48, "4_8y": 96, "8_12y": 144, "more_12y": 192}
    ipul = {"ip": 0, "ul": 1}

    open_age = pl.col("diff_datopen_report_date_flg")

    profiles = (
        pl.scan_parquet(dir + "profiles_extended.parquet")
        .with_columns(
            pl.col("report_date").str.to_date(),
            # `replace(..., default=...)` is deprecated in polars;
            # `replace_strict` is the supported way to map with a fallback.
            open_age.replace_strict(age, default=np.nan),
            pl.col("ipul").replace(ipul).cast(pl.Int8),
            pl.col("id_region").fill_null("-"),
            pl.col("main_okved_group").fill_null("-"),
        )
        # first()/last() below are taken in report_date order.
        .sort("report_date")
        .group_by("inn_id")
        .agg(
            pl.col("ipul").first(),
            pl.col("id_region").first().alias("id_region_f"),
            pl.col("id_region").last().alias("id_region_l"),
            pl.col("id_region").n_unique().alias("id_region_u"),
            pl.col("main_okved_group").last(),
            open_age.max(),
            open_age.min().alias("diff_datopen_report_date_flg_min"),
            # Computed in the same pass; no second group_by + join required.
            pl.col("report_date").min().alias("report_date_min"),
        )
    )

    return train.join(profiles, on="inn_id", how="left")

In [None]:
def groupper2(train, groups, mode, postfix, *, large_threshold=500_000,
              small_threshold=2_000, target_is_log1p=True):
    """Aggregate summary statistics of ``target`` for every ``groups`` key.

    Output columns are named ``{mode}_<stat>_{postfix}`` in this order:
    ``avg_amount``, ``median_amount``, ``p90_amount``, ``p75_amount``,
    ``p5_amount``, ``p25_amount``, ``p10_amount``, ``large_trns_count``,
    ``small_trns_count``, ``large_trns_p``, ``small_trns_p``.

    The thresholds are expressed as raw amounts.  ``prepare_all`` stores
    ``target`` on the log1p scale, where a comparison against the raw numbers
    can never mark a row as large and marks every row as small, which makes
    the four threshold features constant.  With ``target_is_log1p=True`` (the
    default) the thresholds are therefore moved onto the log1p scale before
    comparing; log1p is monotonic, so this equals comparing raw amounts.

    Parameters
    ----------
    train : pl.LazyFrame or pl.DataFrame
        Frame containing ``target`` and the grouping columns.
    groups : str or list[str]
        Grouping key(s).
    mode, postfix : str
        Prefix and suffix used in the output column names.
    large_threshold, small_threshold : float, keyword-only
        Raw-scale cut-offs for the "large" (strictly greater) and "small"
        (strictly less) flags.
    target_is_log1p : bool, keyword-only
        Pass ``False`` when ``target`` holds raw amounts.

    Returns
    -------
    Same frame type as ``train``
        One row per group with the statistics listed above.
    """
    target = pl.col("target")

    if target_is_log1p:
        large_cut = float(np.log1p(large_threshold))
        small_cut = float(np.log1p(small_threshold))
    else:
        large_cut, small_cut = large_threshold, small_threshold

    is_large = target > large_cut
    is_small = target < small_cut

    return train.group_by(groups).agg(
        target.mean().alias(f"{mode}_avg_amount_{postfix}"),
        target.median().alias(f"{mode}_median_amount_{postfix}"),
        target.quantile(0.9).alias(f"{mode}_p90_amount_{postfix}"),
        target.quantile(0.75).alias(f"{mode}_p75_amount_{postfix}"),
        # NOTE(review): this is the 0.5 quantile despite the "p5" label; the
        # name is kept because callers select the column by this exact name.
        target.quantile(0.5).alias(f"{mode}_p5_amount_{postfix}"),
        target.quantile(0.25).alias(f"{mode}_p25_amount_{postfix}"),
        target.quantile(0.1).alias(f"{mode}_p10_amount_{postfix}"),

        is_large.sum().alias(f"{mode}_large_trns_count_{postfix}"),
        is_small.sum().alias(f"{mode}_small_trns_count_{postfix}"),
        is_large.mean().alias(f"{mode}_large_trns_p_{postfix}"),
        is_small.mean().alias(f"{mode}_small_trns_p_{postfix}"),
    )

In [None]:
def target_cumsum_grouper(dir, train):
    """Append lagged ``target`` statistics to ``train``.

    Statistics come from ``groupper2`` and are added in this order:

    1. per ``inn_id`` x ``week``: lags 1-3 only (current-row stats dropped)
    2. per ``inn_id`` x ``week_of_month``: lag 1 only (current stats dropped)
    3. per ``inn_id`` x ``month``: current stats plus lag 1
    4. lag 1 of the per-INN month stats within ``inn_id`` x ``month_week``
    5. ``target_1`` .. ``target_3``: ``target`` shifted within ``inn_id``
    6. per ``week``: current stats plus lags 1-3
    7. per ``week_of_month``: current stats plus lag 1
    8. per ``month``: current stats plus lag 1

    Each statistics table is sorted by its own period key before shifting and
    is left-joined back onto ``train``.  ``dir`` is accepted but not used.
    """
    stat_suffixes = (
        "avg_amount", "median_amount", "p90_amount", "p75_amount",
        "p5_amount", "p25_amount", "p10_amount",
        "large_trns_count", "small_trns_count",
        "large_trns_p", "small_trns_p",
    )

    def stat_cols(mode, postfix):
        # Column names produced by groupper2(..., mode, postfix), in order.
        return [f"{mode}_{stat}_{postfix}" for stat in stat_suffixes]

    def lag_exprs(names, lags, partition=None):
        # One "<name>_<lag>" expression set per lag, optionally windowed.
        exprs = []
        for lag in lags:
            shifted = cs.by_name(names).shift(lag)
            if partition is not None:
                shifted = shifted.over(partition)
            exprs.append(shifted.name.suffix(f"_{lag}"))
        return exprs

    # (period key, lags, drop the un-shifted statistics afterwards?)
    per_inn_specs = (
        ("week", (1, 2, 3), True),
        ("week_of_month", (1,), True),
        ("month", (1,), False),
    )
    for key, lags, drop_current in per_inn_specs:
        cols = stat_cols(key, "inn")
        stats = groupper2(train, ["inn_id", key], key, "inn").sort(key)
        stats = stats.with_columns(lag_exprs(cols, lags, "inn_id"))
        if drop_current:
            stats = stats.drop(cols)
        train = train.join(stats, on=["inn_id", key], how="left")

    # NOTE(review): `cols` still holds the per-INN month statistics here, so
    # this overwrites the "<month stat>_inn_1" columns joined just above with
    # a shift inside (inn_id, month_week) -- confirm that this is intended.
    train = train.with_columns(lag_exprs(cols, (1,), ["inn_id", "month_week"]))

    # Raw target lags within each INN.
    train = train.with_columns(lag_exprs("target", (1, 2, 3), ["inn_id"]))

    # Statistics over all INNs together; the current values are kept.
    global_specs = (
        ("week", (1, 2, 3)),
        ("week_of_month", (1,)),
        ("month", (1,)),
    )
    for key, lags in global_specs:
        cols = stat_cols(key, "full")
        stats = groupper2(train, [key], key, "full").sort(key)
        stats = stats.with_columns(lag_exprs(cols, lags))
        train = train.join(stats, on=[key], how="left")

    return train

In [None]:
# pl.scan_csv(dir + "calendar_extended.csv").with_columns(pl.col('date').str.strptime(pl.Date, format="%Y-%m-%d").alias("date_dt")).collect()

In [None]:
# Folder prefix of the input files. The loader functions concatenate file
# names directly onto it, so the trailing slash is required.
# NOTE(review): the name `dir` shadows the Python builtin of the same name.
dir = '/kaggle/input/iatvoirotblat/'
# year, month, day = 2000, 10, 1

In [None]:
%%time
# Assemble the lazy feature pipeline: weekly calendar -> target panel ->
# profile features -> transaction aggregates -> lagged target statistics.
calendar = calendar_preprocess(dir)
train = prepare_all(dir, calendar)
train = pinf(dir, train).drop('report_date_min')
outer = out_preprocessor(dir)
inner = in_preprocessor(dir)
# NOTE(review): no `how` is passed, so both joins use the polars default
# (inner) and INNs absent from either aggregate are dropped -- confirm.
train = train.join(outer, left_on=['inn_id'], right_on=['doc_payer_inn']).join(inner, left_on=['inn_id'], right_on=['doc_payee_inn'])
train = target_cumsum_grouper(dir, train)

# Execute the lazy query and materialise the result.
train = train.collect()

  pl.col("diff_datopen_report_date_flg").replace(age, default=np.NAN),


CPU times: user 57min 17s, sys: 13min 17s, total: 1h 10min 34s
Wall time: 24min 8s


In [None]:
# Display the collected feature frame.
train

inn_id,week,target,month,year,week_of_month,quarter,holidays_in_week,weekends_in_week,month_week,ipul,id_region_f,id_region_l,id_region_u,main_okved_group,diff_datopen_report_date_flg,diff_datopen_report_date_flg_min,out_vtb_trns_count_w,out_vtb_date_w,out_vtb_cinn_w,out_vtb_trns_class_encoded_w,out_vtb_doc_payer_bank_name_encoded_w,outl_vtb_trns_count_w,in_vtb_trns_count_w,in_vtb_date_w,in_vtb_cinn_w,in_vtb_trns_class_encoded_w,in_vtb_doc_payee_bank_name_encoded_w,inl_vtb_trns_count_w,week_avg_amount_inn_1,week_median_amount_inn_1,week_p90_amount_inn_1,week_p75_amount_inn_1,week_p5_amount_inn_1,week_p25_amount_inn_1,week_p10_amount_inn_1,week_large_trns_count_inn_1,…,week_of_month_large_trns_count_full,week_of_month_small_trns_count_full,week_of_month_large_trns_p_full,week_of_month_small_trns_p_full,week_of_month_avg_amount_full_1,week_of_month_median_amount_full_1,week_of_month_p90_amount_full_1,week_of_month_p75_amount_full_1,week_of_month_p5_amount_full_1,week_of_month_p25_amount_full_1,week_of_month_p10_amount_full_1,week_of_month_large_trns_count_full_1,week_of_month_small_trns_count_full_1,week_of_month_large_trns_p_full_1,week_of_month_small_trns_p_full_1,month_avg_amount_full,month_median_amount_full,month_p90_amount_full,month_p75_amount_full,month_p5_amount_full,month_p25_amount_full,month_p10_amount_full,month_large_trns_count_full,month_small_trns_count_full,month_large_trns_p_full,month_small_trns_p_full,month_avg_amount_full_1,month_median_amount_full_1,month_p90_amount_full_1,month_p75_amount_full_1,month_p5_amount_full_1,month_p25_amount_full_1,month_p10_amount_full_1,month_large_trns_count_full_1,month_small_trns_count_full_1,month_large_trns_p_full_1,month_small_trns_p_full_1
str,i16,f64,i8,i32,i8,i8,i8,i64,str,i8,str,str,u32,str,f64,f64,f64,u32,u32,u32,u32,f64,f64,u32,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,u32,…,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,u32,u32,f64,f64
"""inn3985525""",0,14.001552,7,2022,4,3,0,2,"""7-4""",1,"""1""","""1""",1,"""41""",96.0,96.0,764.0,291,125,9,2,1.0,927.0,377,113,11,3,1.0,,,,,,,,,…,0,1626084,0.0,1.0,12.429082,13.155168,15.46232,14.449855,13.15517,11.735994,9.832377,0,1434780,0.0,1.0,12.471947,13.169111,15.463917,14.458606,13.169123,11.756901,9.888787,0,430434,0.0,1.0,12.64034,13.238802,15.498859,14.506629,13.238805,11.874318,10.2062,0,430434,0.0,1.0
"""inn2124852""",0,15.805398,7,2022,4,3,0,2,"""7-4""",1,"""75""","""75""",1,"""46""",144.0,144.0,1163.0,453,113,5,2,6.0,1638.0,544,331,6,3,3.0,,,,,,,,,…,0,1626084,0.0,1.0,12.429082,13.155168,15.46232,14.449855,13.15517,11.735994,9.832377,0,1434780,0.0,1.0,12.471947,13.169111,15.463917,14.458606,13.169123,11.756901,9.888787,0,430434,0.0,1.0,12.64034,13.238802,15.498859,14.506629,13.238805,11.874318,10.2062,0,430434,0.0,1.0
"""inn1842381""",0,10.804235,7,2022,4,3,0,2,"""7-4""",1,"""22""","""22""",1,"""45""",192.0,192.0,699.0,363,94,3,4,2.0,175.0,110,28,3,1,1.0,,,,,,,,,…,0,1626084,0.0,1.0,12.429082,13.155168,15.46232,14.449855,13.15517,11.735994,9.832377,0,1434780,0.0,1.0,12.471947,13.169111,15.463917,14.458606,13.169123,11.756901,9.888787,0,430434,0.0,1.0,12.64034,13.238802,15.498859,14.506629,13.238805,11.874318,10.2062,0,430434,0.0,1.0
"""inn1758437""",0,15.824165,7,2022,4,3,0,2,"""7-4""",1,"""20""","""20""",1,"""10""",192.0,192.0,8797.0,658,371,8,3,3.0,3057.0,601,90,6,3,1.0,,,,,,,,,…,0,1626084,0.0,1.0,12.429082,13.155168,15.46232,14.449855,13.15517,11.735994,9.832377,0,1434780,0.0,1.0,12.471947,13.169111,15.463917,14.458606,13.169123,11.756901,9.888787,0,430434,0.0,1.0,12.64034,13.238802,15.498859,14.506629,13.238805,11.874318,10.2062,0,430434,0.0,1.0
"""inn2256320""",0,0.0,7,2022,4,3,0,2,"""7-4""",0,"""38""","""38""",1,"""47""",24.0,12.0,1692.0,430,65,4,1,1.0,1423.0,431,14,4,1,3.0,,,,,,,,,…,0,1626084,0.0,1.0,12.429082,13.155168,15.46232,14.449855,13.15517,11.735994,9.832377,0,1434780,0.0,1.0,12.471947,13.169111,15.463917,14.458606,13.169123,11.756901,9.888787,0,430434,0.0,1.0,12.64034,13.238802,15.498859,14.506629,13.238805,11.874318,10.2062,0,430434,0.0,1.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""inn1125328""",129,14.185445,1,2025,3,1,0,2,"""1-3""",1,"""28""","""28""",1,"""27""",192.0,192.0,4041.0,580,289,8,6,1.0,4686.0,615,782,12,4,4.0,14.185445,14.185445,14.185445,14.185445,14.185445,14.185445,14.185445,0,…,0,1434780,0.0,1.0,12.344928,13.083172,15.406107,14.38578,13.083173,11.658533,9.706682,0,1434780,0.0,1.0,10.765839,12.564663,15.231135,14.088001,12.564663,10.293335,0.0,0,526086,0.0,1.0,,,,,,,,,,,
"""inn2871423""",129,13.542835,1,2025,3,1,0,2,"""1-3""",1,"""92""","""92""",1,"""06""",192.0,192.0,650.0,205,94,11,1,4.0,279.0,128,28,8,2,1.0,13.542835,13.542835,13.542835,13.542835,13.542835,13.542835,13.542835,0,…,0,1434780,0.0,1.0,12.344928,13.083172,15.406107,14.38578,13.083173,11.658533,9.706682,0,1434780,0.0,1.0,10.765839,12.564663,15.231135,14.088001,12.564663,10.293335,0.0,0,526086,0.0,1.0,,,,,,,,,,,
"""inn1090976""",129,13.463813,1,2025,3,1,0,2,"""1-3""",1,"""29""","""29""",1,"""28""",144.0,144.0,3244.0,509,285,9,4,1.0,984.0,405,244,10,3,1.0,13.463813,13.463813,13.463813,13.463813,13.463813,13.463813,13.463813,0,…,0,1434780,0.0,1.0,12.344928,13.083172,15.406107,14.38578,13.083173,11.658533,9.706682,0,1434780,0.0,1.0,10.765839,12.564663,15.231135,14.088001,12.564663,10.293335,0.0,0,526086,0.0,1.0,,,,,,,,,,,
"""inn1088141""",129,14.493907,1,2025,3,1,0,2,"""1-3""",1,"""14""","""14""",1,"""46""",144.0,144.0,801.0,372,68,5,3,2.0,800.0,341,30,5,2,1.0,14.493907,14.493907,14.493907,14.493907,14.493907,14.493907,14.493907,0,…,0,1434780,0.0,1.0,12.344928,13.083172,15.406107,14.38578,13.083173,11.658533,9.706682,0,1434780,0.0,1.0,10.765839,12.564663,15.231135,14.088001,12.564663,10.293335,0.0,0,526086,0.0,1.0,,,,,,,,,,,


In [None]:
# Persist the collected feature frame to the working directory.
train.write_parquet('train_final_final_full.parquet')

In [None]:
# Display the frame again; it has not been reassigned since the previous display.
train

inn_id,week,target,month,year,week_of_month,quarter,holidays_in_week,weekends_in_week,month_week,ipul,id_region_f,id_region_l,id_region_u,main_okved_group,diff_datopen_report_date_flg,diff_datopen_report_date_flg_min,out_vtb_trns_count_w,out_vtb_date_w,out_vtb_cinn_w,out_vtb_trns_class_encoded_w,out_vtb_doc_payer_bank_name_encoded_w,outl_vtb_trns_count_w,in_vtb_trns_count_w,in_vtb_date_w,in_vtb_cinn_w,in_vtb_trns_class_encoded_w,in_vtb_doc_payee_bank_name_encoded_w,inl_vtb_trns_count_w,week_avg_amount_inn_1,week_median_amount_inn_1,week_p90_amount_inn_1,week_p75_amount_inn_1,week_p5_amount_inn_1,week_p25_amount_inn_1,week_p10_amount_inn_1,week_large_trns_count_inn_1,…,week_of_month_large_trns_count_full,week_of_month_small_trns_count_full,week_of_month_large_trns_p_full,week_of_month_small_trns_p_full,week_of_month_avg_amount_full_1,week_of_month_median_amount_full_1,week_of_month_p90_amount_full_1,week_of_month_p75_amount_full_1,week_of_month_p5_amount_full_1,week_of_month_p25_amount_full_1,week_of_month_p10_amount_full_1,week_of_month_large_trns_count_full_1,week_of_month_small_trns_count_full_1,week_of_month_large_trns_p_full_1,week_of_month_small_trns_p_full_1,month_avg_amount_full,month_median_amount_full,month_p90_amount_full,month_p75_amount_full,month_p5_amount_full,month_p25_amount_full,month_p10_amount_full,month_large_trns_count_full,month_small_trns_count_full,month_large_trns_p_full,month_small_trns_p_full,month_avg_amount_full_1,month_median_amount_full_1,month_p90_amount_full_1,month_p75_amount_full_1,month_p5_amount_full_1,month_p25_amount_full_1,month_p10_amount_full_1,month_large_trns_count_full_1,month_small_trns_count_full_1,month_large_trns_p_full_1,month_small_trns_p_full_1
str,i16,f64,i8,i32,i8,i8,i8,i64,str,i8,str,str,u32,str,f64,f64,f64,u32,u32,u32,u32,f64,f64,u32,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,u32,…,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,u32,u32,f64,f64
"""inn3985525""",0,14.001552,7,2022,4,3,0,2,"""7-4""",1,"""1""","""1""",1,"""41""",96.0,96.0,764.0,291,125,9,2,1.0,927.0,377,113,11,3,1.0,,,,,,,,,…,0,1626084,0.0,1.0,12.429082,13.155168,15.46232,14.449855,13.15517,11.735994,9.832377,0,1434780,0.0,1.0,12.471947,13.169111,15.463917,14.458606,13.169123,11.756901,9.888787,0,430434,0.0,1.0,12.64034,13.238802,15.498859,14.506629,13.238805,11.874318,10.2062,0,430434,0.0,1.0
"""inn2124852""",0,15.805398,7,2022,4,3,0,2,"""7-4""",1,"""75""","""75""",1,"""46""",144.0,144.0,1163.0,453,113,5,2,6.0,1638.0,544,331,6,3,3.0,,,,,,,,,…,0,1626084,0.0,1.0,12.429082,13.155168,15.46232,14.449855,13.15517,11.735994,9.832377,0,1434780,0.0,1.0,12.471947,13.169111,15.463917,14.458606,13.169123,11.756901,9.888787,0,430434,0.0,1.0,12.64034,13.238802,15.498859,14.506629,13.238805,11.874318,10.2062,0,430434,0.0,1.0
"""inn1842381""",0,10.804235,7,2022,4,3,0,2,"""7-4""",1,"""22""","""22""",1,"""45""",192.0,192.0,699.0,363,94,3,4,2.0,175.0,110,28,3,1,1.0,,,,,,,,,…,0,1626084,0.0,1.0,12.429082,13.155168,15.46232,14.449855,13.15517,11.735994,9.832377,0,1434780,0.0,1.0,12.471947,13.169111,15.463917,14.458606,13.169123,11.756901,9.888787,0,430434,0.0,1.0,12.64034,13.238802,15.498859,14.506629,13.238805,11.874318,10.2062,0,430434,0.0,1.0
"""inn1758437""",0,15.824165,7,2022,4,3,0,2,"""7-4""",1,"""20""","""20""",1,"""10""",192.0,192.0,8797.0,658,371,8,3,3.0,3057.0,601,90,6,3,1.0,,,,,,,,,…,0,1626084,0.0,1.0,12.429082,13.155168,15.46232,14.449855,13.15517,11.735994,9.832377,0,1434780,0.0,1.0,12.471947,13.169111,15.463917,14.458606,13.169123,11.756901,9.888787,0,430434,0.0,1.0,12.64034,13.238802,15.498859,14.506629,13.238805,11.874318,10.2062,0,430434,0.0,1.0
"""inn2256320""",0,0.0,7,2022,4,3,0,2,"""7-4""",0,"""38""","""38""",1,"""47""",24.0,12.0,1692.0,430,65,4,1,1.0,1423.0,431,14,4,1,3.0,,,,,,,,,…,0,1626084,0.0,1.0,12.429082,13.155168,15.46232,14.449855,13.15517,11.735994,9.832377,0,1434780,0.0,1.0,12.471947,13.169111,15.463917,14.458606,13.169123,11.756901,9.888787,0,430434,0.0,1.0,12.64034,13.238802,15.498859,14.506629,13.238805,11.874318,10.2062,0,430434,0.0,1.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""inn1125328""",129,14.185445,1,2025,3,1,0,2,"""1-3""",1,"""28""","""28""",1,"""27""",192.0,192.0,4041.0,580,289,8,6,1.0,4686.0,615,782,12,4,4.0,14.185445,14.185445,14.185445,14.185445,14.185445,14.185445,14.185445,0,…,0,1434780,0.0,1.0,12.344928,13.083172,15.406107,14.38578,13.083173,11.658533,9.706682,0,1434780,0.0,1.0,10.765839,12.564663,15.231135,14.088001,12.564663,10.293335,0.0,0,526086,0.0,1.0,,,,,,,,,,,
"""inn2871423""",129,13.542835,1,2025,3,1,0,2,"""1-3""",1,"""92""","""92""",1,"""06""",192.0,192.0,650.0,205,94,11,1,4.0,279.0,128,28,8,2,1.0,13.542835,13.542835,13.542835,13.542835,13.542835,13.542835,13.542835,0,…,0,1434780,0.0,1.0,12.344928,13.083172,15.406107,14.38578,13.083173,11.658533,9.706682,0,1434780,0.0,1.0,10.765839,12.564663,15.231135,14.088001,12.564663,10.293335,0.0,0,526086,0.0,1.0,,,,,,,,,,,
"""inn1090976""",129,13.463813,1,2025,3,1,0,2,"""1-3""",1,"""29""","""29""",1,"""28""",144.0,144.0,3244.0,509,285,9,4,1.0,984.0,405,244,10,3,1.0,13.463813,13.463813,13.463813,13.463813,13.463813,13.463813,13.463813,0,…,0,1434780,0.0,1.0,12.344928,13.083172,15.406107,14.38578,13.083173,11.658533,9.706682,0,1434780,0.0,1.0,10.765839,12.564663,15.231135,14.088001,12.564663,10.293335,0.0,0,526086,0.0,1.0,,,,,,,,,,,
"""inn1088141""",129,14.493907,1,2025,3,1,0,2,"""1-3""",1,"""14""","""14""",1,"""46""",144.0,144.0,801.0,372,68,5,3,2.0,800.0,341,30,5,2,1.0,14.493907,14.493907,14.493907,14.493907,14.493907,14.493907,14.493907,0,…,0,1434780,0.0,1.0,12.344928,13.083172,15.406107,14.38578,13.083173,11.658533,9.706682,0,1434780,0.0,1.0,10.765839,12.564663,15.231135,14.088001,12.564663,10.293335,0.0,0,526086,0.0,1.0,,,,,,,,,,,
