In [8]:
import numpy as np
import pandas as pd
from math import sin,cos,pi
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder as le_enc
from sklearn.model_selection import KFold,train_test_split

In [9]:
# Read the competition tables from the Kaggle input directory.
# Train and test have 'agreement_date' parsed into datetimes while loading;
# the sample submission is read without any extra options.
DATE_COLUMNS = ['agreement_date']

data_train = pd.read_csv(
    '/kaggle/input/vseros-ai-otb/train-2/train.csv',
    parse_dates=DATE_COLUMNS,
)
data_test = pd.read_csv(
    '/kaggle/input/vseros-ai-otb/test-3/test.csv',
    parse_dates=DATE_COLUMNS,
)
sample_sub = pd.read_csv('/kaggle/input/vseros-ai-otb/sample_submission-7.csv')

In [10]:
def _as_plain_numeric(values):
    """Return ``values`` as int64, or as float64 (NaN for gaps) if any entry is missing."""
    if values.isna().any():
        return values.astype('float64')
    return values.astype('int64')


def make_base_time_features(df):
    """Add calendar features derived from ``df['agreement_date']``.

    The frame is modified in place and also returned. Columns are added in
    this order: ``month``, ``day``, ``week``, ``quarter``, ``year``,
    ``weekofyear``, ``dayofweek``, ``dayofyear``, ``all_time``, ``all_week``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime64 column named ``agreement_date``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns attached.

    Notes
    -----
    * ``year`` is stored as an offset from 2012.
    * ``week`` and ``weekofyear`` are two names for the same ISO week number,
      so it is computed once and reused.
    * Fields are read through the vectorised ``.dt`` accessor instead of a
      per-row Python ``apply``; rows with a missing date yield NaN.
    * NOTE(review): the ISO week of an early-January date can be 52/53
      (e.g. 2012-01-01 is week 52), so ``all_week`` is not strictly increasing
      around New Year.
    """
    dates = df['agreement_date'].dt
    iso_week = _as_plain_numeric(dates.isocalendar().week)

    df['month'] = _as_plain_numeric(dates.month)
    df['day'] = _as_plain_numeric(dates.day)
    df['week'] = iso_week
    df['quarter'] = _as_plain_numeric(dates.quarter)
    df['year'] = _as_plain_numeric(dates.year) - 2012

    df['weekofyear'] = iso_week
    df['dayofweek'] = _as_plain_numeric(dates.dayofweek)
    df['dayofyear'] = _as_plain_numeric(dates.dayofyear)

    # Running day / week counters measured from the start of 2012.
    df['all_time'] = df['year'] * 365 + df['dayofyear']
    df['all_week'] = df['year'] * 52 + df['weekofyear']
    return df

def get_polynoms_from_column(df,col):
    """Attach sine/cosine encodings of ``df[col]`` and their squares.

    Eight columns are added to ``df`` in place (and ``df`` is returned):

    * ``sin_<col>``, ``cos_<col>``, ``sin_<col>^2``, ``cos_<col>^2`` --
      the functions applied to the raw column values;
    * ``<col>_sin``, ``<col>_cos``, ``<col>_sin^2``, ``<col>_cos^2`` --
      the functions applied to ``(value - min) / max * 2 * pi``, where
      ``min`` and ``max`` are taken from the column itself.
    """
    source = df[col]
    low = source.min()
    high = source.max()

    # Encodings of the untouched values.
    raw_sin = source.apply(sin)
    raw_cos = source.apply(cos)
    df[f'sin_{col}'] = raw_sin
    df[f'cos_{col}'] = raw_cos
    df[f'sin_{col}^2'] = raw_sin * raw_sin
    df[f'cos_{col}^2'] = raw_cos * raw_cos

    def to_angle(value):
        # Shift by the column minimum, divide by the column maximum, scale to radians.
        return (value - low) / high * 2 * pi

    # Encodings of the rescaled values.
    scaled_sin = source.apply(lambda value: sin(to_angle(value)))
    scaled_cos = source.apply(lambda value: cos(to_angle(value)))
    df[f'{col}_sin'] = scaled_sin
    df[f'{col}_cos'] = scaled_cos
    df[f'{col}_sin^2'] = scaled_sin * scaled_sin
    df[f'{col}_cos^2'] = scaled_cos * scaled_cos
    return df

def get_dop_features(df):
    """Add sine/cosine encodings of ``df.hour`` for two periods.

    ``vday_sin`` / ``vday_cos`` use a period of 24 and ``vyear_sin`` /
    ``vyear_cos`` a period of 365.2425 * 24. The columns are written onto
    ``df`` in place and ``df`` is returned.

    NOTE(review): the frame needs an ``hour`` column, presumably an hour
    count; it is not created by the other helpers next to this one -- confirm
    where it comes from.
    """
    hours_per_day = 24
    hours_per_year = 365.2425*hours_per_day

    # Daily cycle.
    day_angle = df.hour * 2 * np.pi / hours_per_day
    df['vday_sin'] = day_angle.apply(sin)
    df['vday_cos'] = day_angle.apply(cos)

    # Yearly cycle.
    year_angle = df.hour * 2 * np.pi / hours_per_year
    df['vyear_sin'] = year_angle.apply(sin)
    df['vyear_cos'] = year_angle.apply(cos)

    return df

In [11]:
def process(df):
    """Turn a raw competition frame into a numeric feature frame.

    Steps, in order:

    1. Missing values are replaced with -1 (``fillna`` returns a new frame,
       so the frame passed in is left untouched).
    2. Calendar features are added, followed by sine/cosine encodings of
       ``month``, ``day`` and ``dayofyear``.
    3. ``interior_cat`` is cast to int.
    4. ``rooms_4`` labels are mapped to numbers ('>=4' -> 5, 'студия' -> 0)
       and the column is cast to float, so every processed frame comes back
       with a numeric ``rooms_4`` instead of a mixed str/int object column
       that each caller has to remember to cast.
    5. Each column in ``cat_cols`` is integer-encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with ``agreement_date``, ``rooms_4`` and the ``*_cat`` columns.

    Returns
    -------
    pandas.DataFrame
        The processed copy.
    """
    cat_cols = ['region_name_cat','district_cat','corpus_cat','developer_cat',
                'hc_name_cat','interior_cat','class_cat','stage_cat']
    # Textual room categories and their numeric stand-ins; other values pass through.
    rooms_codes = {'>=4': 5, 'студия': 0}

    df = make_base_time_features(df.fillna(-1))
    for col in ['month','day','dayofyear']:
        df = get_polynoms_from_column(df,col)
    df['interior_cat'] = df['interior_cat'].astype(int)
    df['rooms_4'] = df['rooms_4'].map(lambda x: rooms_codes.get(x, x)).astype(float)
    # NOTE(review): a fresh encoder is fitted on every call, so two frames
    # processed separately may get different codes for the same category
    # when their sets of values differ -- confirm this is acceptable.
    for col in cat_cols:
        df[col] = le_enc().fit_transform(df[col])
    return df

In [12]:
# Push both raw frames through the same feature pipeline.
train_df = data_train.pipe(process)
test_df = data_test.pipe(process)

In [13]:
# Make sure the room-count feature of the training frame is stored as float.
train_df = train_df.astype({'rooms_4': float})

In [14]:
# Column roles used by the modelling cells.
cat_cols = [
    'region_name_cat', 'district_cat', 'corpus_cat', 'developer_cat',
    'hc_name_cat', 'interior_cat', 'class_cat', 'stage_cat',
]
label_col = 'price_target'
drop_cols = ['agreement_date', 'price_target']

# Hold out 10% of the rows for validation, with a fixed seed.
# NOTE(review): train_df is rebound to the 90% part, so re-running this cell
# splits the already reduced frame again.
train_df, val_df = train_test_split(train_df, random_state=56, test_size=0.1)

# Model inputs: every column except the date and the target.
features = train_df.columns.drop(drop_cols).tolist()

In [15]:
# Hyper-parameter set for LightGBM.
# NOTE(review): this dict is not handed to lgb.train in this cell -- the call
# below receives only {'metric':'mape'} -- confirm whether that is intended.
lgbm_params = {
    'objective': 'mape',
    'boosting': 'dart',
    'extra_trees': False,
    'metric': 'mape',
    'learning_rate': 0.6,
    'num_leaves': 256,
    'seed': 45,
    'max_depth': 16,
    'xgboost_dart_mode': True,
    'lambda_l1': 0,
    'data_sample_strategy': 'goss',
    'cat_smooth': 1.0,
}


def _build_lgb_dataset(frame):
    """Wrap a split in an lgb.Dataset: all columns but date/target as inputs, target as label."""
    return lgb.Dataset(
        frame.drop(['agreement_date','price_target'],axis=1),
        label=frame['price_target'],
        categorical_feature=cat_cols,
    )


train_ds = _build_lgb_dataset(train_df)
val_ds = _build_lgb_dataset(val_df)

# Train for up to 1100 rounds, stopping once the validation MAPE has not
# improved for 100 rounds.
model = lgb.train(
    {'metric':'mape'},
    train_ds,
    valid_sets=val_ds,
    num_boost_round=1100,
    early_stopping_rounds=100,
    categorical_feature=cat_cols,
)



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13511
[LightGBM] [Info] Number of data points in the train set: 84109, number of used features: 115
[LightGBM] [Info] Start training from score 24333.661078
[1]	valid_0's mape: 0.231784
Training until validation scores don't improve for 100 rounds
[2]	valid_0's mape: 0.210575
[3]	valid_0's mape: 0.191609
[4]	valid_0's mape: 0.174643




[5]	valid_0's mape: 0.159444
[6]	valid_0's mape: 0.145933
[7]	valid_0's mape: 0.134063
[8]	valid_0's mape: 0.123524
[9]	valid_0's mape: 0.114133
[10]	valid_0's mape: 0.105883
[11]	valid_0's mape: 0.0985889
[12]	valid_0's mape: 0.0922056
[13]	valid_0's mape: 0.086553
[14]	valid_0's mape: 0.0814929
[15]	valid_0's mape: 0.0770735
[16]	valid_0's mape: 0.0733414
[17]	valid_0's mape: 0.0698811
[18]	valid_0's mape: 0.0668971
[19]	valid_0's mape: 0.0641131
[20]	valid_0's mape: 0.0618096
[21]	valid_0's mape: 0.0597017
[22]	valid_0's mape: 0.0577847
[23]	valid_0's mape: 0.0560518
[24]	valid_0's mape: 0.0546806
[25]	valid_0's mape: 0.0534827
[26]	valid_0's mape: 0.0521699
[27]	valid_0's mape: 0.0510931
[28]	valid_0's mape: 0.0501306
[29]	valid_0's mape: 0.0493461
[30]	valid_0's mape: 0.0485996
[31]	valid_0's mape: 0.0479354
[32]	valid_0's mape: 0.0472786
[33]	valid_0's mape: 0.0466745
[34]	valid_0's mape: 0.045941
[35]	valid_0's mape: 0.0452901
[36]	valid_0's mape: 0.0446577
[37]	valid_0's mape: 