# let's go!
- https://ods.ai/tracks/linear-models-spring23/competitions/gates

In [6]:
# imports
import warnings
warnings.filterwarnings("ignore")

import string
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import datetime

from catboost import CatBoostClassifier, Pool

In [2]:
# чтение и обработка данных
def read_data(path: str) -> pd.DataFrame:
    """Read a gate-events CSV and expand its ``timestamp`` column into features.

    Args:
        path: Location of a CSV file that contains a ``timestamp`` column
            (anything accepted by ``pd.read_csv``).

    Returns:
        The parsed frame without ``timestamp`` and with four extra columns:
        ``date`` (calendar date), ``weekday`` (0 = Monday), ``time``
        (seconds since midnight) and ``day_id`` (whole days since
        1970-01-01).

    Note:
        Timestamps are assumed to be timezone-naive.
    """
    df = pd.read_csv(path, parse_dates=['timestamp'])
    stamp = df['timestamp']
    # unpack the date
    df['date'] = stamp.dt.date
    df['weekday'] = stamp.dt.weekday
    # time of day in seconds
    df['time'] = stamp.dt.hour * 3600 + stamp.dt.minute * 60 + stamp.dt.second
    # Day id = whole days since the Unix epoch. Timedelta arithmetic is used
    # instead of reinterpreting the datetime buffer as int64, so the result
    # does not depend on the datetime resolution (ns/us/s) of the column.
    df['day_id'] = (stamp - pd.Timestamp('1970-01-01')) // pd.Timedelta(days=1)
    return df.drop(columns=['timestamp'])

# Load both competition files through the same preprocessing routine
train, test = map(read_data, ('train.csv', 'test.csv'))
train.head(3)

Unnamed: 0,row_id,user_id,gate_id,date,weekday,time,day_id
0,0,18,7,2022-07-29,4,32934,19202
1,1,18,9,2022-07-29,4,32994,19202
2,2,18,9,2022-07-29,4,32994,19202


## определим индексы лимитов по месяцам

In [3]:
# Month boundaries used to cut the train set
ind_sept = datetime.date(2022, 9, 1)
ind_oct = datetime.date(2022, 10, 1)
ind_nov = datetime.date(2022, 11, 1)
ind_dec = datetime.date(2022, 12, 1)

# Number of train rows dated strictly before each boundary.
# NOTE(review): these counts are later used as positional offsets, which
# presumably relies on train being ordered by date - confirm.
limit_sept = int((train['date'] < ind_sept).sum())
limit_oct = int((train['date'] < ind_oct).sum())
limit_nov = int((train['date'] < ind_nov).sum())
limit_dec = int((train['date'] < ind_dec).sum())
# Size of the train part, taken from the data instead of a hard-coded number
limit_train = len(train)

print (limit_sept, limit_oct, limit_nov, limit_dec, limit_train)  
# output recorded in the original run: 11202 19596 26834 30876 37518

11202 19596 26834 30876 37518


## Создаём общий датасет

In [8]:
# Give the test rows a placeholder user id so both frames share one schema
test['user_id'] = -7
# Stack train on top of test (row-wise concatenation)
df = pd.concat([train, test])
print(' df cols  ', df.columns)
print(' df shape   ', df.shape)  # (44643, 7) in the recorded run
df.head(3)

 df cols   Index(['row_id', 'user_id', 'gate_id', 'date', 'weekday', 'time', 'day_id'], dtype='object')
 df shape    (44643, 7)


Unnamed: 0,row_id,user_id,gate_id,date,weekday,time,day_id
0,0,18,7,2022-07-29,4,32934,19202
1,1,18,9,2022-07-29,4,32994,19202
2,2,18,9,2022-07-29,4,32994,19202


## Переводим ворота в строки

In [9]:
# Translate each gate number into a three-letter "word" built from one
# repeated lowercase English letter: -1 -> 'aaa', 0 -> 'bbb', 1 -> 'ccc', ...
alphabet = string.ascii_lowercase
coder = {
    gate_number: letter * 3
    for gate_number, letter in enumerate(alphabet, start=-1)
}
# Encode gate_id as a string column and discard the numeric original
df['gates'] = df['gate_id'].map(coder)
df = df.drop(columns='gate_id')
df.head()

Unnamed: 0,row_id,user_id,date,weekday,time,day_id,gates
0,0,18,2022-07-29,4,32934,19202,iii
1,1,18,2022-07-29,4,32994,19202,kkk
2,2,18,2022-07-29,4,32994,19202,kkk
3,3,18,2022-07-29,4,33006,19202,ggg
4,4,18,2022-07-29,4,33008,19202,ggg


## Добавляем тайминги и воротинги.)
- в количестве +-5

In [10]:
# следующие функции работают inplace
# смастерим ф-ю добавления тайминга
def add_diff_time(df:pd.DataFrame, step=1) -> None:
    """Add time gaps to the rows ``step`` positions before and after, in place.

    Creates two columns on ``df``:

    * ``prev_time_{step}`` - ``time`` minus the ``time`` of the row ``step``
      positions earlier;
    * ``next_time_{step}`` - ``time`` of the row ``step`` positions later
      minus ``time``.

    A gap that is negative, or that has no neighbour at all (the first/last
    ``step`` rows), is replaced by the sentinel ``-10000``.

    Args:
        df: Frame with a numeric ``time`` column; modified in place.
        step: Positional distance to the neighbouring row.
    """
    prev_ = f'prev_time_{step}'
    next_ = f'next_time_{step}'
    time = df['time']
    df[prev_] = time - time.shift(step)
    df[next_] = time.shift(-step) - time
    # `~(gap >= 0)` is True for negative gaps and for NaN (missing neighbour),
    # so edge rows always receive the sentinel, even when their time equals
    # the global maximum/minimum of the column.
    df.loc[~(df[prev_] >= 0), prev_] = -10000
    df.loc[~(df[next_] >= 0), next_] = -10000

# функция добавления ворот
# работает только после добавки таймингов
def add_gates(df:pd.DataFrame, step=1) -> None:
    """Add the gate seen ``step`` rows earlier and later, in place.

    Creates ``prev_gate_{step}`` and ``next_gate_{step}`` from the ``gates``
    column. Wherever the matching ``prev_time_{step}`` / ``next_time_{step}``
    column holds the sentinel ``-10000``, the gate is set to ``'none'``.
    Those timing columns must already exist on ``df``.
    """
    for side, offset in (('prev', step), ('next', -step)):
        gate_col = f'{side}_gate_{step}'
        df[gate_col] = df['gates'].shift(offset)
        # the sentinel timing marks rows without a usable neighbour
        is_edge = df[f'{side}_time_{step}'] == -10000
        df.loc[is_edge, gate_col] = 'none'


# ф-я добавляет и timing и ворота
def add_timings_gates(df:pd.DataFrame, num = 5 ) -> None:
    """Add timing and gate neighbour features for steps ``1..num``, in place.

    All timing columns are created first and the gate columns afterwards,
    so the resulting column order is timings followed by gates.
    """
    steps = range(1, num + 1)
    for builder in (add_diff_time, add_gates):
        for step in steps:
            builder(df, step=step)

# Add neighbour timing and gate features for steps 1..5 (df is modified in place)
add_timings_gates(df, num = 5 )
print(df.columns)
df

Index(['row_id', 'user_id', 'date', 'weekday', 'time', 'day_id', 'gates',
       'prev_time_1', 'next_time_1', 'prev_time_2', 'next_time_2',
       'prev_time_3', 'next_time_3', 'prev_time_4', 'next_time_4',
       'prev_time_5', 'next_time_5', 'prev_gate_1', 'next_gate_1',
       'prev_gate_2', 'next_gate_2', 'prev_gate_3', 'next_gate_3',
       'prev_gate_4', 'next_gate_4', 'prev_gate_5', 'next_gate_5'],
      dtype='object')


Unnamed: 0,row_id,user_id,date,weekday,time,day_id,gates,prev_time_1,next_time_1,prev_time_2,...,prev_gate_1,next_gate_1,prev_gate_2,next_gate_2,prev_gate_3,next_gate_3,prev_gate_4,next_gate_4,prev_gate_5,next_gate_5
0,0,18,2022-07-29,4,32934,19202,iii,-10000.0,60.0,-10000.0,...,none,kkk,none,kkk,none,ggg,none,ggg,none,lll
1,1,18,2022-07-29,4,32994,19202,kkk,60.0,0.0,-10000.0,...,iii,kkk,none,ggg,none,ggg,none,lll,none,mmm
2,2,18,2022-07-29,4,32994,19202,kkk,0.0,12.0,60.0,...,kkk,ggg,iii,ggg,none,lll,none,mmm,none,fff
3,3,18,2022-07-29,4,33006,19202,ggg,12.0,2.0,12.0,...,kkk,ggg,kkk,lll,iii,mmm,none,fff,none,fff
4,4,18,2022-07-29,4,33008,19202,ggg,2.0,26.0,14.0,...,ggg,lll,kkk,mmm,kkk,fff,iii,fff,none,iii
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7120,44638,-7,2023-02-24,4,71016,19412,mmm,9279.0,24.0,9669.0,...,lll,fff,mmm,fff,kkk,kkk,kkk,kkk,fff,none
7121,44639,-7,2023-02-24,4,71040,19412,fff,24.0,1.0,9303.0,...,mmm,fff,lll,kkk,mmm,kkk,kkk,none,kkk,none
7122,44640,-7,2023-02-24,4,71041,19412,fff,1.0,8.0,25.0,...,fff,kkk,mmm,kkk,lll,none,mmm,none,kkk,none
7123,44641,-7,2023-02-24,4,71049,19412,kkk,8.0,0.0,9.0,...,fff,kkk,fff,none,mmm,none,lll,none,mmm,none


## Добавляем индивидуальную характеристику дня.

In [13]:
# Show every distinct calendar date in the combined frame, in ascending order
sorted(df.date.unique())

[datetime.date(2022, 7, 29),
 datetime.date(2022, 7, 30),
 datetime.date(2022, 7, 31),
 datetime.date(2022, 8, 1),
 datetime.date(2022, 8, 2),
 datetime.date(2022, 8, 3),
 datetime.date(2022, 8, 4),
 datetime.date(2022, 8, 5),
 datetime.date(2022, 8, 6),
 datetime.date(2022, 8, 7),
 datetime.date(2022, 8, 8),
 datetime.date(2022, 8, 9),
 datetime.date(2022, 8, 10),
 datetime.date(2022, 8, 11),
 datetime.date(2022, 8, 12),
 datetime.date(2022, 8, 15),
 datetime.date(2022, 8, 16),
 datetime.date(2022, 8, 17),
 datetime.date(2022, 8, 18),
 datetime.date(2022, 8, 19),
 datetime.date(2022, 8, 20),
 datetime.date(2022, 8, 21),
 datetime.date(2022, 8, 22),
 datetime.date(2022, 8, 23),
 datetime.date(2022, 8, 24),
 datetime.date(2022, 8, 25),
 datetime.date(2022, 8, 26),
 datetime.date(2022, 8, 27),
 datetime.date(2022, 8, 29),
 datetime.date(2022, 8, 30),
 datetime.date(2022, 8, 31),
 datetime.date(2022, 9, 1),
 datetime.date(2022, 9, 2),
 datetime.date(2022, 9, 3),
 datetime.date(2022, 9, 4)

In [14]:
# Goal: one row per date holding the date, a space-separated string of the
# gate words checked on that date, and the number of checks.
columns = ['date', 'gate_words', 'checks']

# Order events by time (row_id breaks ties) and aggregate per date in one
# pass; groupby keeps the within-group row order and sorts the dates.
gates = (
    df.sort_values(by=['time', 'row_id'])
      .groupby('date', sort=True)
      .agg(gate_words=('gates', ' '.join), checks=('gates', 'size'))
      .reset_index()
)[columns]
print('gates.shape = ', gates.shape)
gates.head()

gates.shape =  (184, 3)


Unnamed: 0,date,gate_words,checks
0,2022-07-29,iii kkk kkk ggg ggg lll mmm fff fff iii kkk kk...,505
1,2022-07-30,iii kkk kkk ggg ggg lll mmm fff kkk kkk iii ii...,22
2,2022-07-31,iii kkk kkk ggg ggg lll eee eee lll ooo ooo nn...,36
3,2022-08-01,iii kkk kkk ggg ggg lll iii kkk kkk ggg ggg ll...,542
4,2022-08-02,iii kkk kkk ggg ggg lll iii kkk kkk ggg ggg ll...,422


In [15]:
# Build a bag-of-words (gate word counts) for every date
from sklearn.feature_extraction.text import CountVectorizer

col = 'gate_words'

vectorizer = CountVectorizer(ngram_range=(1,1))

bag_of_gates = vectorizer.fit_transform(gates[col].to_list())
# one count column per gate word found by the vectorizer
gate_counts = pd.DataFrame(
    bag_of_gates.toarray(), columns=vectorizer.get_feature_names_out()
)
# per-date columns without the raw text
day_info = gates.drop(col, axis=1).reset_index(drop=True)
gates_v = pd.concat([day_info, gate_counts], axis=1)
gates_v

Unnamed: 0,date,checks,aaa,bbb,ccc,ddd,eee,fff,ggg,hhh,iii,jjj,kkk,lll,mmm,nnn,ooo,ppp,qqq,rrr
0,2022-07-29,505,6,0,0,0,84,108,58,33,36,0,36,54,52,14,18,0,6,0
1,2022-07-30,22,0,0,0,0,0,3,4,0,3,0,8,2,2,0,0,0,0,0
2,2022-07-31,36,0,0,0,0,2,6,4,0,2,0,8,3,3,4,4,0,0,0
3,2022-08-01,542,0,0,0,0,76,126,76,26,51,2,32,56,61,18,16,0,2,0
4,2022-08-02,422,0,0,0,0,62,92,54,24,34,0,32,44,46,16,16,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,2023-02-20,34,0,0,0,0,4,8,4,0,2,0,8,4,4,0,0,0,0,0
180,2023-02-21,36,0,0,0,0,0,8,8,0,3,0,10,3,4,0,0,0,0,0
181,2023-02-22,41,0,0,0,0,0,8,8,0,3,0,14,4,4,0,0,0,0,0
182,2023-02-23,48,0,0,0,0,6,6,2,2,3,0,12,7,4,0,6,0,0,0


In [16]:
# Attach the per-date features to every event row.
# The join key is named explicitly; a left join keeps every row of df in its
# original order, which the positional train/test split below depends on, and
# validate= fails fast if gates_v ever holds a date more than once.
df_merge = df.merge(gates_v, on='date', how='left', validate='many_to_one')
print('df_merge.shape = ', df_merge.shape)
df_merge

df_merge.shape =  (44643, 46)


Unnamed: 0,row_id,user_id,date,weekday,time,day_id,gates,prev_time_1,next_time_1,prev_time_2,...,iii,jjj,kkk,lll,mmm,nnn,ooo,ppp,qqq,rrr
0,0,18,2022-07-29,4,32934,19202,iii,-10000.0,60.0,-10000.0,...,36,0,36,54,52,14,18,0,6,0
1,1,18,2022-07-29,4,32994,19202,kkk,60.0,0.0,-10000.0,...,36,0,36,54,52,14,18,0,6,0
2,2,18,2022-07-29,4,32994,19202,kkk,0.0,12.0,60.0,...,36,0,36,54,52,14,18,0,6,0
3,3,18,2022-07-29,4,33006,19202,ggg,12.0,2.0,12.0,...,36,0,36,54,52,14,18,0,6,0
4,4,18,2022-07-29,4,33008,19202,ggg,2.0,26.0,14.0,...,36,0,36,54,52,14,18,0,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44638,44638,-7,2023-02-24,4,71016,19412,mmm,9279.0,24.0,9669.0,...,3,0,12,4,4,0,0,0,0,0
44639,44639,-7,2023-02-24,4,71040,19412,fff,24.0,1.0,9303.0,...,3,0,12,4,4,0,0,0,0,0
44640,44640,-7,2023-02-24,4,71041,19412,fff,1.0,8.0,25.0,...,3,0,12,4,4,0,0,0,0,0
44641,44641,-7,2023-02-24,4,71049,19412,kkk,8.0,0.0,9.0,...,3,0,12,4,4,0,0,0,0,0


# Предсказание CatBoost по дефолту

In [18]:
# Having 'prev_time_5' / 'next_time_5' among the features made predictions worse
drop_collls = ['prev_time_5', 'next_time_5']
# The first limit_train rows are the train part, the remainder is the test part
new_train = df_merge.iloc[:limit_train].drop(columns=drop_collls)
new_test = df_merge.iloc[limit_train:].drop(columns=drop_collls)
# printed once (the original cell repeated this line)
print(new_train.columns)
new_train

Index(['row_id', 'user_id', 'date', 'weekday', 'time', 'day_id', 'gates',
       'prev_time_1', 'next_time_1', 'prev_time_2', 'next_time_2',
       'prev_time_3', 'next_time_3', 'prev_time_4', 'next_time_4',
       'prev_gate_1', 'next_gate_1', 'prev_gate_2', 'next_gate_2',
       'prev_gate_3', 'next_gate_3', 'prev_gate_4', 'next_gate_4',
       'prev_gate_5', 'next_gate_5', 'checks', 'aaa', 'bbb', 'ccc', 'ddd',
       'eee', 'fff', 'ggg', 'hhh', 'iii', 'jjj', 'kkk', 'lll', 'mmm', 'nnn',
       'ooo', 'ppp', 'qqq', 'rrr'],
      dtype='object')
Index(['row_id', 'user_id', 'date', 'weekday', 'time', 'day_id', 'gates',
       'prev_time_1', 'next_time_1', 'prev_time_2', 'next_time_2',
       'prev_time_3', 'next_time_3', 'prev_time_4', 'next_time_4',
       'prev_gate_1', 'next_gate_1', 'prev_gate_2', 'next_gate_2',
       'prev_gate_3', 'next_gate_3', 'prev_gate_4', 'next_gate_4',
       'prev_gate_5', 'next_gate_5', 'checks', 'aaa', 'bbb', 'ccc', 'ddd',
       'eee', 'fff', 'ggg', 'hh

Unnamed: 0,row_id,user_id,date,weekday,time,day_id,gates,prev_time_1,next_time_1,prev_time_2,...,iii,jjj,kkk,lll,mmm,nnn,ooo,ppp,qqq,rrr
0,0,18,2022-07-29,4,32934,19202,iii,-10000.0,60.0,-10000.0,...,36,0,36,54,52,14,18,0,6,0
1,1,18,2022-07-29,4,32994,19202,kkk,60.0,0.0,-10000.0,...,36,0,36,54,52,14,18,0,6,0
2,2,18,2022-07-29,4,32994,19202,kkk,0.0,12.0,60.0,...,36,0,36,54,52,14,18,0,6,0
3,3,18,2022-07-29,4,33006,19202,ggg,12.0,2.0,12.0,...,36,0,36,54,52,14,18,0,6,0
4,4,18,2022-07-29,4,33008,19202,ggg,2.0,26.0,14.0,...,36,0,36,54,52,14,18,0,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37513,37513,6,2022-12-31,5,74336,19357,mmm,11857.0,26.0,11878.0,...,1,0,4,1,1,0,0,0,0,0
37514,37514,6,2022-12-31,5,74362,19357,hhh,26.0,1.0,11883.0,...,1,0,4,1,1,0,0,0,0,0
37515,37515,6,2022-12-31,5,74363,19357,hhh,1.0,8.0,27.0,...,1,0,4,1,1,0,0,0,0,0
37516,37516,6,2022-12-31,5,74371,19357,kkk,8.0,0.0,9.0,...,1,0,4,1,1,0,0,0,0,0


In [19]:
# Target and feature matrices.
# Identifier, target and raw-date columns are removed by name rather than by
# assuming they are the first three columns.
non_feature_cols = ['row_id', 'user_id', 'date']
y = new_train['user_id']
X_train = new_train.drop(columns=non_feature_cols)
X_test = new_test.drop(columns=non_feature_cols)
# Categorical features
cat_cols=['gates', 'weekday', 'prev_gate_1', 'next_gate_1',
          'prev_gate_2', 'next_gate_2', 'prev_gate_3', 'next_gate_3',
          'prev_gate_4', 'next_gate_4', 'prev_gate_5', 'next_gate_5',]

model = CatBoostClassifier(random_state=42,
                           cat_features=cat_cols,
                           one_hot_max_size=20,
                           task_type="GPU",
                           devices='0:1')
model.fit(X_train,  y,
          verbose=100,
          #plot=True
         )


y_pred = model.predict(X_test)

# Copy the slice so that adding a column never writes into a view of `test`
submission = test[["row_id"]].copy()
# predict may return an (n, 1) array here; flatten it before storing
submission['target'] = np.ravel(y_pred)
# submission.to_csv('23_05_02_submission_005_cb.csv', index=False)    #n_gramm=(1,1)
print("\n train score ", accuracy_score(y, model.predict(X_train)))

Learning rate set to 0.135965
0:	learn: 3.7724396	total: 45ms	remaining: 44.9s
100:	learn: 2.3000368	total: 3.99s	remaining: 35.5s
200:	learn: 1.9892543	total: 7.83s	remaining: 31.1s
300:	learn: 1.8099623	total: 11.5s	remaining: 26.6s
400:	learn: 1.6652701	total: 15s	remaining: 22.5s
500:	learn: 1.5569217	total: 18.6s	remaining: 18.5s
600:	learn: 1.4689155	total: 22s	remaining: 14.6s
700:	learn: 1.3863893	total: 25.5s	remaining: 10.9s
800:	learn: 1.3036640	total: 29.1s	remaining: 7.22s
900:	learn: 1.2297612	total: 32.7s	remaining: 3.59s
999:	learn: 1.1548222	total: 36.3s	remaining: 0us

 train score  0.7530252145636761


## result  0,3161145424

# Предсказание CatBoost с изменёнными гиперпараметрами

In [22]:
# Having 'prev_time_5' / 'next_time_5' among the features made predictions worse
drop_collls = ['prev_time_5', 'next_time_5']
# rows before limit_train form the train part, the rest form the test part
train_rows = df_merge.iloc[:limit_train]
test_rows = df_merge.iloc[limit_train:]
new_train = train_rows.drop(columns=drop_collls)
new_test = test_rows.drop(columns=drop_collls)
print(new_train.columns)
new_train

Index(['row_id', 'user_id', 'date', 'weekday', 'time', 'day_id', 'gates',
       'prev_time_1', 'next_time_1', 'prev_time_2', 'next_time_2',
       'prev_time_3', 'next_time_3', 'prev_time_4', 'next_time_4',
       'prev_gate_1', 'next_gate_1', 'prev_gate_2', 'next_gate_2',
       'prev_gate_3', 'next_gate_3', 'prev_gate_4', 'next_gate_4',
       'prev_gate_5', 'next_gate_5', 'checks', 'aaa', 'bbb', 'ccc', 'ddd',
       'eee', 'fff', 'ggg', 'hhh', 'iii', 'jjj', 'kkk', 'lll', 'mmm', 'nnn',
       'ooo', 'ppp', 'qqq', 'rrr'],
      dtype='object')


Unnamed: 0,row_id,user_id,date,weekday,time,day_id,gates,prev_time_1,next_time_1,prev_time_2,...,iii,jjj,kkk,lll,mmm,nnn,ooo,ppp,qqq,rrr
0,0,18,2022-07-29,4,32934,19202,iii,-10000.0,60.0,-10000.0,...,36,0,36,54,52,14,18,0,6,0
1,1,18,2022-07-29,4,32994,19202,kkk,60.0,0.0,-10000.0,...,36,0,36,54,52,14,18,0,6,0
2,2,18,2022-07-29,4,32994,19202,kkk,0.0,12.0,60.0,...,36,0,36,54,52,14,18,0,6,0
3,3,18,2022-07-29,4,33006,19202,ggg,12.0,2.0,12.0,...,36,0,36,54,52,14,18,0,6,0
4,4,18,2022-07-29,4,33008,19202,ggg,2.0,26.0,14.0,...,36,0,36,54,52,14,18,0,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37513,37513,6,2022-12-31,5,74336,19357,mmm,11857.0,26.0,11878.0,...,1,0,4,1,1,0,0,0,0,0
37514,37514,6,2022-12-31,5,74362,19357,hhh,26.0,1.0,11883.0,...,1,0,4,1,1,0,0,0,0,0
37515,37515,6,2022-12-31,5,74363,19357,hhh,1.0,8.0,27.0,...,1,0,4,1,1,0,0,0,0,0
37516,37516,6,2022-12-31,5,74371,19357,kkk,8.0,0.0,9.0,...,1,0,4,1,1,0,0,0,0,0


## предсказание на всём train

In [23]:
##################################################
# the whole train set is used here
####################################################
# Target and feature matrices.
# Identifier, target and raw-date columns are removed by name rather than by
# assuming they are the first three columns.
non_feature_cols = ['row_id', 'user_id', 'date']
y = new_train['user_id']
X_train = new_train.drop(columns=non_feature_cols)
X_test = new_test.drop(columns=non_feature_cols)
# Categorical features
cat_cols=['gates', 'weekday',
         'prev_gate_1', 'next_gate_1', 'prev_gate_2', 'next_gate_2',
         'prev_gate_3', 'next_gate_3' , 'prev_gate_4', 'next_gate_4',
         'prev_gate_5', 'next_gate_5',]

model = CatBoostClassifier(random_state=42,
                           cat_features=cat_cols,
                           one_hot_max_size=20,
                           iterations=1470,
                           learning_rate = 0.1,
                           custom_loss=[ 'Accuracy'],
                           task_type="GPU",
                           devices='0:1')
model.fit(X_train,  y,
          verbose=100,
          #plot=True
         )


y_pred = model.predict(X_test)

# Copy the slice so that adding a column never writes into a view of `test`
submission = test[["row_id"]].copy()
# predict may return an (n, 1) array here; flatten it before storing
submission['target'] = np.ravel(y_pred)
#submission.to_csv('23_04_26_submission_001_cb.csv', index=False)    #n_gramm=(1,1)
print("\n train score ", accuracy_score(y, model.predict(X_train)))

0:	learn: 3.8346744	total: 41.7ms	remaining: 1m 1s
100:	learn: 2.4360295	total: 4.14s	remaining: 56.1s
200:	learn: 2.1095497	total: 8.27s	remaining: 52.2s
300:	learn: 1.9379998	total: 12.1s	remaining: 46.9s
400:	learn: 1.8093936	total: 15.8s	remaining: 42.1s
500:	learn: 1.6972624	total: 19.5s	remaining: 37.7s
600:	learn: 1.6039576	total: 23.3s	remaining: 33.7s
700:	learn: 1.5342763	total: 26.9s	remaining: 29.5s
800:	learn: 1.4698257	total: 30.5s	remaining: 25.5s
900:	learn: 1.4053317	total: 34.2s	remaining: 21.6s
1000:	learn: 1.3421667	total: 37.9s	remaining: 17.8s
1100:	learn: 1.2825597	total: 41.6s	remaining: 14s
1200:	learn: 1.2245628	total: 45.4s	remaining: 10.2s
1300:	learn: 1.1667591	total: 49.3s	remaining: 6.4s
1400:	learn: 1.1164854	total: 53s	remaining: 2.61s
1469:	learn: 1.0812446	total: 55.6s	remaining: 0us

 train score  0.7878351724505571


## result   0,3256597417

In [28]:
# Inspect the feature importances of the most recently fitted model
model.get_feature_importance(prettified=True)
# Observation: the step-4 timing features rank above the other timing features,
# and day_id is clearly important

Unnamed: 0,Feature Id,Importances
0,time,17.856574
1,day_id,15.562028
2,gates,6.226115
3,prev_time_4,4.73893
4,next_time_4,4.649492
5,prev_time_2,3.370181
6,weekday,3.321766
7,next_time_2,3.168445
8,prev_time_1,2.448818
9,prev_time_3,2.321257


## смастерим вероятностное предсказание

In [29]:
# Class probabilities for the test rows from the model fitted on the whole train
y_all_pred_prob = model.predict_proba(X_test)
# Same probabilities as a frame with one column per class label known to the model
all_pred_prob = pd.DataFrame(data=y_all_pred_prob, columns=model.classes_)
for summary in (y_all_pred_prob.shape, all_pred_prob.shape, all_pred_prob.columns):
    print(summary)

(7125, 56)
(7125, 56)
Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 14, 15, 17, 18,
            19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
            36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
            53, 54, 55, 56, 57],
           dtype='int64')


## Предсказание для трайна с сентября

In [30]:
##################################################
# only the train rows from September onwards are used here
####################################################
# limit_sept is a positional offset, so slice explicitly by position
new_train_sept = new_train.iloc[limit_sept:]
# Target and feature matrices.
# Identifier, target and raw-date columns are removed by name rather than by
# assuming they are the first three columns.
non_feature_cols = ['row_id', 'user_id', 'date']
y = new_train_sept['user_id']
X_train = new_train_sept.drop(columns=non_feature_cols)
X_test = new_test.drop(columns=non_feature_cols)
# Categorical features
cat_cols=['gates', 'weekday', 'prev_gate_1', 'next_gate_1',
          'prev_gate_2', 'next_gate_2', 'prev_gate_3', 'next_gate_3',
          'prev_gate_4', 'next_gate_4', 'prev_gate_5', 'next_gate_5',]

model = CatBoostClassifier(random_state=42,
                           cat_features=cat_cols,
                           one_hot_max_size=20,
                           iterations=1080,
                           learning_rate = 0.1,
                           custom_loss=[ 'Accuracy'],
                           task_type="GPU",
                           devices='0:1')
model.fit(X_train,  y,
          verbose=100,
         )


y_pred = model.predict(X_test)

# Copy the slice so that adding a column never writes into a view of `test`
submission = test[["row_id"]].copy()
# predict may return an (n, 1) array here; flatten it before storing
submission['target'] = np.ravel(y_pred)
#submission.to_csv('23_04_26_submission_002_cb.csv', index=False)    #n_gramm=(1,1)
print("\n train score ", accuracy_score(y, model.predict(X_train)))

0:	learn: 3.8082604	total: 37ms	remaining: 39.9s
100:	learn: 2.3332389	total: 3.54s	remaining: 34.3s
200:	learn: 1.9822639	total: 7.05s	remaining: 30.8s
300:	learn: 1.7883331	total: 10.3s	remaining: 26.7s
400:	learn: 1.6400388	total: 13.5s	remaining: 22.8s
500:	learn: 1.5207137	total: 16.6s	remaining: 19.2s
600:	learn: 1.4316130	total: 19.7s	remaining: 15.7s
700:	learn: 1.3490331	total: 22.8s	remaining: 12.3s
800:	learn: 1.2739018	total: 25.9s	remaining: 9.01s
900:	learn: 1.2037200	total: 29s	remaining: 5.76s
1000:	learn: 1.1355402	total: 32.2s	remaining: 2.54s
1079:	learn: 1.0868314	total: 34.7s	remaining: 0us

 train score  0.7881136950904393


## result  0,3133071308		

## смастерим вероятностное предсказание с сентября

In [32]:
# Class probabilities for the test rows from the model fitted on September+ data
y_sept_pred_prob = model.predict_proba(X_test)
# Same probabilities as a frame with one column per class label known to the model
sept_pred_prob = pd.DataFrame(data=y_sept_pred_prob, columns=model.classes_)
for summary in (y_sept_pred_prob.shape, sept_pred_prob.shape, sept_pred_prob.columns):
    print(summary)

(7125, 53)
(7125, 53)
Int64Index([ 0,  1,  2,  3,  5,  6,  7,  8,  9, 10, 11, 12, 14, 15, 17, 18, 19,
            21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
            38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55,
            56, 57],
           dtype='int64')


# time_blending

In [33]:
# The September model knows fewer classes (users) than the full-train model,
# but both predictions need the same set of columns.
# Every class missing from the September prediction is copied over from the
# full prediction with its probabilities halved.
new_sept_pred_prob = sept_pred_prob.copy()
absent_users = [
    user for user in all_pred_prob.columns
    if user not in sept_pred_prob.columns
]
for user in absent_users:
    new_sept_pred_prob[user] = all_pred_prob[user] / 2
print (new_sept_pred_prob.shape)
print (new_sept_pred_prob.columns)

(7125, 56)
Int64Index([ 0,  1,  2,  3,  5,  6,  7,  8,  9, 10, 11, 12, 14, 15, 17, 18, 19,
            21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
            38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55,
            56, 57,  4, 20, 52],
           dtype='int64')


In [34]:
# Blended prediction: average of the full-train probabilities and the
# September-model probabilities down-weighted by 0.9, computed for all
# classes at once (columns are matched by label).
blend_cols = all_pred_prob.columns
pred_prob = (all_pred_prob + new_sept_pred_prob[blend_cols] * 0.9) / 2


def return_user(i):
    """Return the column label (user id) at position ``i`` of ``pred_prob``."""
    return pred_prob.columns[i]


# Blending: choose the most probable class per row and translate the column
# position into its label with a single fancy-indexing lookup.
y_pred_bl = blend_cols.to_numpy()[pred_prob.to_numpy().argmax(axis=1)]
y_pred_bl

array([15, 15, 15, ..., 15, 15, 15], dtype=int64)

In [None]:
# Build the submission frame.
# Copy the slice so that adding a column never writes into a view of `test`
submission = test[["row_id"]].copy()
submission['target'] = y_pred_bl
# submission.to_csv('23_05_02_submission_003_cb.csv', index=False)

## result 0,327905671