In [1]:
import pandas as pd

In [2]:
tr = pd.read_parquet('data/train.parquet')
tst = pd.read_parquet('data/test.parquet')

In [3]:
target = tr.target
tr = tr.drop(columns='target')

order_id = tst.order_id
tst = tst.drop(columns='order_id')

In [4]:
import pandas as pd
import numpy as np

def create_time_features(df, order_time_col='order_create_time', session_end_col='session_end_time'):
    """
    Создает новые временные признаки на основе колонок с временными метками.
    
    Параметры:
    df - DataFrame с данными
    order_time_col - название колонки с временем создания заказа (timestamp)
    session_end_col - название колонки с временем окончания сессии (timestamp)
    
    Возвращает:
    DataFrame с добавленными новыми признаками
    """
    # Создаем копию DataFrame чтобы избежать предупреждений
    df = df.copy()
    
    # Преобразуем колонки в datetime, если они еще не в этом формате
    df[order_time_col] = pd.to_datetime(df[order_time_col])
    df[session_end_col] = pd.to_datetime(df[session_end_col])
    
    # 1. Разница между окончанием сессии и созданием заказа (в секундах)
    df['time_to_order_seconds'] = (df[session_end_col] - df[order_time_col]).dt.total_seconds()
    
    # 2. Признаки времени создания заказа
    df['order_hour'] = df[order_time_col].dt.hour
    df['order_day_of_week'] = df[order_time_col].dt.dayofweek  # 0-понедельник, 6-воскресенье
    df['order_day_of_month'] = df[order_time_col].dt.day
    df['order_month'] = df[order_time_col].dt.month
    df['order_week_of_year'] = df[order_time_col].dt.isocalendar().week
    df['order_is_weekend'] = df[order_time_col].dt.dayofweek >= 5
    
    # 3. Признаки времени окончания сессии
    df['session_end_hour'] = df[session_end_col].dt.hour
    df['session_end_day_of_week'] = df[session_end_col].dt.dayofweek
    df['session_end_day_of_month'] = df[session_end_col].dt.day
    df['session_end_month'] = df[session_end_col].dt.month
    df['session_end_week_of_year'] = df[session_end_col].dt.isocalendar().week
    df['session_end_is_weekend'] = df[session_end_col].dt.dayofweek >= 5
    
    # 4. Разница во времени в разных единицах
    df['time_to_order_minutes'] = df['time_to_order_seconds'] / 60
    df['time_to_order_hours'] = df['time_to_order_seconds'] / 3600
    
    # 5. Временные категории (утро/день/вечер/ночь)
    bins = [-1, 6, 12, 18, 23]
    labels = ['night', 'morning', 'afternoon', 'evening']
    df['order_time_of_day'] = pd.cut(df['order_hour'], bins=bins, labels=labels)
    df['session_end_time_of_day'] = pd.cut(df['session_end_hour'], bins=bins, labels=labels)
    
    # 6. Является ли заказ сделан в тот же день, что и сессия
    df['same_day_order'] = df[order_time_col].dt.date == df[session_end_col].dt.date
    
    # 7. Сезонность (времена года)
    def get_season(month):
        if month in [12, 1, 2]:
            return 'winter'
        elif month in [3, 4, 5]:
            return 'spring'
        elif month in [6, 7, 8]:
            return 'summer'
        else:
            return 'autumn'
    
    df['order_season'] = df['order_month'].apply(get_season)
    df['session_end_season'] = df['session_end_month'].apply(get_season)
    
    return df

In [5]:
tr = create_time_features(tr)
tr = tr.drop(columns=['order_create_time', 'session_end_time'])
tr

Unnamed: 0,model_1,model_2,model_3,model_4,model_5,time_to_order_seconds,order_hour,order_day_of_week,order_day_of_month,order_month,...,session_end_month,session_end_week_of_year,session_end_is_weekend,time_to_order_minutes,time_to_order_hours,order_time_of_day,session_end_time_of_day,same_day_order,order_season,session_end_season
0,0.798127,0.644805,0.039843,1,0.430339,-544.607,9,4,26,12,...,12.0,52,False,-9.076783,-0.151280,morning,morning,True,winter,winter
1,-0.573826,0.220081,0.020889,0,0.177593,1926.407,10,4,26,12,...,12.0,52,False,32.106783,0.535113,morning,morning,True,winter,winter
2,-0.665887,0.644805,0.039810,0,0.431897,-75.483,11,3,25,12,...,12.0,52,False,-1.258050,-0.020968,morning,morning,True,winter,winter
3,-0.284625,0.644805,0.040026,0,0.432045,831.153,14,4,26,12,...,12.0,52,False,13.852550,0.230876,afternoon,afternoon,True,winter,winter
4,-1.710108,-0.221541,0.020585,0,0.174115,-232.743,18,4,26,12,...,12.0,52,False,-3.879050,-0.064651,afternoon,afternoon,True,winter,winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104590,-0.784541,0.220081,0.020901,0,0.188639,2674.990,12,1,16,12,...,12.0,51,False,44.583167,0.743053,morning,morning,True,winter,winter
104591,-0.504846,0.220081,0.029831,0,0.175063,-41689.227,11,5,8,11,...,11.0,45,False,-694.820450,-11.580341,morning,evening,False,autumn,autumn
104592,-1.710108,-0.221541,0.021167,0,0.191408,51.047,9,2,12,11,...,11.0,46,False,0.850783,0.014180,morning,morning,True,autumn,autumn
104593,-0.403279,0.644805,0.040814,0,0.452594,-304.027,17,0,10,11,...,11.0,46,False,-5.067117,-0.084452,afternoon,afternoon,True,autumn,autumn


In [6]:
tst = create_time_features(tst)
tst = tst.drop(columns=['order_create_time', 'session_end_time'])
tst

Unnamed: 0,model_1,model_2,model_3,model_4,model_5,time_to_order_seconds,order_hour,order_day_of_week,order_day_of_month,order_month,...,session_end_month,session_end_week_of_year,session_end_is_weekend,time_to_order_minutes,time_to_order_hours,order_time_of_day,session_end_time_of_day,same_day_order,order_season,session_end_season
0,-1.710108,0.203183,0.040015,1,0.442343,-248.133,15,3,5,2,...,2.0,6,False,-4.135550,-0.068926,afternoon,afternoon,True,winter,winter
1,0.086020,0.220081,0.021101,1,0.191027,86.593,15,5,7,2,...,2.0,6,True,1.443217,0.024054,afternoon,afternoon,True,winter,winter
2,-0.438741,0.644805,0.066312,0,0.400509,-61738.523,11,4,6,2,...,2.0,6,False,-1028.975383,-17.149590,morning,afternoon,False,winter,winter
3,-1.395752,0.203183,0.083039,1,0.414590,-1010.753,11,3,5,2,...,2.0,6,False,-16.845883,-0.280765,morning,morning,True,winter,winter
4,-0.441443,0.644805,0.052902,0,0.411544,-34957.897,19,6,1,2,...,2.0,5,True,-582.631617,-9.710527,evening,morning,True,winter,winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17191,-0.429658,0.644805,0.908250,0,0.196781,-630665.860,16,0,9,2,...,2.0,6,False,-10511.097667,-175.184961,afternoon,morning,False,winter,winter
17192,-1.710108,0.203183,0.040665,0,0.411543,-274.520,14,0,9,2,...,2.0,7,False,-4.575333,-0.076256,afternoon,afternoon,True,winter,winter
17193,0.086020,0.644805,0.039744,1,0.426308,-548.593,18,0,9,2,...,2.0,7,False,-9.143217,-0.152387,afternoon,afternoon,True,winter,winter
17194,-1.710108,0.203183,0.041701,0,0.433750,-5822.847,17,3,12,2,...,2.0,7,False,-97.047450,-1.617457,afternoon,afternoon,True,winter,winter


In [10]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# 1. Обработка категориальных колонок
cat_cols = [
    'order_time_of_day', 
    'session_end_time_of_day',
    'order_season',
    'session_end_season',
    'order_is_weekend',
    'session_end_is_weekend',
    'same_day_order'
]

# 2. Преобразование булевых колонок в int
for col in ['order_is_weekend', 'session_end_is_weekend', 'same_day_order']:
    tr[col] = tr[col].astype(int)
    tst[col] = tst[col].astype(int)

# 3. Обработка категориальных колонок (добавляем 'missing' как категорию)
for col in cat_cols:
    if pd.api.types.is_categorical_dtype(tr[col]):
        # Для Categorical типов
        tr[col] = tr[col].cat.add_categories('missing').fillna('missing')
        tst[col] = tst[col].cat.add_categories('missing').fillna('missing')
    else:
        # Для object/string типов
        tr[col] = tr[col].fillna('missing').astype(str)
        tst[col] = tst[col].fillna('missing').astype(str)

# 4. Получаем индексы категориальных колонок для CatBoost
cat_features_indices = [tr.columns.get_loc(col) for col in cat_cols]

# 5. Разделение данных
X_train, X_valid, y_train, y_valid = train_test_split(
    tr, target, test_size=0.2, random_state=42, stratify=target
)

# 6. Создание Pool объектов
train_pool = Pool(
    X_train, 
    y_train,
    cat_features=cat_features_indices
)

valid_pool = Pool(
    X_valid,
    y_valid,
    cat_features=cat_features_indices
)

# 7. Параметры модели (оптимизированные)
params = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'depth': 6,
    'l2_leaf_reg': 3,
    'random_strength': 1,
    'bagging_temperature': 0.8,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'early_stopping_rounds': 50,
    'random_seed': 42,
    'verbose': 100
}

# 8. Обучение модели
model = CatBoostClassifier(**params)
model.fit(
    train_pool,
    eval_set=valid_pool,
    use_best_model=True
)

# 9. Оценка модели
val_pred = model.predict_proba(X_valid)[:, 1]
val_auc = roc_auc_score(y_valid, val_pred)
print(f'\nValidation AUC: {val_auc:.4f}')

# 10. Предсказание на тесте
test_pred = model.predict_proba(tst)[:, 1]

# 11. Сохранение результатов
submission = pd.DataFrame({
    'order_id': order_id,
    'target': test_pred
})
submission.to_csv('catboost_submission.csv', index=False)

# 12. Сохранение модели
model.save_model('catboost_model.cbm')

  if pd.api.types.is_categorical_dtype(tr[col]):
  if pd.api.types.is_categorical_dtype(tr[col]):
  if pd.api.types.is_categorical_dtype(tr[col]):
  if pd.api.types.is_categorical_dtype(tr[col]):
  if pd.api.types.is_categorical_dtype(tr[col]):


0:	test: 0.9408739	best: 0.9408739 (0)	total: 255ms	remaining: 4m 14s
100:	test: 0.9557622	best: 0.9557622 (100)	total: 9.71s	remaining: 1m 26s
200:	test: 0.9563820	best: 0.9563820 (196)	total: 18.6s	remaining: 1m 13s
300:	test: 0.9567440	best: 0.9567440 (300)	total: 27.3s	remaining: 1m 3s
400:	test: 0.9569409	best: 0.9569609 (393)	total: 36.2s	remaining: 54.1s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.956960858
bestIteration = 393

Shrink model to first 394 iterations.

Validation AUC: 0.9570
