In [46]:
import pandas as pd
import numpy as np

def create_time_features(df, order_time_col='order_create_time', session_end_col='session_end_time'):
    """
    Derive calendar and time-difference features from two timestamp columns.

    Parameters:
    df - input DataFrame; it is copied, the caller's object is left untouched
    order_time_col - name of the column with the order creation timestamp
    session_end_col - name of the column with the session end timestamp

    Returns:
    A copy of ``df`` where both timestamp columns have been passed through
    ``pd.to_datetime`` and these columns have been added:
      * time_to_order_seconds / _minutes / _hours: session end minus order
        creation (negative when the session ended before the order was created)
      * order_* and session_end_*: hour, day_of_week (0 = Monday), day_of_month,
        month, week_of_year (ISO week), is_weekend
      * order_time_of_day / session_end_time_of_day: categorical bucket of the hour
      * same_day_order: True when both timestamps fall on the same calendar date
      * order_season / session_end_season: 'winter', 'spring', 'summer' or 'autumn';
        missing when the timestamp itself is missing
    """
    # Work on a copy so the caller's frame is never modified.
    df = df.copy()

    # Make sure both columns are datetimes (no-op when they already are).
    df[order_time_col] = pd.to_datetime(df[order_time_col])
    df[session_end_col] = pd.to_datetime(df[session_end_col])

    # 1. Session end minus order creation, in seconds.
    df['time_to_order_seconds'] = (df[session_end_col] - df[order_time_col]).dt.total_seconds()

    # 2-3. The same calendar features for both timestamps (order first, then
    # session end, which keeps the resulting column order stable).
    for prefix, source_col in (('order', order_time_col), ('session_end', session_end_col)):
        stamps = df[source_col].dt
        df[f'{prefix}_hour'] = stamps.hour
        df[f'{prefix}_day_of_week'] = stamps.dayofweek  # 0 = Monday, 6 = Sunday
        df[f'{prefix}_day_of_month'] = stamps.day
        df[f'{prefix}_month'] = stamps.month
        df[f'{prefix}_week_of_year'] = stamps.isocalendar().week
        df[f'{prefix}_is_weekend'] = stamps.dayofweek >= 5

    # 4. The same difference expressed in minutes and hours.
    df['time_to_order_minutes'] = df['time_to_order_seconds'] / 60
    df['time_to_order_hours'] = df['time_to_order_seconds'] / 3600

    # 5. Part of the day. pd.cut intervals are right-closed, so the buckets are
    # night = hours 0-6, morning = 7-12, afternoon = 13-18, evening = 19-23.
    bins = [-1, 6, 12, 18, 23]
    labels = ['night', 'morning', 'afternoon', 'evening']
    df['order_time_of_day'] = pd.cut(df['order_hour'], bins=bins, labels=labels)
    df['session_end_time_of_day'] = pd.cut(df['session_end_hour'], bins=bins, labels=labels)

    # 6. Whether the order and the session end share a calendar date.
    df['same_day_order'] = df[order_time_col].dt.date == df[session_end_col].dt.date

    # 7. Season of the year. A dict lookup (instead of an if/else chain ending in
    # a catch-all branch) leaves a missing month missing rather than calling it 'autumn'.
    season_by_month = {
        12: 'winter', 1: 'winter', 2: 'winter',
        3: 'spring', 4: 'spring', 5: 'spring',
        6: 'summer', 7: 'summer', 8: 'summer',
        9: 'autumn', 10: 'autumn', 11: 'autumn',
    }
    df['order_season'] = df['order_month'].map(season_by_month)
    df['session_end_season'] = df['session_end_month'].map(season_by_month)

    return df

In [47]:
# Load the train / test tables.
tr = pd.read_parquet('data/train.parquet')
tst = pd.read_parquet('data/test.parquet')

# Detach the label from train and the identifier from test so that neither
# ends up among the features.
target = tr.pop('target')
order_id = tst.pop('order_id')

# Add the time features, then drop the raw timestamp columns they came from.
raw_time_cols = ['order_create_time', 'session_end_time']
tr = create_time_features(tr).drop(columns=raw_time_cols)
tst = create_time_features(tst).drop(columns=raw_time_cols)

In [48]:
# Put the label back as the last column; the zone/window analysis below reads it from `tr`.
tr = tr.assign(target=target)
tr

Unnamed: 0,model_1,model_2,model_3,model_4,model_5,time_to_order_seconds,order_hour,order_day_of_week,order_day_of_month,order_month,...,session_end_week_of_year,session_end_is_weekend,time_to_order_minutes,time_to_order_hours,order_time_of_day,session_end_time_of_day,same_day_order,order_season,session_end_season,target
0,0.798127,0.644805,0.039843,1,0.430339,-544.607,9,4,26,12,...,52,False,-9.076783,-0.151280,morning,morning,True,winter,winter,1
1,-0.573826,0.220081,0.020889,0,0.177593,1926.407,10,4,26,12,...,52,False,32.106783,0.535113,morning,morning,True,winter,winter,0
2,-0.665887,0.644805,0.039810,0,0.431897,-75.483,11,3,25,12,...,52,False,-1.258050,-0.020968,morning,morning,True,winter,winter,1
3,-0.284625,0.644805,0.040026,0,0.432045,831.153,14,4,26,12,...,52,False,13.852550,0.230876,afternoon,afternoon,True,winter,winter,1
4,-1.710108,-0.221541,0.020585,0,0.174115,-232.743,18,4,26,12,...,52,False,-3.879050,-0.064651,afternoon,afternoon,True,winter,winter,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104590,-0.784541,0.220081,0.020901,0,0.188639,2674.990,12,1,16,12,...,51,False,44.583167,0.743053,morning,morning,True,winter,winter,1
104591,-0.504846,0.220081,0.029831,0,0.175063,-41689.227,11,5,8,11,...,45,False,-694.820450,-11.580341,morning,evening,False,autumn,autumn,1
104592,-1.710108,-0.221541,0.021167,0,0.191408,51.047,9,2,12,11,...,46,False,0.850783,0.014180,morning,morning,True,autumn,autumn,0
104593,-0.403279,0.644805,0.040814,0,0.452594,-304.027,17,0,10,11,...,46,False,-5.067117,-0.084452,afternoon,afternoon,True,autumn,autumn,1


In [49]:
# Disabled experiment: the whole cell is one bare string literal, so none of the
# statements inside it run; the notebook only echoes the text as the cell output.
# The same rescaling is applied for real, directly on `tr`, in a later cell.
'''mod = tr
mod['model_1'] = (mod['model_1'] - mod['model_1'].min()) / (mod['model_1'].max() - mod['model_1'].min())
mod['model_2'] = mod['model_2'].rank(pct=True)  # преобразование в процентили
mod['model_3'] = (mod['model_3'] - mod['model_3'].min()) / (mod['model_3'].max() - mod['model_3'].min())
mod['model_5'] = (mod['model_5'] - mod['model_5'].min()) / (mod['model_5'].max() - mod['model_5'].min())

mod'''

"mod = tr\nmod['model_1'] = (mod['model_1'] - mod['model_1'].min()) / (mod['model_1'].max() - mod['model_1'].min())\nmod['model_2'] = mod['model_2'].rank(pct=True)  # преобразование в процентили\nmod['model_3'] = (mod['model_3'] - mod['model_3'].min()) / (mod['model_3'].max() - mod['model_3'].min())\nmod['model_5'] = (mod['model_5'] - mod['model_5'].min()) / (mod['model_5'].max() - mod['model_5'].min())\n\nmod"

In [50]:
# Disabled earlier version of the zone/window model-selection cell. It is kept
# as a bare string literal, so nothing in it executes. This version reads from
# `mod`; the live version further down the notebook reads from `tr`.
'''import numpy as np
from sklearn.metrics import roc_auc_score

# Границы зон
LEFT_TAIL_END = -70000
RIGHT_TAIL_START = 22000
WINDOW_SIZE = 1000
STEP = 1000

# Основная зона: скользящие окна
main_zone_models = {}
for window_start in np.arange(LEFT_TAIL_END, RIGHT_TAIL_START - WINDOW_SIZE, STEP):
    window_end = window_start + WINDOW_SIZE
    window_data = mod[mod['time_to_order_seconds'].between(window_start, window_end)]
    
    if window_data.empty or len(window_data['target'].unique()) < 2:
        continue
    
    max_auc = -1
    best_model = None
    for i in [1, 2, 3, 5]:
        col_name = f'model_{i}'
        if col_name not in window_data.columns:
            continue
        
        try:
            auc = roc_auc_score(window_data['target'], window_data[col_name])
        except ValueError:
            continue
        
        if auc > max_auc:
            max_auc = auc
            best_model = col_name
    
    if best_model:
        main_zone_models[(window_start + window_end) / 2] = best_model

# Левый хвост: одна модель на весь диапазон
left_tail_data = mod[mod['time_to_order_seconds'] < LEFT_TAIL_END]
left_tail_model = None
if not left_tail_data.empty and len(left_tail_data['target'].unique()) >= 2:
    max_auc = -1
    for i in [1, 2, 3, 5]:
        col_name = f'model_{i}'
        if col_name not in left_tail_data.columns:
            continue
        
        try:
            auc = roc_auc_score(left_tail_data['target'], left_tail_data[col_name])
        except ValueError:
            continue
        
        if auc > max_auc:
            max_auc = auc
            left_tail_model = col_name

# Правый хвост: одна модель на весь диапазон
right_tail_data = mod[mod['time_to_order_seconds'] > RIGHT_TAIL_START]
right_tail_model = None
if not right_tail_data.empty and len(right_tail_data['target'].unique()) >= 2:
    max_auc = -1
    for i in [1, 2, 3, 5]:
        col_name = f'model_{i}'
        if col_name not in right_tail_data.columns:
            continue
        
        try:
            auc = roc_auc_score(right_tail_data['target'], right_tail_data[col_name])
        except ValueError:
            continue
        
        if auc > max_auc:
            max_auc = auc
            right_tail_model = col_name
            

# Результаты
print("Main zone models:", main_zone_models)
print("Left tail model:", left_tail_model)
print("Right tail model:", right_tail_model)'''

'import numpy as np\nfrom sklearn.metrics import roc_auc_score\n\n# Границы зон\nLEFT_TAIL_END = -70000\nRIGHT_TAIL_START = 22000\nWINDOW_SIZE = 1000\nSTEP = 1000\n\n# Основная зона: скользящие окна\nmain_zone_models = {}\nfor window_start in np.arange(LEFT_TAIL_END, RIGHT_TAIL_START - WINDOW_SIZE, STEP):\n    window_end = window_start + WINDOW_SIZE\n    window_data = mod[mod[\'time_to_order_seconds\'].between(window_start, window_end)]\n    \n    if window_data.empty or len(window_data[\'target\'].unique()) < 2:\n        continue\n    \n    max_auc = -1\n    best_model = None\n    for i in [1, 2, 3, 5]:\n        col_name = f\'model_{i}\'\n        if col_name not in window_data.columns:\n            continue\n        \n        try:\n            auc = roc_auc_score(window_data[\'target\'], window_data[col_name])\n        except ValueError:\n            continue\n        \n        if auc > max_auc:\n            max_auc = auc\n            best_model = col_name\n    \n    if best_model:\n 

In [51]:
# Disabled copy of the manual window overrides (bare string literal, nothing
# here runs); the live assignments appear in a later cell.
"""main_zone_models[6500] = 'model_3'
main_zone_models[7500] = 'model_3'
main_zone_models[8500] = 'model_3'
main_zone_models[9500] = 'model_3'"""

"main_zone_models[6500] = 'model_3'\nmain_zone_models[7500] = 'model_3'\nmain_zone_models[8500] = 'model_3'\nmain_zone_models[9500] = 'model_3'"

In [52]:
# Disabled earlier version of `get_best_model` and of its application to `mod`.
# The cell is a bare string literal, so the function is NOT defined by it; the
# live definition and the calls on `tr` / `tst` are in later cells.
'''def get_best_model(time, main_zone_models, left_tail_model, right_tail_model):
    if time < LEFT_TAIL_END:
        return left_tail_model
    elif time > RIGHT_TAIL_START:
        return right_tail_model
    else:
        # Находим ближайшее окно в основной зоне
        closest_time = min(main_zone_models.keys(), key=lambda x: abs(x - time))
        return main_zone_models[closest_time]

# Пример для тестовых данных
mod['best_model'] = mod['time_to_order_seconds'].apply(
    lambda x: get_best_model(x, main_zone_models, left_tail_model, right_tail_model)
)

mod['final_prediction'] = mod.apply(lambda row: row[row['best_model']], axis=1)'''

"def get_best_model(time, main_zone_models, left_tail_model, right_tail_model):\n    if time < LEFT_TAIL_END:\n        return left_tail_model\n    elif time > RIGHT_TAIL_START:\n        return right_tail_model\n    else:\n        # Находим ближайшее окно в основной зоне\n        closest_time = min(main_zone_models.keys(), key=lambda x: abs(x - time))\n        return main_zone_models[closest_time]\n\n# Пример для тестовых данных\nmod['best_model'] = mod['time_to_order_seconds'].apply(\n    lambda x: get_best_model(x, main_zone_models, left_tail_model, right_tail_model)\n)\n\nmod['final_prediction'] = mod.apply(lambda row: row[row['best_model']], axis=1)"

In [53]:
# Disabled plotting cell (bare string literal, nothing here runs). When it was
# live it drew the ROC curve of `mod['final_prediction']` next to the ROC curve
# of every `model_i` column present in `mod`.
'''from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

#mod = tr[tr['time_to_order_seconds'].between(25000, 27000)]

plt.figure(figsize=(10, 6))

hard_roc_auc = roc_auc_score(mod['target'], mod['final_prediction'])
fpr_hard, tpr_hard, _ = roc_curve(mod['target'], mod['final_prediction'] )
plt.plot(fpr_hard, tpr_hard, label=f'HARD model AUC = {hard_roc_auc:.3f}', color='orange')

# 5 "загадочных" моделей — предполагаем, что колонки называются model1 ... model5
for i in range(1, 6):
    col_name = f'model_{i}'
    if col_name in mod.columns:
        preds = mod[col_name]
        auc = roc_auc_score(mod['target'], preds)
        fpr, tpr, _ = roc_curve(mod['target'], preds)
        plt.plot(fpr, tpr, label=f'Model {i} AUC = {auc:.3f}')

# Диагональ случайного угадывания
plt.plot([0, 1], [0, 1], 'k--')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve: HARD model vs 5 others')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
'''

'from sklearn.metrics import roc_auc_score, roc_curve\nimport matplotlib.pyplot as plt\n\n#mod = tr[tr[\'time_to_order_seconds\'].between(25000, 27000)]\n\nplt.figure(figsize=(10, 6))\n\nhard_roc_auc = roc_auc_score(mod[\'target\'], mod[\'final_prediction\'])\nfpr_hard, tpr_hard, _ = roc_curve(mod[\'target\'], mod[\'final_prediction\'] )\nplt.plot(fpr_hard, tpr_hard, label=f\'HARD model AUC = {hard_roc_auc:.3f}\', color=\'orange\')\n\n# 5 "загадочных" моделей — предполагаем, что колонки называются model1 ... model5\nfor i in range(1, 6):\n    col_name = f\'model_{i}\'\n    if col_name in mod.columns:\n        preds = mod[col_name]\n        auc = roc_auc_score(mod[\'target\'], preds)\n        fpr, tpr, _ = roc_curve(mod[\'target\'], preds)\n        plt.plot(fpr, tpr, label=f\'Model {i} AUC = {auc:.3f}\')\n\n# Диагональ случайного угадывания\nplt.plot([0, 1], [0, 1], \'k--\')\n\nplt.xlabel(\'False Positive Rate\')\nplt.ylabel(\'True Positive Rate\')\nplt.title(\'ROC Curve: HARD model vs 

In [54]:
# Disabled plotting cell (bare string literal, nothing here runs). Same ROC
# comparison as the previous disabled cell, but restricted to the rows of `mod`
# whose `time_to_order_seconds` lies between 20000 and 30000.
'''from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

new_mod = mod[mod['time_to_order_seconds'].between(20000, 30000)]

plt.figure(figsize=(10, 6))

hard_roc_auc = roc_auc_score(new_mod['target'], new_mod['final_prediction'])
fpr_hard, tpr_hard, _ = roc_curve(new_mod['target'], new_mod['final_prediction'] )
plt.plot(fpr_hard, tpr_hard, label=f'HARD model AUC = {hard_roc_auc:.3f}', color='orange')

# 5 "загадочных" моделей — предполагаем, что колонки называются model1 ... model5
for i in range(1, 6):
    col_name = f'model_{i}'
    if col_name in new_mod.columns:
        preds = new_mod[col_name]
        auc = roc_auc_score(new_mod['target'], preds)
        fpr, tpr, _ = roc_curve(new_mod['target'], preds)
        plt.plot(fpr, tpr, label=f'Model {i} AUC = {auc:.3f}')

# Диагональ случайного угадывания
plt.plot([0, 1], [0, 1], 'k--')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve: HARD model vs 5 others')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
'''

'from sklearn.metrics import roc_auc_score, roc_curve\nimport matplotlib.pyplot as plt\n\nnew_mod = mod[mod[\'time_to_order_seconds\'].between(20000, 30000)]\n\nplt.figure(figsize=(10, 6))\n\nhard_roc_auc = roc_auc_score(new_mod[\'target\'], new_mod[\'final_prediction\'])\nfpr_hard, tpr_hard, _ = roc_curve(new_mod[\'target\'], new_mod[\'final_prediction\'] )\nplt.plot(fpr_hard, tpr_hard, label=f\'HARD model AUC = {hard_roc_auc:.3f}\', color=\'orange\')\n\n# 5 "загадочных" моделей — предполагаем, что колонки называются model1 ... model5\nfor i in range(1, 6):\n    col_name = f\'model_{i}\'\n    if col_name in new_mod.columns:\n        preds = new_mod[col_name]\n        auc = roc_auc_score(new_mod[\'target\'], preds)\n        fpr, tpr, _ = roc_curve(new_mod[\'target\'], preds)\n        plt.plot(fpr, tpr, label=f\'Model {i} AUC = {auc:.3f}\')\n\n# Диагональ случайного угадывания\nplt.plot([0, 1], [0, 1], \'k--\')\n\nplt.xlabel(\'False Positive Rate\')\nplt.ylabel(\'True Positive Rate\')\n

In [55]:
# Rescale the base-model score columns, independently inside each frame.
# NOTE(review): `tst` is scaled with its own min/max and its own ranks rather
# than with statistics taken from `tr` - confirm that this is intended.
for frame in (tr, tst):
    # Min-max scaling of model_1 / model_3 / model_5 onto [0, 1].
    for score_col in ('model_1', 'model_3', 'model_5'):
        lowest = frame[score_col].min()
        highest = frame[score_col].max()
        frame[score_col] = (frame[score_col] - lowest) / (highest - lowest)

    # model_2 is replaced by its percentile rank within the frame.
    frame['model_2'] = frame['model_2'].rank(pct=True)

In [56]:
import numpy as np
from sklearn.metrics import roc_auc_score

# Zone boundaries on `time_to_order_seconds` and the sliding-window geometry.
LEFT_TAIL_END = -70000
RIGHT_TAIL_START = 22000
WINDOW_SIZE = 1000
STEP = 1000

# Score columns that compete in every zone (model_4 is not among them).
CANDIDATE_MODELS = ['model_1', 'model_2', 'model_3', 'model_5']


def _best_model_by_auc(frame, candidates=CANDIDATE_MODELS):
    """
    Pick the candidate column whose scores give the highest ROC AUC on `frame`.

    Parameters:
    frame - DataFrame with a 'target' column and (some of) the candidate columns
    candidates - column names to compare

    Returns:
    The name of the best column, or None when `frame` is empty, its target has
    fewer than two distinct values, or no candidate could be scored.
    """
    if frame.empty or len(frame['target'].unique()) < 2:
        return None

    best_name = None
    best_auc = -1
    for name in candidates:
        if name not in frame.columns:
            continue
        try:
            score = roc_auc_score(frame['target'], frame[name])
        except ValueError:
            # roc_auc_score could not score this column on this slice;
            # it simply does not take part in the comparison.
            continue
        if score > best_auc:
            best_auc = score
            best_name = name
    return best_name


# Main zone: one window of WINDOW_SIZE seconds every STEP seconds, keyed by the
# window centre. np.arange excludes its stop value, so the stop is pushed one
# past the last admissible start (RIGHT_TAIL_START - WINDOW_SIZE); without that
# the final window ending at RIGHT_TAIL_START was never evaluated.
# `between` is inclusive on both ends, so a row sitting exactly on a window
# edge is counted in both neighbouring windows.
main_zone_models = {}
for window_start in np.arange(LEFT_TAIL_END, RIGHT_TAIL_START - WINDOW_SIZE + 1, STEP):
    window_end = window_start + WINDOW_SIZE
    window_data = tr[tr['time_to_order_seconds'].between(window_start, window_end)]

    best_model = _best_model_by_auc(window_data)
    if best_model:
        main_zone_models[(window_start + window_end) / 2] = best_model

# Left tail: a single model for everything below LEFT_TAIL_END.
left_tail_data = tr[tr['time_to_order_seconds'] < LEFT_TAIL_END]
left_tail_model = _best_model_by_auc(left_tail_data)

# Right tail: a single model for everything above RIGHT_TAIL_START.
right_tail_data = tr[tr['time_to_order_seconds'] > RIGHT_TAIL_START]
right_tail_model = _best_model_by_auc(right_tail_data)

# Results
print("Main zone models:", main_zone_models)
print("Left tail model:", left_tail_model)
print("Right tail model:", right_tail_model)

Main zone models: {-69500.0: 'model_3', -68500.0: 'model_2', -67500.0: 'model_3', -66500.0: 'model_2', -65500.0: 'model_3', -64500.0: 'model_5', -63500.0: 'model_5', -62500.0: 'model_3', -61500.0: 'model_1', -60500.0: 'model_5', -59500.0: 'model_5', -58500.0: 'model_5', -57500.0: 'model_3', -56500.0: 'model_3', -55500.0: 'model_5', -54500.0: 'model_5', -53500.0: 'model_3', -52500.0: 'model_1', -51500.0: 'model_1', -50500.0: 'model_5', -49500.0: 'model_3', -48500.0: 'model_3', -47500.0: 'model_5', -46500.0: 'model_5', -45500.0: 'model_1', -44500.0: 'model_2', -43500.0: 'model_3', -42500.0: 'model_3', -41500.0: 'model_1', -40500.0: 'model_5', -39500.0: 'model_5', -38500.0: 'model_1', -37500.0: 'model_5', -36500.0: 'model_1', -35500.0: 'model_2', -34500.0: 'model_1', -33500.0: 'model_1', -32500.0: 'model_3', -31500.0: 'model_2', -30500.0: 'model_3', -29500.0: 'model_1', -28500.0: 'model_2', -27500.0: 'model_5', -26500.0: 'model_3', -25500.0: 'model_1', -24500.0: 'model_5', -23500.0: 'mode

In [57]:
# Manual override: force model_3 for the windows centred at 6500..9500 seconds,
# replacing whatever the AUC-based selection stored for them.
for window_center in (6500, 7500, 8500, 9500):
    main_zone_models[window_center] = 'model_3'

In [58]:
def get_best_model(time, main_zone_models, left_tail_model, right_tail_model):
    """
    Return the name of the model column to use for one `time_to_order_seconds` value.

    Values below the global LEFT_TAIL_END get `left_tail_model`, values above
    the global RIGHT_TAIL_START get `right_tail_model`; anything in between
    gets the model stored for the window centre closest to `time` in
    `main_zone_models` (a dict: window centre -> column name).
    """
    if time < LEFT_TAIL_END:
        return left_tail_model
    if time > RIGHT_TAIL_START:
        return right_tail_model

    # Main zone: nearest window centre wins (the first one found on a tie).
    nearest_center = min(main_zone_models, key=lambda center: abs(center - time))
    return main_zone_models[nearest_center]

# Route every training row to the model chosen for its time zone / window.
tr['best_model'] = tr['time_to_order_seconds'].apply(
    get_best_model,
    args=(main_zone_models, left_tail_model, right_tail_model),
)


def _score_of_selected_model(row):
    """Return the value stored in the column that the row's `best_model` names."""
    return row[row['best_model']]


tr['final_prediction'] = tr.apply(_score_of_selected_model, axis=1)
tr

Unnamed: 0,model_1,model_2,model_3,model_4,model_5,time_to_order_seconds,order_hour,order_day_of_week,order_day_of_month,order_month,...,time_to_order_minutes,time_to_order_hours,order_time_of_day,session_end_time_of_day,same_day_order,order_season,session_end_season,target,best_model,final_prediction
0,0.966529,0.787289,0.039867,1,0.430643,-544.607,9,4,26,12,...,-9.076783,-0.151280,morning,morning,True,winter,winter,1,model_1,0.966529
1,0.437858,0.382523,0.020901,0,0.177717,1926.407,10,4,26,12,...,32.106783,0.535113,morning,morning,True,winter,winter,0,model_3,0.020901
2,0.402382,0.787289,0.039834,0,0.432202,-75.483,11,3,25,12,...,-1.258050,-0.020968,morning,morning,True,winter,winter,1,model_1,0.402382
3,0.549299,0.787289,0.040051,0,0.432350,831.153,14,4,26,12,...,13.852550,0.230876,afternoon,afternoon,True,winter,winter,1,model_3,0.040051
4,0.000000,0.051413,0.020597,0,0.174237,-232.743,18,4,26,12,...,-3.879050,-0.064651,afternoon,afternoon,True,winter,winter,0,model_1,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104590,0.356660,0.382523,0.020913,0,0.188771,2674.990,12,1,16,12,...,44.583167,0.743053,morning,morning,True,winter,winter,1,model_3,0.020913
104591,0.464438,0.382523,0.029850,0,0.175186,-41689.227,11,5,8,11,...,-694.820450,-11.580341,morning,evening,False,autumn,autumn,1,model_1,0.464438
104592,0.000000,0.051413,0.021180,0,0.191542,51.047,9,2,12,11,...,0.850783,0.014180,morning,morning,True,autumn,autumn,0,model_3,0.021180
104593,0.503576,0.787289,0.040839,0,0.452914,-304.027,17,0,10,11,...,-5.067117,-0.084452,afternoon,afternoon,True,autumn,autumn,1,model_1,0.503576


In [59]:
def get_best_model(time, main_zone_models, left_tail_model, right_tail_model,
                   left_tail_end=-70000, right_tail_start=22000):
    """
    Return the name of the model column to use for one `time_to_order_seconds` value.

    This redefines the `get_best_model` from the earlier cell; the zone
    boundaries are parameters here instead of being read from globals.

    Parameters:
    time - the time difference, in seconds
    main_zone_models - dict mapping window centre -> model column name
    left_tail_model - returned when `time` is below `left_tail_end`
    right_tail_model - returned when `time` is above `right_tail_start`
    left_tail_end, right_tail_start - zone boundaries; a value exactly on a
        boundary is handled by the main zone

    Returns:
    The model column name for the zone or nearest window containing `time`.
    """
    if time < left_tail_end:
        return left_tail_model
    if time > right_tail_start:
        return right_tail_model

    def distance_to(center):
        return abs(center - time)

    # Main zone: nearest window centre wins (the first one found on a tie).
    return main_zone_models[min(main_zone_models, key=distance_to)]

# Apply the selection learned on train to the test frame `tst`
# (it has time_to_order_seconds and model_1 ... model_5, but no target).
selection_args = (main_zone_models, left_tail_model, right_tail_model)
tst['best_model'] = tst['time_to_order_seconds'].apply(get_best_model, args=selection_args)


def _selected_score(row):
    """Return the value stored in the column that the row's `best_model` names."""
    return row[row['best_model']]


# For every row take the prediction of the model selected for it.
tst['final_prediction'] = tst.apply(_selected_score, axis=1)

tst

Unnamed: 0,model_1,model_2,model_3,model_4,model_5,time_to_order_seconds,order_hour,order_day_of_week,order_day_of_month,order_month,...,session_end_is_weekend,time_to_order_minutes,time_to_order_hours,order_time_of_day,session_end_time_of_day,same_day_order,order_season,session_end_season,best_model,final_prediction
0,0.000000,0.150558,0.042697,1,0.445707,-248.133,15,3,5,2,...,False,-4.135550,-0.068926,afternoon,afternoon,True,winter,winter,model_1,0.000000
1,0.692124,0.406432,0.022516,1,0.192479,86.593,15,5,7,2,...,True,1.443217,0.024054,afternoon,afternoon,True,winter,winter,model_3,0.022516
2,0.489911,0.815829,0.070756,0,0.403555,-61738.523,11,4,6,2,...,False,-1028.975383,-17.149590,morning,afternoon,False,winter,winter,model_1,0.489911
3,0.121135,0.150558,0.088604,1,0.417743,-1010.753,11,3,5,2,...,False,-16.845883,-0.280765,morning,morning,True,winter,winter,model_1,0.121135
4,0.488870,0.815829,0.056447,0,0.414674,-34957.897,19,6,1,2,...,True,-582.631617,-9.710527,evening,morning,True,winter,winter,model_1,0.488870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17191,0.493412,0.815829,0.969118,0,0.198277,-630665.860,16,0,9,2,...,False,-10511.097667,-175.184961,afternoon,morning,False,winter,winter,model_5,0.198277
17192,0.000000,0.150558,0.043391,0,0.414673,-274.520,14,0,9,2,...,False,-4.575333,-0.076256,afternoon,afternoon,True,winter,winter,model_1,0.000000
17193,0.692124,0.815829,0.042408,1,0.429551,-548.593,18,0,9,2,...,False,-9.143217,-0.152387,afternoon,afternoon,True,winter,winter,model_1,0.692124
17194,0.000000,0.150558,0.044496,0,0.437049,-5822.847,17,3,12,2,...,False,-97.047450,-1.617457,afternoon,afternoon,True,winter,winter,model_1,0.000000


In [60]:
# The label must not be among the features handed to the booster.
tr = tr.drop('target', axis=1)

In [63]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

# Columns passed to LightGBM as categorical features.
categorical_features = [
    'order_time_of_day',
    'session_end_time_of_day',
    'order_season',
    'session_end_season',
    'order_is_weekend',
    'session_end_is_weekend',
    'same_day_order',
    'best_model'
]

# Convert the string-valued columns to pandas 'category' dtype in both frames.
for col in ['order_season', 'session_end_season', 'best_model']:
    tr[col] = tr[col].astype('category')
    tst[col] = tst[col].astype('category')

# Values that were previously repeated as bare literals.
SEED = 42
EARLY_STOPPING_ROUNDS = 50

# Hold out 20% of train for validation, stratified on the label.
# NOTE(review): `best_model` / `final_prediction` were chosen by AUC against the
# target on ALL rows of `tr`, validation rows included, so the validation AUC
# reported below is probably optimistic - confirm with an out-of-fold selection.
X_train, X_valid, y_train, y_valid = train_test_split(
    tr, target, test_size=0.2, random_state=SEED, stratify=target
)

# LightGBM datasets; the validation set reuses the binning of the training set.
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data, categorical_feature=categorical_features)

# Model parameters (tunable). Early stopping is configured only through the
# callback below; it used to be declared here as well ('early_stopping_round'),
# which set the same patience in two places.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'min_child_samples': 20,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'seed': SEED
}

# Train with early stopping on the validation AUC, logging every 50 rounds.
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'valid'],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=True),
        lgb.log_evaluation(period=50)
    ]
)

# Validation score, predicted with the best iteration found by early stopping.
val_pred = model.predict(X_valid, num_iteration=model.best_iteration)
val_auc = roc_auc_score(y_valid, val_pred)
print(f'Validation AUC: {val_auc:.4f}')

# Test predictions, again with the best iteration.
test_pred = model.predict(tst, num_iteration=model.best_iteration)

# Save the submission file.
submission = pd.DataFrame({
    'order_id': order_id,
    'target': test_pred
})


submission.to_csv('new_fuck_this_shit.csv', index=False)

Training until validation scores don't improve for 50 rounds
[50]	train's auc: 0.958079	valid's auc: 0.956021
[100]	train's auc: 0.960758	valid's auc: 0.956865
[150]	train's auc: 0.963416	valid's auc: 0.957101
[200]	train's auc: 0.965605	valid's auc: 0.957349
[250]	train's auc: 0.967349	valid's auc: 0.957333
[300]	train's auc: 0.969005	valid's auc: 0.957371
Early stopping, best iteration is:
[261]	train's auc: 0.967766	valid's auc: 0.957426
Validation AUC: 0.9574
