In [13]:
import pandas as pd
import numpy as np
from itertools import product
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
import random


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/sbermarket-internship-competition/sample_submission.csv
/kaggle/input/sbermarket-internship-competition/train.csv


In [14]:
train = pd.read_csv('/kaggle/input/sbermarket-internship-competition/train.csv')
test = pd.read_csv('/kaggle/input/sbermarket-internship-competition/sample_submission.csv')
train['order_completed_at'] = pd.to_datetime(train['order_completed_at'])
train.head()

Unnamed: 0,user_id,order_completed_at,cart
0,2,2015-03-22 09:25:46,399
1,2,2015-03-22 09:25:46,14
2,2,2015-03-22 09:25:46,198
3,2,2015-03-22 09:25:46,88
4,2,2015-03-22 09:25:46,157


**EDA**

In [3]:
train.describe(include='all')

Unnamed: 0,user_id,order_completed_at,cart
count,3123064.0,3123064,3123064.0
mean,7253.373,2020-04-09 01:17:00.182836992,227.3235
min,0.0,2015-03-22 09:25:46,0.0
25%,2884.0,2020-02-03 06:03:43,42.0
50%,6055.0,2020-05-19 06:35:20,146.0
75%,11172.0,2020-07-14 04:50:22,399.0
max,19999.0,2020-09-03 23:45:45,880.0
std,5337.838,,211.2867


In [None]:
print(train.shape)
print(train.isna().sum().sum())

In [None]:
train['user_id'].value_counts()

In [None]:
categs = train['cart'].value_counts()
categs.head()

In [None]:
import matplotlib.pyplot as plt

order_counts = train['user_id'].value_counts()
plt.figure(figsize=(10, 6))
plt.hist(order_counts, bins=50)
plt.xlabel('Количество заказов на пользователя')
plt.ylabel('Частота')
plt.title('Распределение количества заказов по пользователям')
plt.show()


Добавление искуственных данных

In [15]:
train = train.sort_values(by=['user_id', 'order_completed_at'])

train['target'] = 1

new_rows = []

for user_id, user_orders in train.groupby('user_id'):
    
    user_orders_by_date = user_orders.groupby('order_completed_at')
    
    all_previous_items = set()
    
    for order_date, order_group in user_orders_by_date:
        current_order_items = set(order_group['cart'])
        
        not_ordered_items = all_previous_items - current_order_items

        num_items_to_add = min(len(not_ordered_items), len(current_order_items))

        items_to_add = random.sample(not_ordered_items, num_items_to_add)

        for item in items_to_add:
            new_row = {
                'user_id': user_id,
                'cart': item,  
                'order_completed_at': order_date,
                'target': 0,
            }
            new_rows.append(new_row)

        all_previous_items.update(current_order_items)


new_data = pd.DataFrame(new_rows)

train_with_zeros = pd.concat([train, new_data], ignore_index=True)

print(train_with_zeros.head())

since Python 3.9 and will be removed in a subsequent version.
  items_to_add = random.sample(not_ordered_items, num_items_to_add)


   user_id  order_completed_at  cart  target
0        0 2020-07-19 09:59:17    20       1
1        0 2020-07-19 09:59:17    82       1
2        0 2020-07-19 09:59:17   441       1
3        0 2020-07-19 09:59:17    57       1
4        0 2020-07-19 09:59:17    14       1


In [None]:
print(train_with_zeros.shape)
print(train_with_zeros.isna().sum().sum())


**Train Validation Split**

In [16]:
cutoff_date = '2020-07-14'

# Разделим на тренировочную и валидационную выборки, используя дату третьего квартиля 
train_set = train_with_zeros[train_with_zeros['order_completed_at'] < cutoff_date].copy()
val_set = train_with_zeros[train_with_zeros['order_completed_at'] >= cutoff_date].copy()
print(f"Количество записей в train_set: {len(train_set)}")
print(f"Количество записей в val_set: {len(val_set)}")
print(f"Количество записей в test: {len(test)}")

Количество записей в train_set: 4214901
Количество записей в val_set: 1494074
Количество записей в test: 790449


**Add date/time to test validation**

In [27]:
validation_end_date = pd.Timestamp('2020-09-03 23:45:45')


train_set = train_set.sort_values(['user_id', 'order_completed_at'])
train_set['time_diff'] = train_set.groupby('user_id')['order_completed_at'].diff()


avg_order_interval = train_set['time_diff'].mean(skipna=True)
print(train_set.head())
print(train_set.isna().sum())

train_set['time_diff'] = train_set['time_diff'].fillna(avg_order_interval)

print(train_set.head())
print(train_set.isna().sum())

user_avg_interval = train_set.groupby('user_id')['time_diff'].mean().reset_index()
user_avg_interval.columns = ['user_id', 'avg_order_interval']


test = test.merge(user_avg_interval, on='user_id', how='left')

test['avg_order_interval'] = test['avg_order_interval'].fillna(avg_order_interval)

test['order_completed_at'] = validation_end_date + test['avg_order_interval']
test.head()

    user_id  order_completed_at  cart  target         time_diff
44        1 2019-05-08 16:09:41    55       1               NaT
45        1 2020-01-17 14:44:23    82       1 253 days 22:34:42
46        1 2020-01-17 14:44:23   798       1   0 days 00:00:00
47        1 2020-01-17 14:44:23    86       1   0 days 00:00:00
48        1 2020-01-17 14:44:23   421       1   0 days 00:00:00
user_id                   0
order_completed_at        0
cart                      0
target                    0
time_diff             18161
dtype: int64
    user_id  order_completed_at  cart  target                 time_diff
44        1 2019-05-08 16:09:41    55       1 0 days 14:19:30.226160305
45        1 2020-01-17 14:44:23    82       1         253 days 22:34:42
46        1 2020-01-17 14:44:23   798       1           0 days 00:00:00
47        1 2020-01-17 14:44:23    86       1           0 days 00:00:00
48        1 2020-01-17 14:44:23   421       1           0 days 00:00:00
user_id               0
order_c

KeyError: 'user_id'

**Feature Engineering**

In [None]:
test[['user_id', 'cart']] = test['id'].str.split(';', expand=True)
test['user_id'] = test['user_id'].astype(int)

print('Пользователей в train_set: ', train_set['user_id'].nunique())
print('Пользователей в val_set: ', val_set['user_id'].nunique())
print('Пользователей в test: ', test['user_id'].nunique())


val_users_not_in_train = set(val_set['user_id']) - set(train_set['user_id'])
print(f"Количество пользователей в val_set, но отсутствующих в train_set: {len(val_users_not_in_train)}")

test_users_not_in_train = set(test['user_id']) - set(train_set['user_id'])
print(f"Количество пользователей в test, но отсутствующих в train_set: {len(test_users_not_in_train)}")


Средннее количество позиций в заказах пользователя

In [None]:
order_sizes = train_set.groupby(['user_id', 'order_completed_at'])['cart'].nunique().reset_index(name='order_size')
avg_order_size = order_sizes.groupby('user_id')['order_size'].mean().reset_index(name='avg_order_size')

train_set = train_set.merge(avg_order_size, on='user_id', how='left')
val_set = val_set.merge(avg_order_size, on='user_id', how='left')

# train_set.head()
val_set.head()

In [None]:
test = test.merge(avg_order_size, on='user_id', how='left')
test['avg_order_size'].isna().sum()

**Количество уникальных категорий, которые покупал каждый пользователь**

In [None]:
user_category_counts = train_set.groupby('user_id')['cart'].nunique().reset_index()
user_category_counts.columns = ['user_id', 'unique_category_count']

train_set = train_set.merge(user_category_counts, on='user_id', how='left')
val_set = val_set.merge(user_category_counts, on='user_id', how='left')
train_set.head()

In [None]:
test = test.merge(user_category_counts, on='user_id', how='left')
test['unique_category_count'].isna().sum()
# test.head()

Последняя дата покупки для каждой пары (user_id, category_id)

In [None]:
last_order_date = train_set.groupby(['user_id', 'cart'])['order_completed_at'].max().reset_index()
last_order_date.columns = ['user_id', 'cart', 'last_order_date']

train_set = train_set.merge(last_order_date, on=['user_id', 'cart'], how='left')
val_set = val_set.merge(last_order_date, on=['user_id', 'cart'], how='left')
train_set.head()

In [None]:
last_order_date['cart'] = last_order_date['cart'].astype(str)
test['cart'] = test['cart'].astype(str)
test = test.merge(last_order_date, on=['user_id', 'cart'], how='left')
test['last_order_date'].isna().sum()

Сколько дней прошло с момента заказа этой категории 

In [None]:
train_set['days_since_last_order'] = (
    pd.to_datetime(train_set['order_completed_at']) - pd.to_datetime(train_set['last_order_date'])
).dt.days

train_set.head()  #to do

In [None]:
test_set['days_since_last_order'] = (
    test_set['order_completed_at'] - test_set['last_order_date']
).dt.days

Сколько раз пользователь заказывал данный товар

In [None]:
user_cart_counts = train_set.groupby(['user_id', 'cart']).size().reset_index(name='user_cart_counts')
train_set = train_set.merge(user_cart_counts, on=['user_id', 'cart'], how='left')

train_set.head()

In [None]:
train_set = train_set.merge(user_cart_counts, on=['user_id', 'cart'], how='left')
test_set.head()

In [None]:
# Заполнение пустых значений


default_values = {
    'avg_order_size': 0,              
    'unique_category_count': 0,       
    'last_order_date': pd.Timestamp('1900-01-01')  # Условная дата для отсутствия заказов у новых пользователей
}


val_set.fillna(default_values, inplace=True)
test.fillna(default_values, inplace=True)

In [None]:
train_set['target'].value_counts()

In [None]:
# train_set['target'] = 1

X_train = train_set.drop(columns=['target', 'order_completed_at'])
y_train = train_set['target']
X_val = val_set.drop(columns=['target', 'order_completed_at'])
y_val = val_set['target']

print(X_train.head())
print(X_val.head())

In [None]:
daily_orders = train.groupby(train_set['order_completed_at'].dt.date)['cart'].count()

# Построение линейного графика
plt.figure(figsize=(10, 5))
daily_orders.plot(kind='line')
plt.title('Количество заказов по дням')
plt.xlabel('Дата')
plt.ylabel('Количество заказов')
plt.grid(True)
plt.show()

Pipeline

In [None]:
from sklearn.preprocessing import OrdinalEncoder

numeric_features = ['avg_order_size', 'unique_category_count']
categorical_features = ['user_id', 'cart']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())  
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
import lightgbm as lgb
import xgboost as xgb


models = {
    "RandomForest": RandomForestClassifier(random_state=42, class_weight='balanced'),
    "LightGBM": lgb.LGBMClassifier(random_state=42, class_weight='balanced'),
    "XGBoost": xgb.XGBClassifier(random_state=42, scale_pos_weight=y_train.value_counts()[0] / y_train.value_counts()[1])
}


def evaluate_model(model, X_train, y_train, X_val, y_val, preprocessor):
    
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    
    pipeline.fit(X_train, y_train)
    
    
    y_train_pred = pipeline.predict(X_train)
    print(f"Training Classification Report for {model.__class__.__name__}:")
    print(classification_report(y_train, y_train_pred))
    
    
    y_val_pred = pipeline.predict(X_val)
    print(f"Validation Classification Report for {model.__class__.__name__}:")
    print(classification_report(y_val, y_val_pred))


for model_name, model in models.items():
    print(f"\nEvaluating model: {model_name}")
    evaluate_model(model, X_train, y_train, X_val, y_val, preprocessor)


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import lightgbm as lgb


param_dist = {
    'classifier__num_leaves': [31, 50, 100],  # Количество листьев в дереве
    'classifier__max_depth': [-1, 5, 10, 20],  # Максимальная глубина деревьев
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],  # Скорость обучения
    'classifier__n_estimators': [50, 100, 200],  # Количество деревьев
    'classifier__subsample': [0.6, 0.8, 1.0],  # Часть данных для каждого дерева
    'classifier__colsample_bytree': [0.6, 0.8, 1.0],  # Часть признаков для каждого дерева
}


pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  
    ('classifier', lgb.LGBMClassifier(random_state=42, class_weight='balanced'))
])


random_search = RandomizedSearchCV(
    pipeline, 
    param_distributions=param_dist, 
    n_iter=20,                # Количество итераций поиска
    scoring='f1_weighted',    
    cv=3,                     
    random_state=42,          
    n_jobs=-1,                
    verbose=2                 
)


random_search.fit(X_train, y_train)


best_model = random_search.best_estimator_


print("Лучшие параметры:", random_search.best_params_)


y_train_pred = best_model.predict(X_train)
print("Training Classification Report:")
print(classification_report(y_train, y_train_pred))

y_val_pred = best_model.predict(X_val)
print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred))


In [None]:
test.head()

In [None]:
# ids = test['id']
# test = test.drop(columns=['id', 'target'])

# X_test = preprocessor.transform(test)
y_test_pred = best_model.predict(test)

submission_df = pd.DataFrame({
    'id': ids,
    'target': y_test_pred
})

print(submission_df.head())
# print(test.head())
# print(X_val.head())

In [None]:
submission_df.to_csv('submission.csv', index=False)

In [None]:
from shutil import move
move('submission.csv', '/kaggle/working/submission.csv')