## 1. Data Cleaning

In [93]:
import sys
print(sys.executable)

c:\Users\anton\AppData\Local\Programs\Python\Python313\python.exe


In [94]:
import pandas as pd
import numpy as np
import xgboost as xgb
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score


In [95]:
DOUBLE11_DAY = 184

DIR_1 = 'data_format1/'
DIR_2 = 'data_format2/'

PATH_TRAIN = DIR_1 + 'train_format1.csv'
PATH_TEST = DIR_1 + 'test_format1.csv'
PATH_USER_INFO = DIR_1 + 'user_info_format1.csv'
PATH_USER_LOG = DIR_1 + 'user_log_format1.csv'

In [96]:
df_train = pd.read_csv(PATH_TRAIN)    
df_test = pd.read_csv(PATH_TEST) # this data has nan for the probabilities, the task is to predict the nan values. 
df_user_info = pd.read_csv(PATH_USER_INFO)
df_user_log = pd.read_csv(PATH_USER_LOG)

In [97]:
df_test['type'] = 'test'
df_train['type'] = 'train'

df_test = df_test.drop(columns=['prob'])

df = pd.concat([df_train, df_test]) # so that we add all the features to both dataframes, and then when we train we can divide between test and train again. 

In [98]:
print(len(df_test))

261477


In [99]:
print(len(df_train))

260864


In [100]:
print(df_train.head()) 

   user_id  merchant_id  label   type
0    34176         3906      0  train
1    34176          121      0  train
2    34176         4356      1  train
3    34176         2217      0  train
4   230784         4818      0  train


In [101]:
print(df_test.head()) 

   user_id  merchant_id  type
0   163968         4605  test
1   360576         1581  test
2    98688         1964  test
3    98688         3645  test
4   295296         3361  test


In [102]:
print(df.head())

   user_id  merchant_id  label   type
0    34176         3906    0.0  train
1    34176          121    0.0  train
2    34176         4356    1.0  train
3    34176         2217    0.0  train
4   230784         4818    0.0  train


In [103]:
print(df_user_info.head()) 

   user_id  age_range  gender
0   376517        6.0     1.0
1   234512        5.0     0.0
2   344532        5.0     0.0
3   186135        5.0     0.0
4    30230        5.0     0.0


In [104]:
df_user_log.rename(columns={'seller_id' : 'merchant_id'}, inplace=True)
print(df_user_log.head())

   user_id  item_id  cat_id  merchant_id  brand_id  time_stamp  action_type
0   328862   323294     833         2882    2661.0         829            0
1   328862   844400    1271         2882    2661.0         829            0
2   328862   575153    1271         2882    2661.0         829            0
3   328862   996875    1271         2882    2661.0         829            0
4   328862  1086186    1271         1253    1049.0         829            0


In [105]:
memory_original = df_user_log.memory_usage().sum() / 2**30

In [106]:
df_user_log['brand_id'] = df_user_log['brand_id'].fillna(0) #has nan values and decimals, no values equal to 0 (can use 0 as nan)
df_user_log['time_stamp']  = (pd.to_datetime(df_user_log['time_stamp'], format='%m%d') - pd.to_datetime(df_user_log['time_stamp'].min(), format= '%m%d')).dt.days # adds uneceary year (1900) but we want the benefits of the datetime operations.
#after we get a number of days from the earliest date in the dataset. It is just an int that is easy to work with. 

In [107]:
df_user_log['user_id'] = df_user_log['user_id'].astype('int32') # reduce memory usage and speed up calculations
df_user_log['item_id'] = df_user_log['item_id'].astype('int32')
df_user_log['cat_id'] = df_user_log['cat_id'].astype('int16')
df_user_log['merchant_id'] = df_user_log['merchant_id'].astype('int16')
df_user_log['brand_id'] = df_user_log['brand_id'].astype('int16')
df_user_log['time_stamp'] = df_user_log['time_stamp'].astype('int16')
df_user_log['action_type'] = df_user_log['action_type'].astype('int8')

In [108]:
memory_optimized = round(df_user_log.memory_usage().sum() / 2**30,2)
memory_saved = round((memory_original - memory_optimized), 2)
print(f'{memory_optimized} GB ({memory_saved} GB saved)')

0.87 GB (1.99 GB saved)


In [109]:
df_user_info['gender'] = df_user_info['gender'].fillna(2) #0 for female, 1 for male, 2 and NULL for unknown.
df_user_info['gender'] = df_user_info['gender'].astype('int8')
df_user_info['age_range'] = df_user_info['age_range'].fillna(0) # 1 for <18; 2 for [18,24]; 3 for [25,29]; 4 for [30,34]; 5 for [35,39]; 6 for [40,49]; 7 and 8 for >= 50; 0 and NULL for unknown.
df_user_info['age_range'] = df_user_info['age_range'].astype('int8')
df_user_info['user_id'] = df_user_info['user_id'].astype('int32')

## 2. Feature Engineering

In [110]:
# features for each merchant and user
df_user_log['time_month'] = df_user_log['time_stamp'] // 30         # make a new column that represents the month
df_user_log['time_week'] = df_user_log['time_stamp'] // 7           # make a new column that represents the week

merchants = df_user_log.groupby('merchant_id')                      # Calculate total sales, number of transactions, average transaction value per merchant.
users = df_user_log.groupby('user_id')                              # Calculate statistics per user, such as total activity duration, number of actions, etc
merchants_users = df_user_log.groupby(['merchant_id', 'user_id'])   # Understand user behavior with specific merchants.


In [111]:
double11 = df_user_log[df_user_log['time_stamp'] == DOUBLE11_DAY].reset_index(drop=True) # get the data for the double 11 day
double11_merchants = double11.groupby('merchant_id') 
double11_users = double11.groupby('user_id')
double11_merchants_users = double11.groupby(['merchant_id', 'user_id'])

##### Unique merchant and user featues

In [118]:
to_merge = merchants.nunique().reset_index()
to_merge = to_merge.rename(columns={
    'item_id': 'items_merchant', 
    'cat_id': 'categories_merchant',
    'user_id': 'users_merchant',
    'brand_id': 'brands_merchant',
    'time_stamp': 'dates_merchant',
    'time_period': 'periods_merchant',
    'action_type': 'action_types_merchant'
    })
df = df.merge(to_merge, on='merchant_id', how='left')

to_merge = users.nunique().reset_index()            # 5 minutes. summary of unique counts per user
to_merge = to_merge.rename(columns={
    'item_id': 'items_user', 
    'cat_id': 'categories_user',
    'merchant_id': 'merchants_user',
    'brand_id': 'brands_user',
    'time_stamp': 'dates_user',
    'time_month': 'months_user',
    'time_week': 'weeks_user',
    'action_type': 'action_types_user'
})
df = df.merge(to_merge, on='user_id', how='left')

to_merge = merchants_users.nunique().reset_index()
to_merge = to_merge.rename(columns={
    'item_id': 'items_user_merchant', 
    'cat_id': 'categories_user_merchant',
    'brand_id': 'brands_user_merchant',
    'time_stamp': 'dates_user_merchant',
    'time_month': 'months_merchant',
    'time_week': 'weeks_merchant',
    'action_type': 'action_types_user_merchant'
    })
df = df.merge(to_merge, on=['user_id', 'merchant_id'], how='left')

to_merge = users['action_type'].value_counts().unstack(fill_value=0)
to_merge = to_merge.rename(columns={
    0: 'clicks_user',
    1: 'add_to_carts_user',
    2: 'purchases_user',
    3: 'add_to_favorites_user'
})
df = df.merge(to_merge, on='user_id', how='left')   

to_merge = merchants['action_type'].value_counts().unstack(fill_value=0)
to_merge = to_merge.rename(columns={
    0: 'clicks_merchant',
    1: 'add_to_carts_merchant',
    2: 'purchases_merchant',
    3: 'add_to_favorites_merchant'
})
df = df.merge(to_merge, on='merchant_id', how='left')

to_merge = merchants_users['action_type'].value_counts().unstack(fill_value=0)
to_merge = to_merge.rename(columns={
    0: 'clicks_user_merchant',
    1: 'add_to_carts_user_merchant',
    2: 'purchases_user_merchant',
    3: 'add_to_favorites_user_merchant'
})
df = df.merge(to_merge, on=['user_id', 'merchant_id'], how='left')

##### Ratio features

In [119]:
EPS = 1e-8

# ratio of actions from a merchant perspective
df['clicks_in_user_ratio'] = df['clicks_user_merchant'] / (df['clicks_merchant'] + EPS)
df['carts_in_user_ratio'] = df['add_to_carts_user_merchant'] / (df['add_to_carts_merchant'] + EPS)
df['purchases_in_user_ratio'] = df['purchases_user_merchant'] / (df['purchases_merchant'] + EPS)
df['favourites_in_user_ratio'] = df['add_to_favorites_user_merchant'] / (df['add_to_favorites_merchant'] + EPS)

# ratio of actions from a user perspective
df['clicks_in_merchant_ratio'] = df['clicks_user_merchant'] / (df['clicks_user'] + EPS)
df['carts_in_merchant_ratio'] = df['add_to_carts_user_merchant'] / (df['add_to_carts_user'] + EPS)
df['purchases_in_merchant_ratio'] = df['purchases_user_merchant'] / (df['purchases_user'] + EPS)
df['favourites_in_merchant_ratio'] = df['add_to_favorites_user_merchant'] / (df['add_to_favorites_user'] + EPS)

# ratio of action types for every merchant
df['temporary_total_actions_merchant'] = (df['clicks_merchant'] + df['add_to_carts_merchant'] + df['purchases_merchant'] + df['add_to_favorites_merchant'] + EPS)
df['clicks_ratio_merchant'] = df['clicks_merchant'] / (df['temporary_total_actions_merchant'])
df['carts_ratio_merchant'] = df['add_to_carts_merchant'] / (df['temporary_total_actions_merchant'])
df['purchases_ratio_merchant'] = df['purchases_merchant'] / (df['temporary_total_actions_merchant'])
df['favourites_ratio_merchant'] = df['add_to_favorites_merchant'] / (df['temporary_total_actions_merchant'])
df.drop('temporary_total_actions_merchant', axis=1, inplace=True)

# ratio of action types for every user
df['temporary_total_actions_user'] = (df['clicks_user'] + df['add_to_carts_user'] + df['purchases_user'] + df['add_to_favorites_user'] + EPS)
df['clicks_ratio_user'] = df['clicks_user'] / (df['temporary_total_actions_user'])
df['carts_ratio_user'] = df['add_to_carts_user'] / (df['temporary_total_actions_user'])
df['purchases_ratio_user'] = df['purchases_user'] / (df['temporary_total_actions_user'])
df['favourites_ratio_user'] = df['add_to_favorites_user'] / (df['temporary_total_actions_user'])
df.drop('temporary_total_actions_user', axis=1, inplace=True)

# ratio of action types for every user-merchant pair
df['temporary_total_actions_user_merchant'] = (df['clicks_user_merchant'] + df['add_to_carts_user_merchant'] + df['purchases_user_merchant'] + df['add_to_favorites_user_merchant'] + EPS)
df['clicks_ratio_user_merchant'] = df['clicks_user_merchant'] / (df['temporary_total_actions_user_merchant'])
df['carts_ratio_user_merchant'] = df['add_to_carts_user_merchant'] / (df['temporary_total_actions_user_merchant'])
df['purchases_ratio_user_merchant'] = df['purchases_user_merchant'] / (df['temporary_total_actions_user_merchant'])
df['favourites_ratio_user_merchant'] = df['add_to_favorites_user_merchant'] / (df['temporary_total_actions_user_merchant'])
df.drop('temporary_total_actions_user_merchant', axis=1, inplace=True)

In [112]:
# 1. total_repeat_purchases_user

# Count how many items a user purchased multiple times
user_purchases = df_user_log[df_user_log['action_type'] == 2]
user_item_purchase_counts = user_purchases.groupby(['user_id', 'item_id']).size().reset_index(name='purchase_count')
repeat_counts = user_item_purchase_counts[user_item_purchase_counts['purchase_count'] > 1].groupby('user_id')['item_id'].size().reset_index(name='total_repeat_purchases_user')
df = df.merge(repeat_counts, on='user_id', how='left').fillna({'total_repeat_purchases_user': 0})

# 2. is_top_merchant_user (1 if this merchant is the top one for that user by total interactions)
user_merchant_interactions = merchants_users.size().reset_index(name='interaction_count')
top_merchant_user_idx = user_merchant_interactions.loc[user_merchant_interactions.groupby('user_id')['interaction_count'].idxmax()]
top_merchant_user_idx['is_top_merchant_user'] = 1
df = df.merge(top_merchant_user_idx[['user_id', 'merchant_id', 'is_top_merchant_user']], on=['user_id', 'merchant_id'], how='left').fillna({'is_top_merchant_user':0})

# 3. category_overlap_user_merchant (40 sek)
user_categories = df_user_log.groupby('user_id')['cat_id'].apply(set).reset_index(name='user_cats_set')
merchant_categories = df_user_log.groupby('merchant_id')['cat_id'].apply(set).reset_index(name='merchant_cats_set')
df = df.merge(user_categories, on='user_id', how='left')
df = df.merge(merchant_categories, on='merchant_id', how='left')
df['category_overlap_user_merchant'] = df.apply(                                                    # Calculate overlap
    lambda row: len(row['user_cats_set'].intersection(row['merchant_cats_set'])) 
    if pd.notnull(row['user_cats_set']) and pd.notnull(row['merchant_cats_set']) else 0, axis=1
)
df = df.drop(['user_cats_set', 'merchant_cats_set'], axis=1) 

# Get unique brands per user
user_brands = df_user_log.groupby('user_id')['brand_id'].apply(set).reset_index(name='user_brands_set')
merchant_brands = df_user_log.groupby('merchant_id')['brand_id'].apply(set).reset_index(name='merchant_brands_set')
df = df.merge(user_brands, on='user_id', how='left')
df = df.merge(merchant_brands, on='merchant_id', how='left')
df['brand_overlap_user_merchant'] = df.apply(
    lambda row: len(row['user_brands_set'].intersection(row['merchant_brands_set'])) 
    if pd.notnull(row['user_brands_set']) and pd.notnull(row['merchant_brands_set']) else 0, axis=1
)
df = df.drop(['user_brands_set', 'merchant_brands_set'], axis=1)


# # Reference date (assuming the latest timestamp in the dataset)
# reference_date = df_user_log['time_stamp'].max()
# user_merchant_purchases = df_user_log[df_user_log['action_type'] == 2]
# last_purchase = user_merchant_purchases.groupby(['user_id', 'merchant_id'])['time_stamp'].max().reset_index(name='last_purchase_date_user_merchant')
# df = df.merge(last_purchase, on=['user_id', 'merchant_id'], how='left')
# df['R_recency_user_merchant'] = (reference_date - df_user_log['last_purchase_date_user_merchant'])
# df['R_recency_user_merchant'] = df['R_recency_user_merchant'].fillna(-1)  # or another appropriate value





##### Interval feature

In [113]:
to_merge = (users['time_stamp'].max() - users['time_stamp'].min()).rename('action_interval')
df = df.merge(to_merge, on='user_id', how='left')

## 3. Training

In [121]:
# df_train_small = df.sample(frac=1, random_state=42)

df_train_features, df_test_features = df[df['type'] == 'train'], df[df['type'] == 'test']

df_train_features = df_train_features.drop(columns='type')
df_test_features = df_test_features.drop(columns='type')

X, y = df_train_features.drop(columns='label'), df_train_features['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# define the model parameters
params = {
    'max_depth': 7,
    'n_estimators': 2000,
    'min_child_weight': 200,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'eta': 0.04,
    'objective': 'binary:logistic',
    'eval_metric': ['auc', 'logloss'], #ensures the model optimizes performance based on AUC, a robust metric for classification tasks with imbalanced data
    'seed': 42,
    'early_stopping_rounds': 50
}

# train the model
model = xgb.XGBClassifier(**params)
model.fit(X_train, y_train,eval_set=[(X_test, y_test)], verbose=True)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean squared error: %.2f" % mse)



[0]	validation_0-auc:0.63507	validation_0-logloss:0.26318
[1]	validation_0-auc:0.64587	validation_0-logloss:0.26069
[2]	validation_0-auc:0.65538	validation_0-logloss:0.25828
[3]	validation_0-auc:0.65388	validation_0-logloss:0.25605
[4]	validation_0-auc:0.65486	validation_0-logloss:0.25396
[5]	validation_0-auc:0.65601	validation_0-logloss:0.25195
[6]	validation_0-auc:0.65731	validation_0-logloss:0.25006
[7]	validation_0-auc:0.65738	validation_0-logloss:0.24830
[8]	validation_0-auc:0.65724	validation_0-logloss:0.24662
[9]	validation_0-auc:0.65726	validation_0-logloss:0.24506
[10]	validation_0-auc:0.65767	validation_0-logloss:0.24356
[11]	validation_0-auc:0.65735	validation_0-logloss:0.24215
[12]	validation_0-auc:0.65800	validation_0-logloss:0.24078
[13]	validation_0-auc:0.65798	validation_0-logloss:0.23951
[14]	validation_0-auc:0.65756	validation_0-logloss:0.23834
[15]	validation_0-auc:0.65766	validation_0-logloss:0.23723
[16]	validation_0-auc:0.65805	validation_0-logloss:0.23617
[17]	va

## 4. Predicting 

In [177]:
features, labels = df_test_features.drop(columns='label'), df_test_features['label']

pred = model.predict(features)


print(len(pred))

pred.mean()



261477


np.float64(3.059542521904412e-05)