## 1. Data Cleaning

In [15]:
import pandas as pd
import numpy as np

In [3]:
# Day index of the "Double 11" sale in the re-based time_stamp column
# (time_stamp is later converted to days since the earliest log date).
# NOTE(review): 184 presumably corresponds to Nov 11 with the log starting on May 11 - confirm.
DOUBLE11_DAY = 184

# Root folders of the two dataset layouts (paths are relative to the notebook).
DIR_1 = 'data_format1/'
DIR_2 = 'data_format2/'

# Input CSV files, all taken from the format-1 layout.
PATH_TRAIN = f'{DIR_1}train_format1.csv'
PATH_TEST = f'{DIR_1}test_format1.csv'
PATH_USER_INFO = f'{DIR_1}user_info_format1.csv'
PATH_USER_LOG = f'{DIR_1}user_log_format1.csv'

In [4]:
# Read the four raw tables with pandas' default CSV settings.
# df_test carries NaN in its probability column: filling those values in is the prediction task.
df_train, df_test, df_user_info, df_user_log = (
    pd.read_csv(csv_path)
    for csv_path in (PATH_TRAIN, PATH_TEST, PATH_USER_INFO, PATH_USER_LOG)
)

In [5]:
# Preview the first rows of the training table.
print(df_train.head()) 

   user_id  merchant_id  label
0    34176         3906      0
1    34176          121      0
2    34176         4356      1
3    34176         2217      0
4   230784         4818      0


In [6]:
# Preview the first rows of the test table.
print(df_test.head()) 

   user_id  merchant_id  prob
0   163968         4605   NaN
1   360576         1581   NaN
2    98688         1964   NaN
3    98688         3645   NaN
4   295296         3361   NaN


In [7]:
# Preview the first rows of the user-profile table.
print(df_user_info.head()) 

   user_id  age_range  gender
0   376517        6.0     1.0
1   234512        5.0     0.0
2   344532        5.0     0.0
3   186135        5.0     0.0
4    30230        5.0     0.0


In [8]:
# Rename the log's 'seller_id' column to 'merchant_id' so it shares its key name
# with df_train / df_test and can be merged on it later.
df_user_log.rename(columns={'seller_id' : 'merchant_id'}, inplace=True)
# Preview the first rows of the renamed log.
print(df_user_log.head())

   user_id  item_id  cat_id  merchant_id  brand_id  time_stamp  action_type
0   328862   323294     833         2882    2661.0         829            0
1   328862   844400    1271         2882    2661.0         829            0
2   328862   575153    1271         2882    2661.0         829            0
3   328862   996875    1271         2882    2661.0         829            0
4   328862  1086186    1271         1253    1049.0         829            0


In [9]:
# Memory footprint of the log before dtype optimisation: total bytes divided by 2**30.
memory_original = df_user_log.memory_usage().sum() / 2**30

In [10]:
# brand_id has missing values and is stored as float. Per the original author's note,
# 0 never occurs as a real brand id, so it is used as the "missing" sentinel.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that chained in-place form warns on pandas 2.x and does not modify
# the frame under Copy-on-Write.
df_user_log['brand_id'] = df_user_log['brand_id'].fillna(0)

# time_stamp is an integer in MMDD form (e.g. 829 -> Aug 29). Convert it to the number
# of days elapsed since the earliest date in the log, which is a plain int that is easy
# to work with. Parsing adds an arbitrary year (1900), but it cancels in the subtraction.
# Only the distinct codes are parsed (far fewer than the number of rows) and each is
# zero-padded to four digits so that '%m%d' is unambiguous for 3-digit values.
mmdd_codes = pd.Series(df_user_log['time_stamp'].unique())
parsed_dates = pd.to_datetime(mmdd_codes.astype(str).str.zfill(4), format='%m%d')
day_offsets = (parsed_dates - parsed_dates.min()).dt.days
day_lookup = pd.Series(day_offsets.to_numpy(), index=mmdd_codes.to_numpy())
df_user_log['time_stamp'] = df_user_log['time_stamp'].map(day_lookup)

In [12]:
# Downcast every log column to a narrower integer type to reduce memory usage
# and speed up the later calculations.
compact_dtypes = {
    'user_id': 'int32',
    'item_id': 'int32',
    'cat_id': 'int16',
    'merchant_id': 'int16',
    'brand_id': 'int16',
    'time_stamp': 'int16',
    'action_type': 'int8',
}
for column, dtype in compact_dtypes.items():
    df_user_log[column] = df_user_log[column].astype(dtype)

In [13]:
# Footprint after downcasting (bytes / 2**30, rounded to 2 decimals) and the
# difference to the footprint recorded before the optimisation.
optimized_bytes = df_user_log.memory_usage().sum()
memory_optimized = round(optimized_bytes / 2**30, 2)
memory_saved = round(memory_original - memory_optimized, 2)
print(f'{memory_optimized} GB ({memory_saved} GB saved)')

0.87 GB (1.99 GB saved)


In [17]:
# The fillna results are assigned back rather than using fillna(inplace=True) on a
# column selection: that chained in-place form warns on pandas 2.x and does not
# modify the frame under Copy-on-Write, which would make the int casts below fail on NaN.

# gender: 0 for female, 1 for male, 2 and NULL for unknown -> map NULL to 2.
df_user_info['gender'] = df_user_info['gender'].fillna(2).astype('int8')

# age_range: 1 for <18; 2 for [18,24]; 3 for [25,29]; 4 for [30,34]; 5 for [35,39];
# 6 for [40,49]; 7 and 8 for >= 50; 0 and NULL for unknown -> map NULL to 0.
df_user_info['age_range'] = df_user_info['age_range'].fillna(0).astype('int8')

df_user_info['user_id'] = df_user_info['user_id'].astype('int32')

## 2. Feature Engineering

In [28]:
# features for each merchant and user
# Coarser time buckets derived from the day index:
# consecutive 30-day blocks ("months") and 7-day blocks (weeks).
df_user_log['time_month'] = df_user_log['time_stamp'].floordiv(30)
df_user_log['time_week'] = df_user_log['time_stamp'].floordiv(7)

# Reusable groupings of the log; the feature cells below aggregate over them.
merchants = df_user_log.groupby(by='merchant_id')                    # one group per merchant
users = df_user_log.groupby(by='user_id')                            # one group per user
merchants_users = df_user_log.groupby(by=['merchant_id', 'user_id'])  # one group per (merchant, user) pair


In [None]:
# Keep only the log rows recorded on the Double 11 day, with a fresh 0..n-1 index.
is_double11 = df_user_log['time_stamp'].eq(DOUBLE11_DAY)
double11 = df_user_log.loc[is_double11].reset_index(drop=True)

# Same three groupings as for the full log, restricted to that day.
double11_merchants = double11.groupby(by='merchant_id')
double11_users = double11.groupby(by='user_id')
double11_merchants_users = double11.groupby(by=['merchant_id', 'user_id'])

##### Unique merchant and user features

In [30]:
# --- Distinct-value counts -------------------------------------------------
# For each grouping, count the distinct values of every other log column and
# attach the counts to df_train under level-specific names.
#
# The log has 'time_month' and 'time_week' columns (there is no 'time_period').
# Every table must rename them explicitly: DataFrame.rename silently ignores
# unknown keys, and un-renamed columns would collide across the merges and
# come out as '_x' / '_y' suffixed duplicates.

# Per merchant.
to_merge = merchants.nunique().reset_index()
to_merge = to_merge.rename(columns={
    'item_id': 'items_merchant',
    'cat_id': 'categories_merchant',
    'user_id': 'users_merchant',
    'brand_id': 'brands_merchant',
    'time_stamp': 'dates_merchant',
    'time_month': 'months_merchant',
    'time_week': 'weeks_merchant',
    'action_type': 'action_types_merchant'
})
df_train = df_train.merge(to_merge, on='merchant_id', how='left')

# Per user (the original author noted this takes about 5 minutes).
to_merge = users.nunique().reset_index()
to_merge = to_merge.rename(columns={
    'item_id': 'items_user',
    'cat_id': 'categories_user',
    'merchant_id': 'merchants_user',
    'brand_id': 'brands_user',
    'time_stamp': 'dates_user',
    'time_month': 'months_user',
    'time_week': 'weeks_user',
    'action_type': 'action_types_user'
})
df_train = df_train.merge(to_merge, on='user_id', how='left')

# Per (merchant, user) pair.
to_merge = merchants_users.nunique().reset_index()
to_merge = to_merge.rename(columns={
    'item_id': 'items_user_merchant',
    'cat_id': 'categories_user_merchant',
    'brand_id': 'brands_user_merchant',
    'time_stamp': 'dates_user_merchant',
    'time_month': 'months_user_merchant',
    'time_week': 'weeks_user_merchant',
    'action_type': 'action_types_user_merchant'
})
df_train = df_train.merge(to_merge, on=['user_id', 'merchant_id'], how='left')

# --- Action counts ---------------------------------------------------------
# value_counts().unstack() yields one row per group and one column per
# action_type code, which is then renamed to a readable, level-specific name.
# The merges below join on the index level names of these tables.

# Per user.
to_merge = users['action_type'].value_counts().unstack(fill_value=0)
to_merge = to_merge.rename(columns={
    0: 'clicks_user',
    1: 'add_to_carts_user',
    2: 'purchases_user',
    3: 'add_to_favorites_user'
})
df_train = df_train.merge(to_merge, on='user_id', how='left')

# Per merchant.
to_merge = merchants['action_type'].value_counts().unstack(fill_value=0)
to_merge = to_merge.rename(columns={
    0: 'clicks_merchant',
    1: 'add_to_carts_merchant',
    2: 'purchases_merchant',
    3: 'add_to_favorites_merchant'
})
df_train = df_train.merge(to_merge, on='merchant_id', how='left')

# Per (merchant, user) pair.
to_merge = merchants_users['action_type'].value_counts().unstack(fill_value=0)
to_merge = to_merge.rename(columns={
    0: 'clicks_user_merchant',
    1: 'add_to_carts_user_merchant',
    2: 'purchases_user_merchant',
    3: 'add_to_favorites_user_merchant'
})
df_train = df_train.merge(to_merge, on=['user_id', 'merchant_id'], how='left')

##### Ratio features

In [None]:
EPS = 1e-8  # added to every denominator so a zero count never divides by zero

# Short name used in the ratio feature names -> prefix of the raw count columns.
action_names = {
    'clicks': 'clicks',
    'carts': 'add_to_carts',
    'purchases': 'purchases',
    'favourites': 'add_to_favorites',
}

# Share of the pair's actions within a wider total, per action type:
#   *_in_user_ratio     = pair count / merchant-wide count (merchant perspective)
#   *_in_merchant_ratio = pair count / user-wide count     (user perspective)
for tag, denominator_level in (('in_user', 'merchant'), ('in_merchant', 'user')):
    for short_name, raw_prefix in action_names.items():
        pair_count = df_train[f'{raw_prefix}_user_merchant']
        level_count = df_train[f'{raw_prefix}_{denominator_level}']
        df_train[f'{short_name}_{tag}_ratio'] = pair_count / (level_count + EPS)

# Mix of action types at each level: count of one action type divided by the
# total number of actions (plus EPS) for the merchant, the user, and the pair.
for level in ('merchant', 'user', 'user_merchant'):
    level_counts = [df_train[f'{raw_prefix}_{level}'] for raw_prefix in action_names.values()]
    # Left-to-right sum of the four counts, then EPS.
    total_actions = sum(level_counts[1:], level_counts[0]) + EPS
    for short_name, count in zip(action_names, level_counts):
        df_train[f'{short_name}_ratio_{level}'] = count / total_actions

##### Interval feature

In [None]:
# Span, in days, between each user's first and last logged action.
first_action_day = users['time_stamp'].min()
last_action_day = users['time_stamp'].max()
to_merge = (last_action_day - first_action_day).rename('action_interval')
df_train = df_train.merge(to_merge, on='user_id', how='left')

## 3. Training

## 4. Predicting 