In [3]:
import numpy as np
import pandas as pd
import catboost as ctb

from operator import itemgetter
import itertools

from tqdm import tqdm
from collections import Counter
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import OrdinalEncoder

pd.set_option('display.max_columns', 80)
pd.set_option('display.max_rows', 50)
pd.set_option('display.width', 150)

- The dataset comprises anonymized data from over 200,000 Instacart users, including 3+ million grocery orders. It includes order histories, product details, and order-specific information.
- **The goal** is to predict which products will be reordered in a user's next order on the Instacart grocery delivery app.  
- Submissions assessed based on their **mean F1 score**, emphasizing precision and recall.  

#### Data loading

In [21]:
aisles = pd.read_csv('/home/jupyter/datasphere/project/MLDM-2023/Exam/aisles.csv') #Contains information about different grocery store aisles and their IDs.
deprts = pd.read_csv('/home/jupyter/datasphere/project/MLDM-2023/Exam/departments.csv') #Provides details about grocery store departments and their IDs.
prods = pd.read_csv('/home/jupyter/datasphere/project/MLDM-2023/Exam/products.csv') #Contains information about various grocery products, including their names, aisle IDs, and department IDs.
prior = pd.read_csv('/home/jupyter/datasphere/project/MLDM-2023/Exam/order_products__prior.csv') #Specifies which products were purchased in each order, including whether they were reordered.
train = pd.read_csv('/home/jupyter/datasphere/project/MLDM-2023/Exam/order_products__train.csv') # --
orders = pd.read_csv('/home/jupyter/datasphere/project/MLDM-2023/Exam/orders.csv') #Tells to which set (prior, train, test) an order belongs and provides information about order details such as the day of the week and the hour of the day
subm = pd.read_csv('/home/jupyter/datasphere/project/MLDM-2023/Exam/sample_submission.csv') #Demonstrates the expected format for submitting predictions.

In [22]:
# Viewing the first 5 observations of the 'orders' dataset.
# order_id: Order identifier
# user_id: User identifier
# eval_set: To which set of evaluations this order belongs 
# (prior, train, and test)
# order_number: Sequential order number for this user 
# (1 = first, n = n-th)
# order_dow: Day of the week when the order was placed
# order_hour_of_day: Hour of the day when the order was placed
# days_since_prior: Days since the last order, maximum 30
print(orders.shape)
orders.head()

(3421083, 7)


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [23]:
# Viewing the first 5 observations of the 'priors' dataset.
# order_id: Order identifier
# product_id: Product identifier
# add_to_cart_order: The order in which each product was added to the cart
# reordered: 1 - if this product was ordered by this user in the past;
# 0 - if the product is ordered by the user for the first time
print(prior.shape)
prior.head()

(32434489, 4)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [24]:
# Viewing the first 5 observations of the 'train' dataset.
# Training data with orders (all variables are the same as in 'prior').
print(train.shape)
train.head()

(1384617, 4)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [25]:
# Viewing the first 5 observations of the 'prods' dataset.
# product_id: Product identifier
# product_name: Product name
# aisle_id: Aisle identifier
# department_id: Department identifier
print(prods.shape)
prods.head()

(49688, 4)


Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [26]:
# Viewing the first 5 observations of the 'aisles' dataset.
# aisle_id: Aisle identifier
# aisle: Aisle name
print(aisles.shape)
aisles.head()

(134, 2)


Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [27]:
# Viewing the first 5 observations of the 'deprts' dataset.
# department_id: Department identifier
# department: Department name
print(deprts.shape)
deprts.head()

(21, 2)


Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


In [28]:
# Viewing the first 5 observations of the 'submission' file.
print(subm.shape)
subm.head()

(75000, 2)


Unnamed: 0,order_id,products
0,17,39276 29259
1,34,39276 29259
2,137,39276 29259
3,182,39276 29259
4,257,39276 29259


##### Items

In [29]:
# Combining information about aisles and departments in the 'prods' table.

# Performing a left merge of 'prods' and 'aisles' on the 'aisle_id' key.
prods = pd.merge(prods, aisles, how='left', on='aisle_id')

# Performing a left merge of 'prods' and 'deprts' (likely 'depts') on the 'department_id' key.
prods = pd.merge(prods, deprts, how='left', on='department_id')

# Deleting the 'aisles' and 'deprts' datasets.
del aisles, deprts

In [30]:
# We will categorize certain products with specific attributes (keywords in product names)
# into separate categories. To do this, we create an auxiliary dictionary where 
# the keys represent the category code, and the dictionary values are sets of keywords
# that we will look for in the product name to assign it to the corresponding category.
category = {
    1: ('Vegan',),
    2: ('Health',),
    3: ('Wheat Free', 'Wheat-Free', 'Grain Free',
        'Grain-Free', 'Gluten-Free', 'Gluten Free'),
    4: ('Lactose Free', 'Lactose-Free', 'Milk Free', 'Milk-Free',
        'Dairy Free', 'Dairy-Free', 'Non-Dairy', 'Non Dairy'),
    5: ('Sugar Free', 'Sugar-Free', 'Less Sugar',
        'Low Sugar', 'Lower Sugar', 'Unsweetened'),
    6: ('Fat Free', 'Fat-Free', 'Lowfat', 'Low-fat', 'Low Fat',
        'Lower Fat', 'Nonfat', 'Nonfat', 'Low Calorie',
        'Low-Calorie', 'Calorie Free', 'Calorie-Free',
        'Diet', 'Reduced Fat', 'Reduced-Fat',
        'Less Fat', 'Less-Fat'),
    7: ('Caffeine Free',),
    8: ('Salt Free', 'Salt-Free', 'Less Salt', 'Less Sodium',
        'Low Sodium', 'Lower Sodium'),
    9: ('Meatless',),
    10: ('Organic', 'Preservative Free', 'Preservative-Free',
         'Non GMO', 'Non Gmo'),
    11: ('Asian',)
}

In [31]:
# We will create a 'cat_list' column in the 'prods' dataset with indicators
# of whether a product belongs to the corresponding categories. It will be
# a list of the form [k1, k2, ..., kn], where k1, k2, ..., kn are the keys
# from the 'category' dictionary if the product name contains any substring
# from the values of the 'category' dictionary. We initialize the 'cat_list'
# column with empty values.
prods['cat_list'] = ''

In [32]:
# We iterate through the elements of the 'category' dictionary in a loop.
for key, val in category.items():
    # The construct any(y in x for y in val) returns True
    # if the product name contains at least one substring
    # from the values (val) in the 'category' dictionary for the key (key).
    # The constructs ['', key][True] and ['', key][1] are equivalent,
    # and the lambda function returns the corresponding key from the dictionary (variable key)
    # if we receive True or an empty string if we receive False.
    tmp = prods['product_name'].map(
        lambda x: str(['', key][any(y in x for y in val)])
    )
    # We add the values from the resulting 'tmp' series into the 'cat_list' column,
    # separated by spaces.
    prods['cat_list'] = pd.concat(
        [prods['cat_list'], tmp], axis='columns').apply(' '.join, axis=1)

# We convert each value in the 'cat_list' column from
# a list of strings to a list of integers.
prods['cat_list'] = prods['cat_list'].map(
    lambda x: list(map(int, x.split()))
)
prods['cat_list'].head(10)

0     []
1     []
2    [5]
3     []
4     []
5     []
6     []
7     []
8     []
9     []
Name: cat_list, dtype: object

In [33]:
# We perform Ordinal Encoding of the 'cat_list' column in the 'prods' dataset.
enc = OrdinalEncoder()
# We encode the 'cat_list' column by fitting and transforming it
# to convert the categories into numerical values.
encoded = enc.fit_transform(
    np.array(prods['cat_list'].astype('str')).reshape(-1, 1)
)
# We create a new 'cat_encoded' column in the 'prods' dataset
# and populate it with the encoded values as integers.
prods['cat_encoded'] = pd.DataFrame(encoded).astype('int')
prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department,cat_list,cat_encoded
0,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks,[],45
1,2,All-Seasons Salt,104,13,spices seasonings,pantry,[],45
2,3,Robust Golden Unsweetened Oolong Tea,94,7,tea,beverages,[5],36
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,frozen meals,frozen,[],45
4,5,Green Chile Anytime Sauce,5,13,marinades meat preparation,pantry,[],45


##### Making training sets

In [34]:
# Before splitting the data into training and testing sets, let's
# examine the number of unique orders and users in the 'orders' table.
orders.groupby(['eval_set'])[['order_id', 'user_id']].nunique()

Unnamed: 0_level_0,order_id,user_id
eval_set,Unnamed: 1_level_1,Unnamed: 2_level_1
prior,3214874,206209
test,75000,75000
train,131209,131209


In [35]:
# For each user, we will create a list of products they have
# previously ordered and save the results in the 'user_x_product' table.

# First, we add user information from the 'orders' dataset to the 'prior' dataset - the 'user_id' variable.
tmp = prior.merge(orders[['order_id', 'user_id']], 
                  how='left', on='order_id')

# Then, we select all possible existing combinations of user-product pairs
# and examine the results.
user_x_product = tmp.groupby(
    ['user_id', 'product_id'])[['user_id', 'product_id']].first()

# We reset the index to make the resulting table more readable.
user_x_product = user_x_product.reset_index(drop=True)
user_x_product.head(10)

Unnamed: 0,user_id,product_id
0,1,196
1,1,10258
2,1,10326
3,1,12427
4,1,13032
5,1,13176
6,1,14084
7,1,17122
8,1,25133
9,1,26088


In [36]:
# We will create a dataframe 'df' from the rows in the 'orders' table where 'eval_set'
# is equal to 'train' or 'test'. This dataframe will be used to create new features
# and will later be split into training and testing sets.
# We add information for each user about all the products they have previously ordered,
# taking this information from the previously formed 'user_x_product' table.

df = orders[orders['eval_set'].isin(['train', 'test'])].merge(
    user_x_product, how='left', on='user_id')

# We sort the columns in 'df' in descending order for better readability.
df.sort_index(axis=1, inplace=True, ascending=False)

# We delete unnecessary tables 'tmp' and 'user_x_product'.
del tmp, user_x_product

df.head(10)

Unnamed: 0,user_id,product_id,order_number,order_id,order_hour_of_day,order_dow,eval_set,days_since_prior_order
0,1,196,11,1187899,8,4,train,14.0
1,1,10258,11,1187899,8,4,train,14.0
2,1,10326,11,1187899,8,4,train,14.0
3,1,12427,11,1187899,8,4,train,14.0
4,1,13032,11,1187899,8,4,train,14.0
5,1,13176,11,1187899,8,4,train,14.0
6,1,14084,11,1187899,8,4,train,14.0
7,1,17122,11,1187899,8,4,train,14.0
8,1,25133,11,1187899,8,4,train,14.0
9,1,26088,11,1187899,8,4,train,14.0


In [37]:
# We are checking the number of observations for training and prediction.
df.groupby(['eval_set'])['product_id'].count()

eval_set
test     4833292
train    8474661
Name: product_id, dtype: int64

In [38]:
# In the 'train' dataset, we only have information about products that were actually ordered.
# However, for model training, we need to provide it with information not only about
# which products a user ordered (or reordered) but also about the products they did not order.
# In the training set, we have included information about products ordered by the user over the entire period.
# These are the records in the 'df' dataset where 'eval_set' is equal to 'train'.
tmp = df[df['eval_set'] == 'train']
tmp.head(10)

Unnamed: 0,user_id,product_id,order_number,order_id,order_hour_of_day,order_dow,eval_set,days_since_prior_order
0,1,196,11,1187899,8,4,train,14.0
1,1,10258,11,1187899,8,4,train,14.0
2,1,10326,11,1187899,8,4,train,14.0
3,1,12427,11,1187899,8,4,train,14.0
4,1,13032,11,1187899,8,4,train,14.0
5,1,13176,11,1187899,8,4,train,14.0
6,1,14084,11,1187899,8,4,train,14.0
7,1,17122,11,1187899,8,4,train,14.0
8,1,25133,11,1187899,8,4,train,14.0
9,1,26088,11,1187899,8,4,train,14.0


In [39]:
# For certain rows in the 'train' dataset, we know that a product was
# included in the user's last order (either ordered or reordered),
# as this information is available in the 'reordered' column of the 'train' table.
train.head(10)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1
5,1,13176,6,0
6,1,47209,7,0
7,1,22035,8,1
8,36,39612,1,0
9,36,19660,2,1


In [40]:
tmp = tmp.merge(train[['order_id','product_id', 'reordered']],
                how='left', on=['order_id', 'product_id'])
tmp.head(10)

Unnamed: 0,user_id,product_id,order_number,order_id,order_hour_of_day,order_dow,eval_set,days_since_prior_order,reordered
0,1,196,11,1187899,8,4,train,14.0,1.0
1,1,10258,11,1187899,8,4,train,14.0,1.0
2,1,10326,11,1187899,8,4,train,14.0,
3,1,12427,11,1187899,8,4,train,14.0,
4,1,13032,11,1187899,8,4,train,14.0,1.0
5,1,13176,11,1187899,8,4,train,14.0,
6,1,14084,11,1187899,8,4,train,14.0,
7,1,17122,11,1187899,8,4,train,14.0,
8,1,25133,11,1187899,8,4,train,14.0,1.0
9,1,26088,11,1187899,8,4,train,14.0,1.0


In [41]:
y = tmp['reordered'].fillna(0)
len(y)

8474661

In [42]:
# merge prior and orders on orders_id and sort by user_id, product_id, order_number
prior = prior.merge(right=orders, how='left', on='order_id')
prior = prior.sort_values(
    by=['user_id', 'product_id', 'order_number']
).copy()

# merge prior and prods on product_id
columns = ['product_id', 'aisle_id', 'department_id', 
           'cat_list', 'cat_encoded']
prior = prior.merge(right=prods[columns], how='left', on='product_id')
prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,aisle_id,department_id,cat_list,cat_encoded
0,2539329,196,1,0,1,prior,1,2,8,,77,7,[],45
1,2398795,196,1,1,1,prior,2,3,7,15.0,77,7,[],45
2,473747,196,1,1,1,prior,3,3,12,21.0,77,7,[],45
3,2254736,196,1,1,1,prior,4,4,7,29.0,77,7,[],45
4,431534,196,1,1,1,prior,5,4,15,28.0,77,7,[],45


#### Features mining

Naming of features convention:    
A - aisle    
D - department    
U - user    
P - product    
cat - category   
n - number of (X)    

##### Items and category features

In [43]:
# merge df и prods слева по ключу product_id
columns = ['product_id', 'aisle_id', 'department_id', 
           'cat_list', 'cat_encoded']
df = df.merge(prods[columns], how='left', on='product_id')

In [44]:
# We calculate the number of orders and the number of reorders for each product,
# and save the results in a temporary table 'tmp'.
tmp = prior.groupby(['product_id'])['reordered'].agg([sum, 'count'])

# We rename the columns in the 'tmp' table.
tmp = tmp.rename(
    columns={'sum': 'P_reordered', 'count': 'P_ordered'}
).reset_index()

# We calculate the ratio of reorders to the total number of orders for each product (product_id).
tmp['P_reordered_ratio'] = tmp['P_reordered'] / tmp['P_ordered']

# We add these features to the 'df' table.
df = df.merge(tmp, how='left', on='product_id')

# For each product, we calculate the number of users who ordered that product
# 1, 3, 5, 7, or 9 times. First, we count the number of orders for each product
# by each user and save the results in a temporary table 'tmp'.
tmp = prior.groupby(['product_id', 'user_id'])['order_id'].count()

# Then, in a loop, we select rows from the temporary table where users have ordered
# the product 1, 3, 5, 7, or 9 times and count the number of such users.
# We add these calculated results to the 'df' table.
for n_orders in range(1, 10, 2):
    df = df.merge(
        tmp[tmp >= n_orders]
        .groupby(['product_id'])
        .count()
        .rename('P_n_users_' + str(n_orders)),
        how='left',
        on='product_id')

# We calculate the number of orders and the number of reorders for each aisle,
# and save the results in a temporary table 'tmp'.
tmp = prior.groupby(['aisle_id'])['reordered'].agg([sum, 'count'])

# We rename the columns in the 'tmp' table.
tmp = tmp.rename(
    columns={'sum': 'A_reordered', 'count': 'A_ordered'}
).reset_index()

# We calculate the ratio of reorders to the total number of orders for each aisle (aisle_id).
tmp['A_reordered_ratio'] = tmp['A_reordered'] / tmp['A_ordered']

# We add these features to the 'df' table.
df = df.merge(tmp, how='left', on='aisle_id')

# We calculate the number of orders and the number of reorders for each department,
# and save the results in a temporary table 'tmp'.
tmp = prior.groupby(['department_id'])['reordered'].agg([sum, 'count'])

# We rename the columns in the 'tmp' table.
tmp = tmp.rename(
    columns={'sum': 'D_reordered', 'count': 'D_ordered'}
).reset_index()

# We calculate the ratio of reorders to the total number of orders for each department (department_id).
tmp['D_reordered_ratio'] = tmp['D_reordered'] / tmp['D_ordered']

# We add these features to the 'df' table.
df = df.merge(tmp, how='left', on='department_id')

##### Users features

In [45]:
# total number of user orders
tmp = prior.groupby(['user_id'])['order_id'].nunique()
tmp = tmp.rename('U_n_orders').reset_index()
df = df.merge(tmp, how='left', on='user_id')

# number of unique trade rows for the user
tmp = prior.groupby(['user_id'])['aisle_id'].nunique()
tmp = tmp.rename('UA_n_unique').reset_index()
df = df.merge(tmp, how='left', on='user_id')

# number of ordered goods, number of unique goods, ratio of the number of unique goods to the number of ordered goods 
tmp = prior.groupby(['user_id'])['product_id'].agg(['count', 'nunique'])
tmp = tmp.rename(
    columns={'count': 'U_n_items', 'nunique': 'U_unique_items'}
).reset_index()
df = df.merge(tmp, how='left', on='user_id')
df['U_unique_to_all_ratio'] = df['U_unique_items'] / df['U_n_items']

# number of days of use, average number of days between orders, ratio of days from the previous order to the average number of days between orders
tmp = orders.groupby(['user_id'])['days_since_prior_order'].agg(
    [sum, np.mean])
tmp = tmp.rename(
    columns={'sum': 'U_days_sum', 'mean': 'U_days_mean'}
).reset_index()
df = df.merge(tmp, how='left', on='user_id')
df['U_days_ratio'] = df['days_since_prior_order'] / df['U_days_mean']

# average number of items in the user's cart
df['U_avg_basket_size'] = df['U_n_items'] / df['U_n_orders']

# For each user count the number of ordered items from each 'category': 'U_n_cat_ordered'.
f = lambda x: list(itertools.chain.from_iterable(x))
tmp = prior.groupby(['user_id'])['cat_list'].apply(f)
tmp = tmp.map(lambda x: Counter(x)).rename('U_cat_ordered')
df = df.merge(tmp, how='left', on='user_id')

# write an auxiliary function that calculates how many orders the user has placed for the product that fall into the same category as the current product
def f_ordered(args):    
    keys = args[0] 
    dic = args[1]
    return sum([dic[key] for key in keys]) 

# calculate the absolute value for each user-category combination and add it to df
tmp = df.groupby(['user_id', 'cat_encoded'])[
    ['cat_list', 'U_cat_ordered']
].first()
tmp['U_n_cat_ordered'] = tmp[['cat_list', 'U_cat_ordered']].apply(
    f_ordered, axis='columns'
)
df = df.merge(tmp['U_n_cat_ordered'], how='left', 
              on=['user_id', 'cat_encoded'])

df['U_n_cat_ordered_ratio'] = df['U_n_cat_ordered'] / df['U_n_orders']

tmp = prior['cat_encoded'].value_counts()
tmp = tmp.reset_index().rename(columns={'cat_encoded': 'cat_ordered',
                                        'index': 'cat_encoded'})
df = df.merge(tmp, how='left', on='cat_encoded')

df.drop(columns=['cat_list', 'U_cat_ordered'], inplace=True)
del tmp

##### Features for user-item interaction

In [46]:
# number of product orders, average and last serial number of adding the product to the cart, serial number of the previous order
d = {'product_id': 'count',
     'add_to_cart_order': ['mean', 'last'],
     'order_number': 'max'}

c = ['product_id', 'add_to_cart_order', 'order_number']

tmp = prior.groupby(['user_id', 'product_id'])[c].agg(d)
tmp.columns = [
    'UP_' + '_'.join(col) for col in tmp.columns.to_flat_index()
]

df = df.merge(tmp, how='left', on=['user_id', 'product_id'])

# share of user's purchases containing this product
df['UP_ordered_ratio'] = df['UP_product_id_count'] / df['U_n_orders']

# difference between the sequence number of the current order and the previous order of goods
df['UP_orders_since_last'] = df['order_number'] - df['UP_order_number_max']

# calculate the number of days elapsed since the first order for each order
orders['U_days_since_first_order'] = orders.groupby(
    ['user_id'])[['days_since_prior_order']].cumsum()
orders['U_days_since_first_order'].fillna(0, inplace=True)

# calculate the number of days remaining until the last order for each order 
tmp = orders.groupby(['user_id'])[['days_since_prior_order']].sum()
tmp = tmp.rename(columns={'days_since_prior_order': 'U_days_sum'})
orders = orders.merge(tmp, how='left', on='user_id')
orders['U_days_till_last_order'] = (orders['U_days_sum'] - 
                                    orders['U_days_since_first_order'])

prior = prior.merge(
    right=orders[['order_id', 'U_days_since_first_order']], 
    how='left', on='order_id'
)

In [47]:
# relative number of items purchased 
tail = [3, 4, 5, 6, 7]
for t in tqdm(tail):
    tmp = orders.groupby(['user_id'])[['user_id', 'order_number']].tail(t)
    tmp['t'+ str(t)] = True
    prior = prior.merge(right=tmp, how='left', on=['user_id', 'order_number'])
    tmp = prior[prior['t' + str(t)] == True].groupby(
        ['user_id', 'product_id'])['product_id'].count()
    tmp = tmp.rename('UP_product_id_count_t' + str(t))
    df = df.merge(right=tmp, how='left', on=['user_id', 'product_id'])
    df['UP_product_id_count_t' + str(t)].fillna(0, inplace=True)
    df['UP_product_id_count_t' + str(t)] = df['UP_product_id_count_t' + str(t)] / df[
        'U_n_orders'
    ].map(lambda x: max(x, t))
del tmp

100%|██████████| 5/5 [02:37<00:00, 31.51s/it]


Freaquency features:

In [48]:
tmp = prior.groupby(['user_id', 'aisle_id']).agg({'aisle_id': 'count'})
tmp.columns = ['UA_count']
df = df.merge(tmp, how='left', on=['user_id', 'aisle_id'])

# share of items from this category in the total 
# the number of user's items
df['UA_ordered_ratio'] = df['UA_count'] / df['U_n_items']

# diff in days that have elapsed since  previous order for this product
tmp = orders[['user_id', 'order_number', 'U_days_till_last_order']]
df['UP_days_since_last'] = df.merge(
    tmp, how='left',
    left_on=['user_id', 'UP_order_number_max'],
    right_on=['user_id', 'order_number'])['U_days_till_last_order']

# calculate for the combination of user and product: maximum, minimum, average and median number of days between orders

tmp = prior.groupby(['user_id', 'product_id'])[
    ['user_id', 'product_id', 'order_number']
].head(1)
tmp['UP_is_first_order'] = True
prior = prior.merge(
    tmp, how='left', on=['user_id', 'product_id', 'order_number']
)
prior['UP_days_between_orders'] = prior['U_days_since_first_order'].diff()
cond = prior.UP_is_first_order != True
func = [min, max, np.mean, np.median]
tmp = prior[cond].groupby(['user_id', 'product_id'])[
    'UP_days_between_orders'
].agg(func)

dic = {'min': 'UP_DBO_min',
       'max': 'UP_DBO_max',
       'mean': 'UP_DBO_avg',
       'median': 'UP_DBO_med'}

tmp = tmp.rename(columns=dic)

df = df.merge(tmp, how='left', on=['user_id', 'product_id'])

columns = ['UP_DBO_max', 'UP_DBO_min', 'UP_DBO_avg', 'UP_DBO_med']
df[columns] = df[columns].fillna(-1)

# take the difference between the maximum, minimum, average and median number of days 
# between orders and the number of days that have elapsed since the product was ordered, 
# the difference between the minimum and maximum number of days between orders
df['UP_DSPO_DBO_max_diff'] = df['UP_DBO_max'] - df['days_since_prior_order']
df['UP_DSPO_DBO_min_diff'] = df['UP_DBO_min'] - df['days_since_prior_order']
df['UP_DSPO_DBO_avg_diff'] = df['UP_DBO_avg'] - df['days_since_prior_order']
df['UP_DSPO_DBO_med_diff'] = df['UP_DBO_med'] - df['days_since_prior_order']
df['UP_DSPO_DBO_minmax_diff'] = df['UP_DBO_max'] - df['UP_DBO_min']

In [49]:
# similar absolute and relative indicators, aggregated for the item
func = [min, max, np.mean, np.median]
tmp = df[df.UP_DBO_med != -1].groupby(
    ['product_id'])['UP_DBO_med'].agg(func)

# rename columns
dic = {'min': 'P_DBO_min',
       'max': 'P_DBO_max',
       'mean': 'P_DBO_avg',
       'median': 'P_DBO_med'}
tmp = tmp.rename(columns=dic)
df = df.merge(tmp, how='left', on='product_id')

# missing values
columns = ['P_DBO_min', 'P_DBO_max', 'P_DBO_avg', 'P_DBO_med']
df[columns] = df[columns].fillna(-1)

df['P_DSPO_DBO_max_diff'] = df['P_DBO_max'] - df['days_since_prior_order']
df['P_DSPO_DBO_min_diff'] = df['P_DBO_min'] - df['days_since_prior_order']
df['P_DSPO_DBO_avg_diff'] = df['P_DBO_avg'] - df['days_since_prior_order']
df['P_DSPO_DBO_med_diff'] = df['P_DBO_med'] - df['days_since_prior_order']
df['P_DSPO_DBO_minmax_diff'] = df['P_DBO_max'] - df['P_DBO_min']

# calculate the median day and hour of the order and compare w current order
tmp = prior.groupby(['user_id', 'product_id'])[
    ['order_hour_of_day', 'order_dow']
].agg(np.median)
tmp = tmp.rename(columns={'order_hour_of_day': 'UP_order_HOD_med', 
                          'order_dow': 'UP_order_DOW_med'})
df = df.merge(tmp, how='left', on=['user_id', 'product_id'])

# diff in HoD / DoW
func = lambda x: min(x, 24 - x)
df['UP_delta_HOD_vs_med'] = abs(df['order_hour_of_day'] - 
                                df['UP_order_HOD_med']).map(func)

func = lambda x: min(x, 7 - x)
df['UP_delta_DOW_vs_med'] = abs(df['order_dow'] - 
                                df['UP_order_DOW_med']).map(func)

Train-test split

In [50]:
drop_columns = ['eval_set', 'user_id', 'order_id', 'product_id']
info = df[['user_id', 'order_id', 'product_id', 'eval_set']]

train = df[df['eval_set'] == 'train'].drop(columns=drop_columns)
test = df[df['eval_set'] == 'test'].drop(columns=drop_columns)
print(train.shape, test.shape)

del df

(8474661, 69) (4833292, 69)


#### Model training

Based on some patterns identified in earlier (in particular, I found that the deeper the trees, the slower the learning rate should be), we will create an ensemble of CatBoost models.

In [51]:
class mixClassifier(BaseEstimator, ClassifierMixin):  
    # 3 CatBoost models
    def __init__(self, seed=0, ss=0.8): 
        self.seed = seed
        self.ss = ss
        self.models = [   
            ctb.CatBoostClassifier(depth=5, 
                                   iterations=150, 
                                   learning_rate=0.45,
                                   loss_function='Logloss',
                                   rsm=self.ss, 
                                   logging_level='Silent',
                                   random_seed=0 + self.seed), 
            
            ctb.CatBoostClassifier(depth=6, 
                                   iterations=150, 
                                   learning_rate=0.25,
                                   loss_function='Logloss', 
                                   rsm=self.ss, 
                                   logging_level='Silent',
                                   random_seed=1 + self.seed), 
            
            ctb.CatBoostClassifier(depth=7, 
                                   iterations=150, 
                                   learning_rate=0.15,
                                   loss_function='Logloss', 
                                   rsm=self.ss, 
                                   logging_level='Silent',
                                   random_seed=2 + self.seed)
        ]
        self.weights = [1, 1, 0.65]

    def fit(self, X, y=None):
        # training        
        for t, clf in enumerate(self.models):
            clf.fit(X, y)
        return self

    def predict(self, X):
        # probability calculation        
        suma = 0.0
        for t, clf in enumerate(self.models):
            a = clf.predict_proba(X)[:, 1]
            suma += (self.weights[t] * a)
        return (suma / sum(self.weights))

In [52]:
%%time

# ensemple training
clf = mixClassifier(seed=42)
clf.fit(train, y);

CPU times: user 48min 33s, sys: 32.9 s, total: 49min 6s
Wall time: 6min 32s


#### Kaggle submission

##### Function that minimize the F1-score

In [53]:
class F1Optimizer():
    def __init__(self):
        pass

    @staticmethod
    def get_expectations(P, pNone=None):
        expectations = []
        P = np.sort(P)[::-1]

        n = np.array(P).shape[0]
        DP_C = np.zeros((n + 2, n + 1))
        if pNone is None:
            pNone = (1.0 - P).prod()

        DP_C[0][0] = 1.0
        for j in range(1, n):
            DP_C[0][j] = (1.0 - P[j - 1]) * DP_C[0, j - 1]

        for i in range(1, n + 1):
            DP_C[i, i] = DP_C[i - 1, i - 1] * P[i - 1]
            for j in range(i + 1, n + 1):
                DP_C[i, j] = (
                    P[j - 1] * DP_C[i - 1, j - 1] + 
                    (1.0 - P[j - 1]) * DP_C[i, j - 1]
                )

        DP_S = np.zeros((2 * n + 1,))
        DP_SNone = np.zeros((2 * n + 1,))
        
        for i in range(1, 2 * n + 1):
            DP_S[i] = 1. / (1. * i)
            DP_SNone[i] = 1. / (1. * i + 1)
            
        for k in range(n + 1)[::-1]:
            f1 = 0
            f1None = 0
            for k1 in range(n + 1):
                f1 += 2 * k1 * DP_C[k1][k] * DP_S[k + k1]
                f1None += 2 * k1 * DP_C[k1][k] * DP_SNone[k + k1]
            for i in range(1, 2 * k - 1):
                DP_S[i] = (1 - P[k - 1]) * DP_S[i] + P[k - 1] * DP_S[i + 1]
                DP_SNone[i] = (
                    (1 - P[k - 1]) * DP_SNone[i] + P[k - 1] * DP_SNone[i + 1]
                )
            expectations.append([f1None + 2 * pNone / (2 + k), f1])

        return np.array(expectations[::-1]).T

    @staticmethod
    def maximize_expectation(P, pNone=None):
        expectations = F1Optimizer.get_expectations(P, pNone)

        ix_max = np.unravel_index(expectations.argmax(), 
                                  expectations.shape)
        max_f1 = expectations[ix_max]

        predNone = True if ix_max[0] == 0 else False
        best_k = ix_max[1]

        return best_k, predNone, max_f1

    @staticmethod
    def _F1(tp, fp, fn):
        return 2 * tp / (2 * tp + fp + fn)

    @staticmethod
    def _Fbeta(tp, fp, fn, beta=1.0):
        beta_squared = beta ** 2
        return (1.0 + beta_squared) * tp / (
            (1.0 + beta_squared) * tp + fp + beta_squared * fn
        )

# get the prediction maximizing F1-score
def get_best_prediction(items, preds, pNone=None):
    items_preds = sorted(list(zip(items, preds)), 
                         key=itemgetter(1), 
                         reverse=True)
    P = [p for i, p in items_preds]
    L = [i for i, p in items_preds]
    
    opt = F1Optimizer.maximize_expectation(P, pNone)
    best_prediction = ['None'] if opt[1] else []
    best_prediction += (L[:opt[0]])

    return ' '.join(list(map(str, best_prediction)))

In [54]:
# We create a dataframe with predictions for the test set.

# We use the .predict() method of the previously trained ensemble 'clf'
# to obtain predictions for the test set. We reset the index and rename
# the resulting series containing the predicted probabilities of reorder
# for convenience.
y_hat = pd.Series(clf.predict(test)).reset_index(
    drop=True).rename('y_hat')

# From the previously saved 'info' dataframe, we select information
# related to products and orders in the test dataset and save
# the information in separate series.
product_id = info[info['eval_set'] == 'test'][
    'product_id'].reset_index(drop=True)
order_id = info[info['eval_set'] == 'test'][
    'order_id'].reset_index(drop=True)

# We concatenate (concatenate) the three columns into one dataframe,
# 'result,' containing the order ID, product ID, and the probability
# of reorder.
result = pd.concat([order_id, product_id, y_hat], axis='columns')

# Then, we group the resulting dataframe by the 'order_id' column,
# combining product IDs and predictions into separate lists
# (using a lambda function).
result = result.groupby(['order_id'])[
    ['product_id', 'y_hat']].agg(lambda x: list(x))

# We add a 'products' column to the 'result' dataframe
# and initialize its values to None.
result['products'] = None

# We display the result.
result.head(3)

Unnamed: 0_level_0,product_id,y_hat,products
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
17,"[1283, 6291, 7035, 11494, 13107, 13535, 15613,...","[0.034733594676708446, 0.10351469959293212, 0....",
34,"[651, 2361, 2596, 4031, 5134, 5242, 6317, 7035...","[0.029311779177822756, 0.03309561009308985, 0....",
137,"[311, 1335, 2078, 2326, 2661, 3951, 5025, 5114...","[0.041002150789250597, 0.018648616081461365, 0...",


In [55]:
# optimize F1-метрику
for ix in tqdm(result.index):    
    p1 = result.loc[ix, 'product_id']
    p2 = result.loc[ix, 'y_hat']
    basket = get_best_prediction(p1, p2)
    result.loc[ix, 'products'] = basket   

100%|██████████| 75000/75000 [29:18<00:00, 42.65it/s]  


In [56]:
result.head()

Unnamed: 0_level_0,product_id,y_hat,products
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
17,"[1283, 6291, 7035, 11494, 13107, 13535, 15613,...","[0.034733594676708446, 0.10351469959293212, 0....",13107 21709 21463 38777 47766 26429 39275
34,"[651, 2361, 2596, 4031, 5134, 5242, 6317, 7035...","[0.029311779177822756, 0.03309561009308985, 0....",16083 47766 21137 39475 2596 43504 47792 13176...
137,"[311, 1335, 2078, 2326, 2661, 3951, 5025, 5114...","[0.041002150789250597, 0.018648616081461365, 0...",24852 23794 38689 41787 2326 5134 25890
182,"[1244, 1757, 2078, 2295, 2480, 3397, 4344, 442...","[0.046545169362112206, 0.018865505070121143, 0...",5479 39275 9337 13629 47672 11520 41149 32109 ...
257,"[1025, 2063, 2309, 4605, 4683, 6795, 8277, 103...","[0.1810865545254066, 0.10639695859459435, 0.03...",24852 49235 27966 27104 37646 29837 45013 3947...


In [57]:
result['products'].to_csv('subm_instacart.csv', index=True)