In [None]:
# Import the needed libraries and set notebook-wide display options

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
import matplotlib.pyplot as plt  # Matlab-style plotting
import seaborn as sns
from IPython import display  # NOTE: this binds the IPython.display *module*, not the display() function
import pickle
color = sns.color_palette()
import warnings
warnings.filterwarnings('ignore') # Suppress all warnings for readability and cleaner presentation
import pickle  # NOTE(review): duplicate of the import a few lines up; harmless but redundant
pd.set_option('display.float_format', lambda x: '%.3f' % x) # Limit float output to 3 decimal places
import gc
print (gc.isenabled())  # Show whether automatic garbage collection is enabled

from subprocess import check_output
#print(check_output(['dir', 'input/']).decode("utf8")) #check the files available in the directory

In [None]:
def multilabel_fscore(y_true, y_pred):
    """
    F1 score between two collections of labels, compared as sets.

    Both arguments are converted to sets first, so duplicate labels in
    either input do not affect the result.

    Examples:
        multilabel_fscore([1, 2, 3], [2, 3])           -> 0.8
        multilabel_fscore(["None"], [2, "None"])       -> 0.666...
        multilabel_fscore([4, 5, 6, 7], [2, 4, 8, 9])  -> 0.25

    Returns:
        The harmonic mean of precision and recall as a float, or the
        integer 0 when the two collections share no label (which includes
        the case where either collection is empty).
    """
    truth = set(y_true)
    guess = set(y_pred)

    # Number of predicted labels that are also true labels.
    hits = len(guess & truth)

    # Guard clause: without any overlap precision and recall are both zero
    # and the harmonic mean below would divide by zero.
    if hits == 0:
        return 0

    precision = hits / len(guess)
    recall = hits / len(truth)
    return (2 * precision * recall) / (precision + recall)

if __name__ == '__main__':
    # Quick smoke check of the metric: both true labels are predicted plus
    # one extra label, so precision is 2/3 and recall is 1.
    demo_score = multilabel_fscore([2, 3], [2, 3, 4])
    print(demo_score)

In [None]:
# Load the raw CSV files from the relative `input/` directory into pandas DataFrames.

order_products_train = pd.read_csv('input/order_products__train.csv')
order_products_prior = pd.read_csv('input/order_products__prior.csv')
orders = pd.read_csv('input/orders.csv')
products = pd.read_csv('input/products.csv')
aisles = pd.read_csv('input/aisles.csv')
departments = pd.read_csv('input/departments.csv')

In [None]:
# Attach the aisle and department rows to every product, then discard the
# two id columns that were only needed as join keys.
df_products = (
    products
    .merge(aisles, on='aisle_id', how='left')
    .merge(departments, on='department_id', how='left')
    .drop(columns=['aisle_id', 'department_id'])
)

In [None]:
# Enrich the prior order lines with the product attributes; the product_id
# join key is dropped once the merge is done.
df_prior = (
    order_products_prior
    .merge(df_products, on='product_id', how='left')
    .drop(columns=['product_id'])
)

In [None]:
# Enrich the train order lines with the product attributes; the product_id
# join key is dropped once the merge is done.
df_train = (
    order_products_train
    .merge(df_products, on='product_id', how='left')
    .drop(columns=['product_id'])
)

In [None]:
# Left-join the prior order lines and then the train order lines onto the
# orders table, both keyed on order_id.
# NOTE(review): df_prior and df_train are built the same way, so any column
# names they share besides order_id will come out of the second merge with
# pandas' default _x / _y suffixes - confirm downstream code expects that.
df_orders = (
    orders
    .merge(df_prior, on='order_id', how='left')
    .merge(df_train, on='order_id', how='left')
)

In [None]:
del df_train,df_prior,df_products,order_products_train,order_products_prior,orders,products,aisles,departments
gc.collect()

In [None]:
print("The order_products_train size is : ", order_products_train.shape)
print("The order_products_prior size is : ", order_products_prior.shape)

In [None]:
# Split the merged order table by evaluation set and, for each split, record
# the number of the last order placed by every user within that split.
def _orders_in_eval_set(all_orders, eval_set):
    """Return the rows of `all_orders` whose 'eval_set' equals `eval_set`.

    The returned frame is an independent copy carrying one extra column,
    'num_orders': the maximum 'order_number' among that user's rows in the
    selected subset.

    Args:
        all_orders: DataFrame with 'eval_set', 'user_id' and 'order_number'
            columns.
        eval_set: value of the 'eval_set' column to select.

    Returns:
        A new DataFrame; `all_orders` itself is not modified.
    """
    # Explicit copy: adding a column to a boolean-mask slice is the pattern
    # pandas reports as SettingWithCopyWarning, which previously went
    # unnoticed only because warnings are globally suppressed. The copy costs
    # one transient extra allocation of the subset.
    subset = all_orders[all_orders['eval_set'] == eval_set].copy()
    # The string 'max' selects the groupby max; passing the builtin `max`
    # as a callable is deprecated in recent pandas releases.
    subset['num_orders'] = subset.groupby('user_id')['order_number'].transform('max')
    return subset


test_orders = _orders_in_eval_set(df_orders, 'test')
prior_orders = _orders_in_eval_set(df_orders, 'prior')
train_orders = _orders_in_eval_set(df_orders, 'train')

In [None]:
# df_orders has already been split into test/prior/train frames above, so
# drop the merged frame and run a garbage-collection pass.
del df_orders
gc.collect()

In [None]:
# Save the three splits to disk so the notebook can be resumed from the
# checkpoint without repeating the loading and merging cells.
def _save_pickle(obj, path):
    """Pickle `obj` to `path` using pickle protocol 4.

    The file is opened in a `with` block so the handle is flushed and closed
    even if `pickle.dump` raises; the bare `open(...)` used previously left
    every handle open.
    """
    with open(path, 'wb') as pickle_file:
        pickle.dump(obj, pickle_file, protocol=4)


_save_pickle(test_orders, 'test_orders.p')

_save_pickle(prior_orders, 'prior_orders.p')

_save_pickle(train_orders, 'train_orders.p')

## Checkpoint
The pickled order data can be loaded from here.

In [None]:
def _load_pickle(path):
    """Return the object unpickled from `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    loading finishes; the bare `open(...)` used previously left it open.
    Unpickling can execute arbitrary code, so only load files that this
    notebook wrote itself.
    """
    with open(path, mode='rb') as pickle_file:
        return pickle.load(pickle_file)


test_orders = _load_pickle('test_orders.p')
#prior_orders = _load_pickle('prior_orders.p')
train_orders = _load_pickle('train_orders.p')

In [None]:
# Preview the first rows of the train split. The name `display` is bound by
# `from IPython import display` in the first cell, i.e. it is the
# IPython.display module, so calling it raised
# "TypeError: 'module' object is not callable". A bare last expression gets
# the same rich rendering without depending on that name.
train_orders.head()

In [None]:
# concatenate all product-ids into a single string
# thanks to https://www.kaggle.com/eoakley/start-here-simple-submission

def products_concat(series):
    """Join the positive product ids in `series` into one space-separated string.

    Every value for which `value > 0` holds is rendered as `str(int(value))`;
    all other values (zero, negatives, NaN) are skipped.

    Returns:
        The ids separated by single spaces, or the literal string 'None'
        when no value was kept.
    """
    kept_ids = [str(int(item)) for item in series if item > 0]

    if not kept_ids:
        return 'None'
    return ' '.join(kept_ids)

In [None]:
# Check the number of unique orders and unique products
# NOTE(review): `order_products_all` is not assigned in any cell visible
# above - presumably the train and prior order-product tables combined;
# confirm it is defined before this cell runs.
orders_Unique = len(set(order_products_all.order_id))
products_Unique = len(set(order_products_all.product_id))
print("There are %s orders for %s products" %(orders_Unique, products_Unique))

In [None]:
# Distribution of basket sizes: take the largest add_to_cart_order in each
# order, then count how many orders share each value.
# NOTE(review): this treats the maximum add_to_cart_order as the number of
# products in the order (assumes a 1-based position) - confirm. It also needs
# `order_products_all`, which no cell visible above defines.
grouped = order_products_all.groupby("order_id")["add_to_cart_order"].aggregate("max").reset_index()
grouped = grouped.add_to_cart_order.value_counts()

sns.set_style('whitegrid')
f, ax = plt.subplots(figsize=(15, 12))
plt.xticks(rotation='vertical')
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally, so the old positional call raises TypeError there.
sns.barplot(x=grouped.index, y=grouped.values, ax=ax)

plt.ylabel('Number of Orders', fontsize=13)
plt.xlabel('Number of products added in order', fontsize=13)
plt.show()

In [None]:
# Ten products with the most order lines, joined with their names.
# Named aggregation replaces the dict-renaming form
# `.aggregate({'Total_reorders': 'count'})`, which pandas >= 1.0 rejects on a
# SeriesGroupBy with "SpecificationError: nested renamer is not supported".
# Note that 'count' counts the non-null `reordered` entries per product, so
# 'Total_reorders' is really the number of order lines for that product; the
# column name is kept because the next cell reads it.
# NOTE(review): needs `order_products_all` and `products` in scope - confirm
# both are defined when this cell runs.
grouped = (
    order_products_all
    .groupby("product_id")["reordered"]
    .agg(Total_reorders='count')
    .reset_index()
)
grouped = pd.merge(grouped, products[['product_id', 'product_name']], how='left', on=['product_id'])
grouped = grouped.sort_values(by='Total_reorders', ascending=False)[:10]
grouped

In [None]:
# Bar chart of the top-products table from the previous cell.
# The column is selected before summing so only 'Total_reorders' is
# aggregated, instead of summing every remaining column and picking one.
# Note this rebinds `grouped` from a DataFrame to a Series, so this cell
# cannot be re-run without re-running the previous one first.
grouped = grouped.groupby(['product_name'])['Total_reorders'].sum().sort_values(ascending=False)

sns.set_style('darkgrid')
f, ax = plt.subplots(figsize=(12, 10))
plt.xticks(rotation='vertical')
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally, so the old positional call raises TypeError there.
sns.barplot(x=grouped.index, y=grouped.values, ax=ax)
plt.ylabel('Number of Reorders', fontsize=13)
plt.xlabel('Most ordered Products', fontsize=13)
plt.show()

In [None]:
# Per-product reorder probability = sum of `reordered` / number of order
# lines, shown for the ten highest-probability products.
# Named aggregation replaces the dict-renaming form
# `.aggregate({'reorder_sum': sum, 'reorder_total': 'count'})`, which
# pandas >= 1.0 rejects on a SeriesGroupBy with
# "SpecificationError: nested renamer is not supported".
# NOTE(review): needs `order_products_all` and `products` in scope - confirm
# both are defined when this cell runs.
grouped = (
    order_products_all
    .groupby("product_id")["reordered"]
    .agg(reorder_sum='sum', reorder_total='count')
    .reset_index()
)
grouped['reorder_probability'] = grouped['reorder_sum'] / grouped['reorder_total']
grouped = pd.merge(grouped, products[['product_id', 'product_name']], how='left', on=['product_id'])
# Only products with more than 75 order lines are ranked, which keeps
# rarely-bought products out of the top ten.
grouped = grouped[grouped.reorder_total > 75].sort_values(['reorder_probability'], ascending=False)[:10]
grouped