In [1]:
%matplotlib inline 

# Classification
***

## Table of Contents
***
* [Aim](#aim)
* [Code Setup](#code-setup)
* [Data Import](#data-import)

## Aim <a class="anchor" id="aim"></a>
***

The aim of this notebook is to perform classification on the Instacart data: given a user's previous orders, predict which products will be in their next order.

## Code Setup <a class="anchor" id="code-setup"></a>
***

### Constants

In [2]:
# Directory holding the CSV files. The trailing slash matters: read_csv_file
# builds paths by plain string concatenation (instacart_dir + file_name).
instacart_dir = "instacart_2017_05_01/"
# File names below are relative to instacart_dir.
# NOTE(review): the "my_" prefixed files are presumably locally produced subsets
# of the Instacart data — confirm where they are generated.
my_orders_file = "my_orders.csv"
products_file = "products.csv"
aisles_file = "aisles.csv"
# Declared but not read by any cell visible in this notebook.
departments_file = "departments.csv"
orders_products_prior_file = "my_order_products__prior.csv"
orders_products_train_file = "my_order_products__train.csv"

### Import Libraries

In [3]:
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 6]
import seaborn as sns
sns.set_style("darkgrid")
sns.set_context("paper")
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

### Useful Functions

In [4]:
def read_csv_file(file_name):
    """Read one CSV from the data directory into a DataFrame.

    The path is the module-level ``instacart_dir`` with ``file_name``
    appended by plain string concatenation (no separator is inserted).
    """
    csv_path = instacart_dir + file_name
    return pd.read_csv(csv_path)

In [5]:
# Get the relative reorder frequency of each product
def relative_reoder_frequency(os_ps):
    """Return, per product, how often it was reordered relative to the order count.

    Parameters
    ----------
    os_ps : pandas.DataFrame
        Order lines with at least ``order_id``, ``product_id`` and
        ``reordered`` columns.

    Returns
    -------
    dict
        ``{product_id: frequency}`` in order of first appearance, where
        frequency is ``rows with reordered == 1 for the product`` divided by
        ``(number of distinct order_ids - 1)``, rounded to 3 decimals.
        When there is exactly one distinct order the value is ``0``.
    """
    unique_prod_ids = os_ps.product_id.unique().tolist()
    unique_orders_less_one = len(os_ps.order_id.unique()) - 1

    # Count the reordered rows of every product in a single pass instead of
    # filtering the whole frame once per product.
    reorder_counts = (os_ps.loc[os_ps["reordered"] == 1, "product_id"]
                      .value_counts()
                      .to_dict())

    reoder_freq_per_prod_dict = {}
    for prod_id in unique_prod_ids:
        rel_reorder = 0
        if unique_orders_less_one != 0:
            number_of_times_reordered = int(reorder_counts.get(prod_id, 0))
            rel_reorder = round(number_of_times_reordered / unique_orders_less_one, 3)
        reoder_freq_per_prod_dict[prod_id] = rel_reorder
    return reoder_freq_per_prod_dict

In [6]:
# Get the relative product count
def product_relative_count(os_ps):
    """Return, per product, its average share of the orders it appears in.

    For every product the reciprocal sizes (``1 / rows in the order``) of the
    distinct orders containing it are summed, then divided by the number of
    distinct ``order_id`` values in ``os_ps`` and rounded to 3 decimals.

    Parameters
    ----------
    os_ps : pandas.DataFrame
        Order lines with at least ``order_id`` and ``product_id`` columns.

    Returns
    -------
    dict
        ``{product_id: relative_count}`` in order of first appearance.
    """
    unique_prod_ids = os_ps.product_id.unique().tolist()
    number_of_orders = len(os_ps.order_id.unique())

    # Size of every order, computed once rather than once per product.
    order_sizes = os_ps.groupby("order_id").size().to_dict()

    # Distinct orders each product appears in, collected in a single pass.
    order_ids_per_product = {}
    for prod_id, order_id in zip(os_ps.product_id.tolist(), os_ps.order_id.tolist()):
        order_ids_per_product.setdefault(prod_id, set()).add(order_id)

    rel_prod_count_dict = {}
    for prod_id in unique_prod_ids:
        relative_order_count = 0
        # Accumulate in ascending order_id order with a plain loop so the
        # floating-point result matches a per-product groupby on order_id.
        for order_id in sorted(order_ids_per_product[prod_id]):
            relative_order_count = relative_order_count + 1 / int(order_sizes[order_id])
        rel_prod_count_dict[prod_id] = round(relative_order_count / number_of_orders, 3)
    return rel_prod_count_dict

In [7]:
def products_reoder_rate_mean(p_r_r_d):
    """Average each product's list of rates, rounded to 3 decimals.

    ``p_r_r_d`` maps a product id to a non-empty list of numeric rates; the
    result maps the same ids (same order) to the rounded arithmetic mean.
    """
    return {
        product: round(sum(rate_list) / len(rate_list), 3)
        for product, rate_list in p_r_r_d.items()
    }

In [8]:
# Using Linear Regression get the reorder rate based on position in cart
def obtain_redorder_rate_per_product_on_cart_position(os_ps):
    """Estimate a per-product reorder rate from the cart positions it occupied.

    A straight line is fitted to ``reorder_rate`` versus ``add_to_cart_order``
    over the rows of ``os_ps``. The fitted line is then evaluated at the cart
    position of every row, and the predictions are averaged per product.

    Parameters
    ----------
    os_ps : pandas.DataFrame
        Order lines with at least ``product_id``, ``add_to_cart_order`` and
        ``reordered`` columns.

    Returns
    -------
    dict
        ``{product_id: mean predicted reorder rate}`` (rounded to 3 decimals
        by ``products_reoder_rate_mean``), in order of first appearance.
    """
    reorder_rate_to_position = (os_ps
                            .groupby("add_to_cart_order")["reordered"]
                            .sum()
                            .reset_index(name="reordered_count")
                           )

    notreordered_orders_products = os_ps.query("reordered == 0")
    notreorder_rate_to_position = (notreordered_orders_products
                            .groupby("add_to_cart_order")
                            .size()
                            .reset_index(name="not_reordered_count")
                           )
    # Left merge + fillna keeps cart positions where every row was a reorder.
    order_rates = reorder_rate_to_position.merge(notreorder_rate_to_position, on="add_to_cart_order", how="left")
    order_rates = order_rates.fillna(0)
    order_rates['reorder_rate'] = order_rates['reordered_count'] /(order_rates['not_reordered_count'] + order_rates['reordered_count'])
    # DataFrame.as_matrix was removed in pandas 1.0; selecting with a list of
    # columns and taking .values yields the same 2-D (n, 1) arrays.
    x_training_set = order_rates[['add_to_cart_order']].values
    y_training_set = order_rates[['reorder_rate']].values

    # Use a line as a quadratic gave negative predictions at cart position of 30 and onwards
    # The line gives negative rate after a position of 36 and onwards
    poly = PolynomialFeatures(degree=1)
    x_training_set_transform = poly.fit_transform(x_training_set)

    reorder_rate_reg_model = linear_model.LinearRegression()
    reorder_rate_reg_model.fit(x_training_set_transform,y_training_set)

    # Predict every row's cart position in one batch. The transformer is
    # already fitted, so transform() is used (not fit_transform()), and the
    # input is kept 2-D as scikit-learn requires.
    positions_to_test = poly.transform(os_ps[['add_to_cart_order']].values)
    predicted_rates = reorder_rate_reg_model.predict(positions_to_test)[:, 0].tolist()

    # Group the predictions by product, preserving row order within a product
    # and first-appearance order across products.
    products_reoder_rate_dict = {}
    for product_id, predicted_rate in zip(os_ps.product_id.tolist(), predicted_rates):
        products_reoder_rate_dict.setdefault(product_id, []).append(predicted_rate)
    return products_reoder_rate_mean(products_reoder_rate_dict)

In [9]:
def merge_orders_products(df):
    """Join orders to their order lines and keep the modelling columns.

    ``df`` is merged with the module-level ``orders_products`` frame on
    ``order_id``; only order_id, product_id, add_to_cart_order, order_number
    and reordered are returned. ``df`` itself is not modified.
    """
    wanted_columns = ["order_id", "product_id", "add_to_cart_order","order_number", "reordered"]
    joined = df.copy().merge(orders_products, on="order_id")
    return joined[wanted_columns]

In [10]:
def generate_empty_dfs_with_columns(ords_prods):
    """Create zero-filled training and test frames for features and product mix.

    Columns come from the distinct product ids in ``ords_prods`` (sorted):
    the product-mix frames have one column per product (``str(product_id)``);
    the feature frames have ``<id>_rel_count``, ``<id>_reorder_freq`` and
    ``<id>_reorder_rate`` per product. Training frames have one row per
    distinct order minus one; test frames have a single row.

    Returns
    -------
    tuple
        ``(features, product_mix, test_features, test_product_mix)``.
    """
    order_total = len(ords_prods.order_id.unique().tolist())

    # This will be every product in all the orders we intend to use for prediction
    sorted_product_ids = sorted(ords_prods.product_id.unique().tolist())
    mix_columns = [str(product) for product in sorted_product_ids]
    feature_suffixes = ("_rel_count", "_reorder_freq", "_reorder_rate")
    feature_columns = [label + suffix for label in mix_columns for suffix in feature_suffixes]

    # Training frames: number of rows equal to previous orders - 1, filled with 0s
    training_rows = range(order_total - 1)
    product_mix = pd.DataFrame(0, index=training_rows, columns=mix_columns)
    features = pd.DataFrame(0, index=training_rows, columns=feature_columns)

    # Test frames: a single row, filled with 0s
    test_product_mix = pd.DataFrame(0, index=range(1), columns=mix_columns)
    test_features = pd.DataFrame(0, index=range(1), columns=feature_columns)
    return (features, product_mix, test_features, test_product_mix)

In [11]:
def populate_product_mix(product_ids, product_mix_df, location):
    """Return a copy of ``product_mix_df`` with row ``location`` flagged for the given products.

    For each id in ``product_ids`` whose string form is a column label, the
    cell at row position ``location`` in that column is set to 1. Ids with no
    matching column are skipped. The input frame is left untouched.
    """
    updated_mix = product_mix_df.copy()
    for product in product_ids:
        column_label = str(product)
        # We cannot predict what products we have not seen
        if column_label not in updated_mix.columns.values:
            continue
        column_position = updated_mix.columns.get_loc(column_label)
        updated_mix.iloc[location, column_position] = 1
    return updated_mix

In [12]:
def populate_features(ords, features, location):
    """Return a copy of ``features`` with row ``location`` filled from ``ords``.

    Three per-product dictionaries are computed from the order lines in
    ``ords`` (cart-position reorder rate, relative reorder frequency, relative
    product count) and written into the ``<id>_reorder_freq``,
    ``<id>_rel_count`` and ``<id>_reorder_rate`` columns of the given row.
    The input frame is left untouched.
    """
    populated = features.copy()
    reorder_rate_by_product = obtain_redorder_rate_per_product_on_cart_position(ords)
    reorder_freq_by_product = relative_reoder_frequency(ords)
    relative_count_by_product = product_relative_count(ords)

    # Assign Feature Values to df
    for product in relative_count_by_product:
        label = str(product)
        values_by_suffix = (
            ("_reorder_freq", reorder_freq_by_product.get(product)),
            ("_rel_count", relative_count_by_product.get(product)),
            ("_reorder_rate", reorder_rate_by_product.get(product)),
        )
        for suffix, value in values_by_suffix:
            populated.iloc[location, populated.columns.get_loc(label + suffix)] = value
    return populated

In [13]:
# A Massive function for crunching the data and putting it into the train and test formats we want
def obtain_features_and_product_mix(training_orders, test_order):
    """Build training and test feature / product-mix frames from an order history.

    The rows of ``training_orders`` are treated as a sequence: for the i-th
    row (i >= 2) the features are computed from the order lines of rows
    ``1 .. i-1`` and the product mix is taken from row ``i`` itself. The test
    features are computed from all training order lines and the test product
    mix from ``test_order``.

    Parameters
    ----------
    training_orders : pandas.DataFrame
        One row per order (needs ``order_id``), in the sequence the caller
        wants them processed.
    test_order : pandas.DataFrame
        Exactly one row: the order to predict.

    Returns
    -------
    tuple
        ``(features, product_mix, test_features, test_product_mix)``.

    Raises
    ------
    ValueError
        If ``test_order`` does not contain exactly one row.
    """
    number_of_test_orders = test_order.shape[0]
    if number_of_test_orders != 1:
        raise ValueError('You should only have one next product mix to predict')

    number_of_orders = training_orders.shape[0]

    training_orders_prod = merge_orders_products(training_orders)
    test_orders_prod = merge_orders_products(test_order)

    features, product_mix, test_features, test_product_mix = generate_empty_dfs_with_columns(training_orders_prod)

    # For the Training DFs
    # Fill the feature and product mix train dfs
    for i in range(2, number_of_orders + 1):
        # This is the position of the DF we will fill, usual i loop headaches
        j = i - 2

        # Rows before the i-th provide the features; the i-th row is the target.
        # The current order is taken by position, so it can never also be one
        # of the "before" rows. For input sorted ascending by a unique order_id
        # this is the same row as the largest order_id of the first i rows.
        orders_before_current = merge_orders_products(training_orders.iloc[:i - 1])
        current_order_products = merge_orders_products(training_orders.iloc[i - 1:i])

        products_for_current_order = current_order_products.product_id.unique().tolist()

        # Assign Features
        features = populate_features(orders_before_current, features, j)

        # Assign Product Mix
        product_mix = populate_product_mix(products_for_current_order, product_mix, j)

    # For the Test DFs
    # Assign test features to test df
    test_features = populate_features(training_orders_prod, test_features, 0)
    # Assign product mix to test df
    products_for_test_order = test_orders_prod.product_id.unique().tolist()
    test_product_mix = populate_product_mix(products_for_test_order, test_product_mix, 0)

    return (features, product_mix, test_features, test_product_mix)

In [14]:
def find_product_from_ids(arr, product_id_list, products):
    """Look up the product rows that correspond to the flagged positions of ``arr``.

    Parameters
    ----------
    arr : sequence
        Flags aligned with ``product_id_list``; positions equal to 1 are selected.
    product_id_list : sequence
        Product ids (or their string labels, e.g. product-mix column names).
    products : pandas.DataFrame
        Product table with a ``product_id`` column.

    Returns
    -------
    pandas.DataFrame
        Rows of ``products`` whose ``product_id`` was selected, sorted by
        ``product_id`` ascending.
    """
    # Compare ids as strings so that string labels (product-mix column names)
    # match numeric product_id values; isin does not coerce between the two.
    predicted_product_ids = [
        str(product_id_list[index])
        for index, value in enumerate(arr)
        if value == 1
    ]
    is_predicted = products['product_id'].astype(str).isin(predicted_product_ids)
    found_products = products[is_predicted].sort_values("product_id", ascending=True)
    return found_products

### Data Load

In [15]:
# Load the CSV files named in the Constants cell.
orders = read_csv_file(my_orders_file)
products = read_csv_file(products_file)
aisles = read_csv_file(aisles_file)
orders_products_prior = read_csv_file(orders_products_prior_file)
orders_products_train = read_csv_file(orders_products_train_file)
# Stack the prior and train order lines into the single table that
# merge_orders_products joins against.
orders_products = pd.concat([orders_products_prior, orders_products_train])
# One DataFrame of orders per user, in the order user ids first appear.
orders_per_user = []
user_ids = orders.user_id.unique().tolist()
for user_id in user_ids:
    orders_per_user.append(orders.query("user_id == " + str(user_id)))
print("There are ", len(user_ids), "users with a corresponding entry in order history list who's length is also", len(orders_per_user))

There are  20 users with a corresponding entry in order history list who's length is also 20


## Classification for Single User
***

In [16]:
# Work with a single user's order history (position 4 in orders_per_user).
specific_user_order_df = orders_per_user[4]
# Each entry of orders_per_user was filtered on one user_id, so the first row identifies the user.
user_id = specific_user_order_df.user_id.tolist()[0]
print("This user", user_id , "has ", specific_user_order_df.shape[0], " orders")
specific_user_order_df.head()

This user 47562 has  89  orders


Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
26,790798,2178653,47562,prior,1,4,9,
27,790799,3355208,47562,prior,2,2,9,5.0
28,790800,213516,47562,prior,3,4,7,2.0
29,790801,1498655,47562,prior,4,2,7,5.0
30,790802,324368,47562,prior,5,3,8,1.0


In [17]:
# Hold out the user's highest-numbered order as the order to predict;
# every lower-numbered order becomes training data.
max_order_number = specific_user_order_df.order_number.max()
next_user_order = specific_user_order_df.query("order_number == " + str(max_order_number))
# NOTE(review): the training rows are sorted by order_id, not order_number, so
# their sequence is not the purchase sequence (the head() output shows
# order_number 34, 22, 69, ...). obtain_features_and_product_mix treats the
# rows as a sequence — confirm this ordering is intended.
train_user_orders = specific_user_order_df.query("order_number < " + str(max_order_number)).sort_values("order_id", ascending=True)
train_user_orders.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
59,790831,98406,47562,prior,34,3,8,2.0
47,790819,117008,47562,prior,22,5,9,1.0
94,790866,128325,47562,prior,69,4,15,3.0
55,790827,137193,47562,prior,30,3,8,5.0
75,790847,139812,47562,prior,50,2,11,1.0


In [18]:
# Build the training and test feature / product-mix frames for this user.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt and
# the cells below it have no execution count.
train_features, train_product_mix, test_features, test_product_mix = obtain_features_and_product_mix(train_user_orders, next_user_order)

KeyboardInterrupt: 

In [None]:
# Wrap a polynomial-kernel SVC in a one-vs-rest classifier, fit it on the
# training features against the product-mix columns, and display its
# predictions for those same training rows.
clf = OneVsRestClassifier(SVC(kernel='poly'))
clf.fit(train_features, train_product_mix)
clf.predict(train_features)

As can be seen above, prediction only seems to be happening in the 2nd column (index 1) and the 6th and 7th columns (indices 5 and 6 respectively). This may be a result of some of the features driving the predictors to 0.

In [None]:
# Show the actual training product mix as a plain array for comparison with
# the predictions above. DataFrame.as_matrix() was removed in pandas 1.0;
# .values returns the same array.
train_product_mix.values

In [None]:
# Score the fitted classifier on the same rows it was trained on.
clf.score(train_features, train_product_mix)

Out of the 4 training cases, it predicted one fully. Given that there were only 5 test orders, only 4 of which can be predicted, this is not too bad.

In [None]:
# Predict the held-out order's product mix and flatten it into a plain list
# with one flag per product-mix column.
predicted_product_mix = clf.predict(test_features).flatten().tolist()
print(predicted_product_mix)

In [None]:
# The single test row as a plain list of flags, for comparison with the
# prediction. DataFrame.as_matrix() was removed in pandas 1.0; .values
# returns the same array.
actual_product_mix = test_product_mix.values[0].tolist()
print(actual_product_mix)

If we score the test data against the prediction we get 0. However, if we look at what it did predict correctly, it got 2 of the products in the next order right and mispredicted 1, so this is not a bad result. The 2nd, 6th and 7th columns were all populated, so the model seems biased towards those columns, though the results were quite good.

Let's look at what the products actually were.

In [None]:
# Map the predicted flags back to rows of the products table via the
# product-mix column labels; head(shape[0]) displays every matching row.
found_predicted_products = find_product_from_ids(predicted_product_mix, train_product_mix.columns.values, products)
found_predicted_products.head(found_predicted_products.shape[0])

In [None]:
# Same lookup for the products that were actually in the held-out order.
actual_products = find_product_from_ids(actual_product_mix, train_product_mix.columns.values, products)
actual_products.head(actual_products.shape[0])

Overall the classification seems to have worked well in that this customer did buy a snack and washing powder again.

## Classification for All Orders
***

In [None]:
# Split all users' orders: rows with the highest order_number are held out,
# every lower-numbered order is training data (sorted by order_id).
max_order_number = orders.order_number.max()
next_order = orders.query("order_number == " + str(max_order_number))
train_orders = orders.query("order_number < " + str(max_order_number)).sort_values("order_id", ascending=True)
# Preview the frame built in this cell (previously this displayed
# train_user_orders from the single-user section).
train_orders.head()

In [None]:
# Build the feature / product-mix frames across all users' orders.
# obtain_features_and_product_mix raises ValueError unless next_order has exactly one row.
# NOTE(review): this rebinds the train_/test_ frames produced in the single-user section.
train_features, train_product_mix, test_features, test_product_mix = obtain_features_and_product_mix(train_orders, next_order)

## Comments
***
I wanted to use a neural network, but there was not enough data on a per-user basis to do this.

## Initial EDA for Classification
***

## Take a specific User

In [None]:
# Exploratory section: take the first user's order history.
# NOTE(review): this rebinds specific_user_order_df and user_id from the single-user section above.
specific_user_order_df = orders_per_user[0]
user_id = specific_user_order_df.user_id.tolist()[0]
print("This user", user_id , "has ", specific_user_order_df.shape[0], " orders")
specific_user_order_df.head()

### Split into Train and Test Set

In [None]:
# Hold out the highest-numbered order as the test order; the rest is training data.
# Unlike the single-user classification section, the training rows are not sorted here.
max_order_number = specific_user_order_df.order_number.max()
next_user_order = specific_user_order_df.query("order_number == " + str(max_order_number))
train_user_orders = specific_user_order_df.query("order_number < " + str(max_order_number))
next_user_order.head()

### Join in the Order Products

In [None]:
# Attach order lines to the training orders and keep the modelling columns
# (the same join and column selection as merge_orders_products, written inline).
train_user_orders_prod = train_user_orders.merge(orders_products, on="order_id")[["order_id", "product_id", "add_to_cart_order","order_number", "reordered"]]
train_user_orders_prod.head()

## Reorder Rate vs Add to Cart Investigation
***
Note: the analysis below was used to aid in the creation of the `obtain_redorder_rate_per_product_on_cart_position` function.

In [None]:
# Preview the combined prior + train order lines.
orders_products.head()

In [None]:
# Sum the reordered column per cart position, over all order lines.
# Despite the variable name this holds a count per position, not a rate.
reorder_rate_to_position = (orders_products
                            .groupby("add_to_cart_order")["reordered"]
                            .sum()
                            .reset_index(name="reordered_count")
                           )
total_reorder_count = reorder_rate_to_position.reordered_count.sum()
print("Total times products reordered :", total_reorder_count)
reorder_rate_to_position.head()

In [None]:
# Bar chart of reorder count per cart position.
fig, ax = plt.subplots()
reorder_rate_to_position['reordered_count'].plot.bar()
# Bars are plotted against the row index, so label the ticks with the actual cart positions.
ax.set_xticklabels(reorder_rate_to_position["add_to_cart_order"], rotation='horizontal')
plt.title("Reorder Count vs Position in Cart")
plt.ylabel("Number of Reorders for that position")
plt.xlabel("Cart Position")
plt.show()

This graph shows that the item that is first in the cart is the most reordered.

### Not Reordered to Position

In [None]:
# Count the order lines with reordered == 0 per cart position.
notreordered_orders_products = orders_products.query("reordered == 0")
notreorder_rate_to_position = (notreordered_orders_products
                            .groupby("add_to_cart_order")
                            .size()
                            .reset_index(name="not_reordered_count")
                           )
total_notreorder_count = notreorder_rate_to_position.not_reordered_count.sum()
print("Total times products not reordered :", total_notreorder_count)
notreorder_rate_to_position.head()

In [None]:
# Bar chart of not-reordered count per cart position.
fig, ax = plt.subplots()
notreorder_rate_to_position['not_reordered_count'].plot.bar()
# Bars are plotted against the row index, so label the ticks with the actual cart positions.
ax.set_xticklabels(notreorder_rate_to_position["add_to_cart_order"], rotation='horizontal')
plt.title("Not Reorder Count vs Position in Cart")
plt.ylabel("Number of Orders for that position there were not a reorder")
plt.xlabel("Cart Position")
plt.show()

In [None]:
# Combine both counts per cart position.
# NOTE(review): no `how` is given here, whereas
# obtain_redorder_rate_per_product_on_cart_position uses how="left" followed by
# fillna(0); cart positions missing from either table are presumably dropped
# by this merge — confirm which behaviour is wanted.
order_rates = notreorder_rate_to_position.merge(reorder_rate_to_position, on="add_to_cart_order")
order_rates.head()

In [None]:
# Reorder rate per cart position = reordered / (reordered + not reordered).
order_rates['reorder_rate'] = order_rates['reordered_count'] /(order_rates['not_reordered_count'] + order_rates['reordered_count'])
order_rates.head()

In [None]:
# Line plot of reorder rate against cart position.
fig = plt.figure()
plt.plot(order_rates["add_to_cart_order"], order_rates['reorder_rate'], "-o")
plt.title("Reorder Rate by Cart Position")
plt.xlabel("Cart Position")
plt.ylabel("Reorder Rate")

### Regression Model with 1st Order Polynomial - Add to Cart Reorder Rate

As can be seen the graph breaks down after about the 20th position in the cart

In [None]:
# DataFrame.as_matrix was removed in pandas 1.0; selecting with a list of
# columns and taking .values yields the same 2-D (n, 1) arrays.
x_training_set = order_rates[['add_to_cart_order']].values
y_training_set = order_rates[['reorder_rate']].values

# Use a line as a quadratic gave negative predictions at cart position of 30 and onwards
# The line gives negative rate after a position of 36 and onwards
poly = PolynomialFeatures(degree=1)
x_training_set_transform = poly.fit_transform(x_training_set)

reorder_rate_reg_model = linear_model.LinearRegression()
reorder_rate_reg_model.fit(x_training_set_transform,y_training_set)

train_score = reorder_rate_reg_model.score(x_training_set_transform, y_training_set)

pos_to_test = 35
# The transformer is already fitted, so use transform(); scikit-learn expects
# a 2-D (n_samples, n_features) input, hence the nested list.
position_to_test = poly.transform([[pos_to_test]])
preditced_reorder_rate_from_position = reorder_rate_reg_model.predict(position_to_test)[0][0]

print("The model score :", train_score, " using the training data and for a cart position of :",pos_to_test, 
      "the model preditced a reorder rate of :", preditced_reorder_rate_from_position)