# Naive Reorder Logic Overview

Let us set a really simple baseline: for each user, compute the average number of reordered items per basket over their prior orders, round it down and call it n, then predict the n products that user has reordered most often. When n is 0, predict "None".

Build the per-user statistics from the prior orders, then predict on the validation orders and score the predictions with F1.

In [2]:
import sqlite3
import numpy as np
import pandas as pd

# Average Reorder Per User

In [3]:
# For each user, calculate average number of reordered items in a basket.
# Step 1: build reorder_per_order, one row per prior order holding the sum of
# its `reordered` column (i.e. the number of reordered items in that basket).
conn = sqlite3.connect("instacart.db")
try:
    cur = conn.cursor()

    # Drop any table left by an earlier run so this cell can be re-executed;
    # CREATE TABLE raises if the table already exists.
    cur.execute("DROP TABLE IF EXISTS reorder_per_order;")

    # Number of reorders per order
    cur.execute("CREATE TABLE reorder_per_order AS "
                "SELECT order_id, SUM(reordered) as reordered "
                "FROM products_prior "
                "GROUP BY order_id;")

    conn.commit()
finally:
    # Release the database file even when a statement fails
    conn.close()

In [None]:
# Build reorder_per_user_prior: each user's mean number of reordered items
# per order, computed by joining orders to reorder_per_order.
conn = sqlite3.connect("instacart.db")
cur = conn.cursor()

drop_user_avg_sql = "DROP TABLE IF EXISTS reorder_per_user_prior;"

create_user_avg_sql = (
    "CREATE TABLE reorder_per_user_prior AS "
    "SELECT A.user_id as user_id, AVG(B.reordered) as reorder_avg "
    "FROM orders A INNER JOIN reorder_per_order B "
    "ON A.order_id = B.order_id "
    "WHERE A.order_number > 1 "  # a user's first order cannot contain reorders
    "GROUP BY A.user_id;"
)

# Discard the table from any earlier run, then rebuild it
for statement in (drop_user_avg_sql, create_user_avg_sql):
    cur.execute(statement)

conn.commit()
conn.close()

In [10]:
# Load the per-user reorder averages from SQLite into a DataFrame
conn = sqlite3.connect("instacart.db")
try:
    reorder_per_user_prior = pd.read_sql_query("SELECT * FROM reorder_per_user_prior;", conn)
finally:
    # Close the connection even if the read fails
    conn.close()

# The table is built with GROUP BY user_id, so every user must appear once;
# a duplicate would be silently collapsed when this frame is turned into a dict.
assert reorder_per_user_prior["user_id"].is_unique, "duplicate user_id in reorder_per_user_prior"

# verify that result of SQL query is stored in the dataframe
print(reorder_per_user_prior.head())


   user_id  reorder_avg
0        1     4.555556
1        2     7.153846
2        3     5.000000
3        4     0.250000
4        5     4.666667


# Most Reordered Products Per User

In [None]:
# Build reorder_by_user_product: for every (user, product) pair, the number of
# prior order lines in which that product was flagged as a reorder.
conn = sqlite3.connect("instacart.db")
cur = conn.cursor()

drop_counts_sql = "DROP TABLE IF EXISTS reorder_by_user_product;"

create_counts_sql = (
    "CREATE TABLE reorder_by_user_product AS "
    "SELECT A.user_id as user_id, "
    "  B.product_id as product_id, "
    "  count(*) as reorder_count "
    "FROM orders A INNER JOIN products_prior B "
    "  ON A.order_id = B.order_id "
    "WHERE B.reordered = 1 "
    "GROUP BY A.user_id, B.product_id;"
)

# Discard the table from any earlier run, then rebuild it
for statement in (drop_counts_sql, create_counts_sql):
    cur.execute(statement)

conn.commit()
conn.close()

In [7]:
# Pull the per-user, per-product reorder counts out of SQLite into pandas
select_counts_sql = "SELECT * FROM reorder_by_user_product;"

conn = sqlite3.connect("instacart.db")
reordered_products_per_user = pd.read_sql_query(select_counts_sql, conn)
conn.close()

In [8]:
# Rank each user's products from most to least reordered (rank 1 = highest
# reorder_count). method="first" breaks ties by row order, so every rank is
# unique within a user.
reorder_counts_by_user = reordered_products_per_user.groupby('user_id')['reorder_count']
reordered_products_per_user["reorder_rank"] = reorder_counts_by_user.rank(method="first", ascending=False)

# Predict for Validation

In [5]:
# Grab validation orders and user ids.
# NOTE(review): `order_id % 10 >= 7` presumably carves a ~30% hold-out out of
# the 'train' eval set - confirm it matches the split used to build
# actual_results.
conn = sqlite3.connect("instacart.db")
try:
    validation_4_pred = pd.read_sql_query("SELECT order_id, user_id "
                                          "FROM orders "
                                          "WHERE eval_set = 'train' "
                                          "  AND order_id % 10 >= 7;", conn)
finally:
    # Close the connection even if the read fails
    conn.close()

In [11]:
# Plain dicts give fast key lookups inside the prediction loop

# order_id -> user_id
validation_by_order = validation_4_pred.set_index('order_id')
validation_4_pred_dict = validation_by_order['user_id'].to_dict()

# user_id -> reorder_avg
averages_by_user = reorder_per_user_prior.set_index('user_id')
reorder_per_user_prior_dict = averages_by_user['reorder_avg'].to_dict()

In [12]:
# Build the naive prediction for every validation order:
#   order_id -> set of predicted product ids, or {"None"} when no reorder is
#   predicted for that order.
naive_pred = {}

# Collect each validation user's products once, best rank first, instead of
# boolean-filtering the whole DataFrame inside the loop for every order.
validation_users = set(validation_4_pred_dict.values())
ranked_products = (
    reordered_products_per_user[reordered_products_per_user.user_id.isin(validation_users)]
    .sort_values(['user_id', 'reorder_rank'])
)
products_by_user = {}
for ranked_user, ranked_product in zip(ranked_products['user_id'].tolist(),
                                       ranked_products['product_id'].tolist()):
    products_by_user.setdefault(ranked_user, []).append(ranked_product)

# Derive the progress denominator from the data rather than hard-coding it
total_orders = len(validation_4_pred_dict)

# NOTE(review): product ids are added with whatever type SQLite/pandas returned
# (presumably int). If actual_results stores them as strings, no product can
# ever match - confirm the types agree before trusting the score.

# For each order:
for progress_counter, (order, user_id) in enumerate(validation_4_pred_dict.items(), start=1):

    # Watch the progress
    if progress_counter % 1000 == 0:
        print(progress_counter / total_orders)

    # Number of items to predict: the user's average reorders per basket,
    # rounded down. A user with no (or a NULL) average gets 0.
    reorder_avg = reorder_per_user_prior_dict.get(user_id)
    if reorder_avg is None or pd.isna(reorder_avg):
        reorder_count = 0
    else:
        reorder_count = int(np.floor(reorder_avg))

    # Slicing never raises, even if the user has fewer ranked products than
    # reorder_count (or none at all).
    top_products = products_by_user.get(user_id, [])[:reorder_count]

    # Predict the most frequently reordered items, or "None" when there is
    # nothing to predict.
    naive_pred[order] = set(top_products) if top_products else {"None"}


0.02526017985248055
0.0505203597049611
0.07578053955744164
0.1010407194099222
0.12630089926240276
0.15156107911488328
0.17682125896736384
0.2020814388198444
0.22734161867232494
0.2526017985248055
0.27786197837728605
0.30312215822976657
0.32838233808224715
0.3536425179347277
0.37890269778720825
0.4041628776396888
0.42942305749216936
0.4546832373446499
0.47994341719713046
0.505203597049611
0.5304637769020916
0.5557239567545721
0.5809841366070526
0.6062443164595331
0.6315044963120138
0.6567646761644943
0.6820248560169748
0.7072850358694553
0.732545215721936
0.7578053955744165
0.783065575426897
0.8083257552793776
0.8335859351318582
0.8588461149843387
0.8841062948368192
0.9093664746892998
0.9346266545417803
0.9598868343942609
0.9851470142467414


# Read in F1, actual_results

In [4]:
# Execute the helper notebooks inside this kernel. The scoring cell that
# follows uses the names `f1` and `actual_results`, which are presumably
# defined by these two notebooks - verify if either is renamed.
%run F1_score.ipynb
%run Load_actual_results.ipynb





# Score and Record

In [13]:
# Score the naive_reorder model against the actual validation baskets.
# `f1` and `actual_results` are not defined in this notebook (presumably they
# come from the helper notebooks run above). The result is kept because the
# recording cell concatenates it onto ("Naive_Reorder",) to fill the
# F1 / True_Positives / False_Positives / False_Negatives columns, so it is
# expected to be a 4-item tuple.
naive_reorder = f1(naive_pred, actual_results)

True Positives:  887
False Positives: 183617
False Negatives: 253235
Precision:       0.0048074838485886486
Recall:          0.003490449469152612
----------------------------
F1: 0.004044447889545991


In [17]:
# Record this model's score in the model_results table of instacart.db.
# sqlite3 and pandas are already imported in the notebook's first code cell.
con = sqlite3.connect("instacart.db")
try:
    cur = con.cursor()

    # Remove the row written by any earlier run so that re-executing this
    # cell replaces the result instead of appending a duplicate.
    cur.execute("DELETE FROM model_results WHERE Model = ?;", ("Naive_Reorder",))

    # Insert into model results
    cur.execute("INSERT INTO model_results (Model, F1, True_Positives, "
                "False_Positives, False_Negatives) VALUES (?, ?, ?, ?, ?);",
                ("Naive_Reorder",) + tuple(naive_reorder))

    con.commit()

    # Print contents of model_results
    print(pd.read_sql_query("SELECT * FROM model_results;", con))
finally:
    # Close the connection even if a statement fails
    con.close()

           Model        F1  True_Positives  False_Positives  False_Negatives
0    Dummy Model  0.017609            2586            37002           251536
1  Naive_Reorder  0.004044             887           183617           253235
