# Assignment 2 - Instacart Analysis

## Aim

This assignment is based on the __Instacart data set__, and we want you to perform some exploratory data analysis tasks and construct a number of models:
 
To ensure you all have a comparable but different expreience in this assignmet we will slice the Instacart based on your student ID. 

To reduce time when reading data, your slice will be saved in a new set of CSV files (with prefix my_). Also the size of the slice can be modified via const __MAX_USERS__ --- smaller means faster but less reliable analyis. 

We considered wrapping this code up in a separate script but thought that it would be useful for you to see how the slices were constucted and saved.

 * [The Instacart Online Grocery Shopping Dataset 2017](https://www.instacart.com/datasets/grocery-shopping-2017), <br />
Accessed from https://www.instacart.com/datasets/grocery-shopping-2017 on February 2018.
 * For details of the columns in the CSV files see [https://gist.github.com/jeremystan/c3b39d947d9b88b3ccff3147dbcf6c6b](https://gist.github.com/jeremystan/c3b39d947d9b88b3ccff3147dbcf6c6b)

In [1]:
# This data is used to slice the Instacart dataset and to scale the problem
# 
# The reduced dataset files (instacart_2017_05_01/my_*.csv) need to be manually deleted whenever these values change 
# or change flag force_rebuild to True

NAME = "Colm Carew" # Change this to your name
ID = 20053766 # Change this to your student number

# Number of customers in your slice 
# For 
MAX_USERS = 20                # Was 2, <206209
MAX_PRODUCTS = 2000

use_all = False # True
force_rebuild = True

## Load Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
sns.set_context("paper")
from itertools import combinations, groupby
from collections import Counter
import sys, os, glob
import tarfile
import urllib.request

## Download the Instacart Data

In [3]:
instacart_dir = "instacart_2017_05_01"
instacart_file_name = "instacart_online_grocery_shopping_2017_05_01.tar.gz"
if (not os.path.isfile(instacart_file_name)) and (not os.path.isdir(instacart_dir)) :
    #Ask User to Download File
    file_url = "https://s3.amazonaws.com/instacart-datasets/instacart_online_grocery_shopping_2017_05_01.tar.gz"
    print("Please Wait until file downloads")
    urllib.request.urlretrieve(file_url, instacart_file_name)
    print(instacart_file_name, " successfully downloaded")

## Extract the Instacart Data

In [4]:
if ((not os.path.isdir(instacart_dir)) and (os.path.isfile(instacart_file_name))) :
    print("Please wait until", instacart_file_name, " is extracted")
    tar = tarfile.open(instacart_file_name, "r:gz")
    tar.extractall()
    tar.close()
    print(instacart_file_name, " has been extracted")

## Load Full Dataset and Slice Based on Student ID

In [5]:
def size(obj):
    """Return size of object in MB"""
    return "{0:.2f} MB".format(sys.getsizeof(obj) / (1000 * 1000))

In [None]:
if not use_all and (not os.path.isfile(instacart_dir + "/my_orders.csv") or force_rebuild):
    
    # read in full table  
    orders = pd.read_csv(instacart_dir + "/orders.csv")
    print("(ALL) orders -- dimensions: {0};   size: {1}".format(orders.shape, size(orders)))
    
    # take a random slice based on student ID
    np.random.seed(ID)
    my_users = np.random.choice(orders.user_id.unique(), size=MAX_USERS, replace=False)
    my_orders = orders[orders.user_id.isin(my_users)]
    
    # save for later use (speedup loading)
    my_orders.to_csv(instacart_dir + "/my_orders.csv")
                     
# load reduced dataset
orders = pd.read_csv(instacart_dir + "/%sorders.csv" % ("" if use_all else "my_"))

print("orders -- dimensions: {0};   size: {1}".format(orders.shape, size(orders)))
display(orders.head())

(ALL) orders -- dimensions: (3421083, 7);   size: 376.24 MB
orders -- dimensions: (306, 8);   size: 0.04 MB


Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,12202,472199,765,prior,1,2,21,
1,12203,1424394,765,prior,2,1,21,20.0
2,12204,889292,765,prior,3,6,21,30.0
3,12205,2809253,765,prior,4,3,20,30.0
4,12206,2392861,765,prior,5,1,12,19.0


In [None]:
if not use_all and (not os.path.isfile(instacart_dir + "/my_order_products__prior.csv") or force_rebuild):
    
    # read in full table 
    orders_products_prior = pd.read_csv(instacart_dir + "/order_products__prior.csv")
    print("(ALL) order_products_prior -- dimensions: {0};   size: {1}"
        .format(orders_products_prior.shape, size(orders_products_prior)))

    # take slice based on slice of orders
    my_orders = orders.order_id.unique()
    my_orders_products_prior = orders_products_prior[orders_products_prior.order_id.isin(my_orders)]
    
    # save for later use (speedup loading)
    my_orders_products_prior.to_csv(instacart_dir + "/my_order_products__prior.csv")

# load reduced dataset
orders_products_prior = pd.read_csv(instacart_dir + "/%sorder_products__prior.csv" % ("" if use_all else "my_"))

print("orders_products_prior -- dimensions: {0};   size: {1}"
    .format(orders_products_prior.shape, size(orders_products_prior)))
display(orders_products_prior.head())

In [None]:
if not use_all and (not os.path.isfile(instacart_dir + "/my_order_products__train.csv") or force_rebuild):
    
    # read in full table 
    orders_products_train = pd.read_csv(instacart_dir + "/order_products__train.csv")
    print("(ALL) order_products_train -- dimensions: {0};   size: {1}"
        .format(orders_products_train.shape, size(orders_products_train)))

    # take slice based on slice of orders
    my_orders = orders.order_id.unique()
    my_orders_products_train = orders_products_train[orders_products_train.order_id.isin(my_orders)]
    
    # save for later use (speedup loading)
    my_orders_products_train.to_csv(instacart_dir + "/my_order_products__train.csv")

# load reduced dataset
orders_products_train = pd.read_csv(instacart_dir + "/%sorder_products__train.csv" % ("" if use_all else "my_"))

print("orders_products_train -- dimensions: {0};   size: {1}"
    .format(orders_products_train.shape, size(orders_products_train)))
display(orders_products_train.head())

In [None]:
if not use_all and (not os.path.isfile(instacart_dir + "/my_order_products__train.csv") or force_rebuild):
    
    # read in full table 
    orders_products_train = pd.read_csv(instacart_dir + "/order_products__train.csv")
    print("(ALL) order_products_train -- dimensions: {0};   size: {1}"
        .format(orders_products_train.shape, size(orders_products_train)))

    # take slice based on slice of orders
    my_orders = orders.order_id.unique()
    my_orders_products_train = orders_products_train[orders_products_train.order_id.isin(my_orders)]
    
    # save for later use (speedup loading)
    my_orders_products_train.to_csv(instacart_dir + "/my_order_products__train.csv")

# load reduced dataset
orders_products_train = pd.read_csv(instacart_dir + "/%sorder_products__train.csv" % ("" if use_all else "my_"))

print("orders_products_train -- dimensions: {0};   size: {1}"
    .format(orders_products_train.shape, size(orders_products_train)))
display(orders_products_train.head())

In [None]:
products = pd.read_csv(instacart_dir + "/products.csv")
print('products -- dimensions: {0};   size: {1}'
    .format(products.shape, size(products)))
display(products.head())

In [None]:
departments = pd.read_csv(instacart_dir + "/departments.csv")
print('departments -- dimensions: {0};   size: {1}'
    .format(departments.shape, size(departments)))
display(departments.head())

In [None]:
aisles = pd.read_csv(instacart_dir + "/aisles.csv")
print('aisles -- dimensions: {0};   size: {1}'
    .format(aisles.shape, size(aisles)))
display(aisles.head())

## Derived Dataframes

In [None]:
# concatenate the _prior and _train datasets
orders_products = pd.concat([orders_products_prior, orders_products_train])
print("orders_products -- dimensions: {0};   size: {1}"
    .format(orders_products.shape, size(orders_products)))

In [None]:
# expand orders dataframe to include product info 
orders_and_products = orders.merge(orders_products, on="order_id")
print("orders_and_products -- dimensions: {0};   size: {1}"
    .format(orders_and_products.shape, size(orders_and_products)))

In [None]:
# Limit analysis to top products only
top_products = pd.DataFrame({'total_count':orders_products.groupby("product_id").size()})\
    .sort_values('total_count', ascending=False).reset_index()[:MAX_PRODUCTS]
top_products = top_products.merge(products, on='product_id')
top_products.head()

In [None]:
# keep only orders with products in top_products
orders_and_products = orders_and_products.loc[orders_and_products['product_id'].isin(top_products.product_id)]
print("orders_products -- dimensions: {0};   size: {1}"
    .format(orders_products.shape, size(orders_products)))

## Exploratory Data Analysis

To help you get started we want you to preform a number of exploratory data analysis tasks. The tasks are intended to be roughly of the same level of difficulty but some will require input from you --- for example deciding the more suitable chart type, or cutoff points so that the more interesting detail is not lost, etc.

Again the tasks selected are based on your student id that you entered above.

In [None]:
print("List of tasks:\n")
np.random.seed(ID)
for k,task in enumerate(np.random.choice([
    # orders
    {"brief": "Number of Orders in Order History", "description":"Graph showing the frequency of the number of previous order."},
    {"brief": "Distribution of Order Size", "description":"Chart showing the number of products frequency of number of orders in order history."},
    {"brief": "Distrbution of Orders by Hour of Day", "description":"Chart showing the frequency of orders by hour of day."},
    {"brief": "Distrbution of Orders by Hour of Day", "description":"Chart showing the frequency of orders by hour of day."},
    {"brief": "Distrbution of Order Size", "description":"Chart showing the frequency of number of products in orders."},
    # time between orders
    {"brief": "Distrbution of Days Since Previous Order", "description":"Chart showing the frequency of number of days since previous order."},
    # products sold
    {"brief": "Distrbution of Top Selling Products", "description": "Chart dhowing the frequency of top selling products."},
    {"brief": "How Often are Products Reorder?", "description": "Chart showing reordered/not-reordered products."},
    {"brief": "Most Often Reordered Products ", "description": "Which products are reordered most often and which probability?"},
    {"brief": "Number of Times Reordered", "description": "Graph of number of products vs number of times reordered."},
    # customers
    {"brief": "Customer with the Most Reordering", "description": "Chart of customers by percentage of products reordered with respect to total products ordered."},
    {"brief": "How many Distinct Product do Customers Buy? ", "description": "Chart of distribution of number of unique products in a customers history."},
    {"brief": "How many Orders do Customers make?", "description": "Chart of distribution of number of order in order history."},
    {"brief": "Customer Reorder Rate", "description": "Chart of distribution proportion of products reorder with respect to total products per customer."},


    # aisles
    {"brief": "Top Selling Aisles", "description": "Chart of number of products sold by aisle"},
    {"brief": "Number of Products by Aisle", "description": "Chart of number of products in stock by aisle"},
    # departments
    {"brief": "Top Selling Department", "description": "Chart of number of products sold by department"},
    {"brief": "Number of Products by Department", "description": "Chart of number of products in stock by department"},
    {"brief": "Department Size vs Sales", "description": "Graph showin ranking of department size (number of product) vs sales ranking"},
    # department and aisles
    {"brief": "How are Aisles Organized Within Departments?", "description": "Multi-chart showing distribution of products on aisles for various departments."},
    {"brief": "Which Product do People Put into the Cart First?", "description": "Chart of probability of product being first item placed in backset."}

    #{"brief": "", "description": ""}
],size=9, replace=False)):
    print ("Task {:2d}: {}\n\t{}".format(k+1,task['brief'],task['description']))

In [None]:
products.head()

In [None]:
df_orders = pd.read_csv(instacart_dir + "/orders.csv")

In [None]:
df_orders.head()

In [None]:
df_orders.eval_set.unique()

In [None]:
# Get list of user IDs included in test set
test_user_ids = orders[orders.eval_set == "test"].user_id
# Get list of user IDs included in train set
orig_train_user_ids = orders[orders.eval_set == "train"].user_id

users_in_test_set = len(test_user_ids.index)
users_in_train_set = len(orig_train_user_ids.index)

print("Users in test set: {0}.".format(users_in_test_set))
print("Users in train set: {0}.".format(users_in_train_set))

print("\nSplit training into 80:20 - training:validation\n")

users_in_val_set = users_in_train_set // 5
users_in_train_set = users_in_train_set - users_in_val_set

print("Users in train set: {0}".format(users_in_train_set))
print("Users in validation set: {0}".format(users_in_val_set))

In [None]:
75000/(75000+131209)