# Instacart Data Exploration


## Introduction
- Goal: Predict items in a customer's NEXT order
- Open data files, explore CSVs
- Plot some preliminary exploratory visualizations (market basket analysis / collaborative filtering?)

In [2]:
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd


# File reading packages
import pickle
import json
import csv
import datetime # For handling dates

# The "requests" library makes working with HTTP requests easier
import requests
import os


## Read data files

### Supermarket Organization: Aisles, Departments, and Products 
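- The aisle and department files contain a key and a description for each aisle and department; the products file maps each product to its aisle and department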
- 134 aisles, 21 departments
- Might consider putting them in **dictionary** structure

TODO: 
- Which aisles are associated with which departments?
- Print products in the same aisle or department, for example

In [8]:
# Directory that holds the Instacart CSV files. The original author's local
# path is kept as the default; set the INSTACART_DATA_DIR environment variable
# to run the notebook against a different location without editing this cell.
data_wd = os.environ.get(
    "INSTACART_DATA_DIR",
    os.path.join("C:\\", "Users", "alex", "Documents", "Instacart_Data", "data"),
)

# Build the file path with os.path.join so the platform's separator is used
# instead of a hard-coded Windows backslash.
aisles = pd.read_csv(os.path.join(data_wd, "aisles.csv"))

In [11]:
# Print the dimensions as a (rows, columns) tuple, then preview the first five
# rows (five is the default for DataFrame.head).
print((len(aisles.index), len(aisles.columns)))
aisles.head()

(134, 2)


Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [16]:
# Load the departments table. os.path.join supplies the platform's path
# separator instead of a hard-coded Windows backslash.
departments = pd.read_csv(os.path.join(data_wd, "departments.csv"))
print(departments.shape)  # (rows, columns)
departments.head(5)

(21, 2)


Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


In [15]:
# Load the products table. os.path.join supplies the platform's path
# separator instead of a hard-coded Windows backslash.
products = pd.read_csv(os.path.join(data_wd, "products.csv"))
print(products.shape)  # (rows, columns)
products.head(5)

(49688, 4)


Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [24]:
# table = pd.pivot_table(products, index=['aisle_id'],
#                     columns=['department_id'], aggfunc=np.sum)

# TODO: For each department, print its associated aisles


### Customer Data

- Orders: which set the order belongs to (prior, train, test), day of week (dow), hour of day, days since last order (NaN if first order)
- Split this into Prior, Train, and Test dataframes

In [20]:
# Load the orders table (one row per order, tagged with its eval_set).
# os.path.join supplies the platform's path separator instead of a
# hard-coded Windows backslash.
orders = pd.read_csv(os.path.join(data_wd, "orders.csv"))
print(orders.shape)  # (rows, columns)
orders.head(5)

(3421083, 7)


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [18]:
# Load the order/product rows from the "prior" file. os.path.join supplies
# the platform's path separator instead of a hard-coded Windows backslash.
order_products_prior = pd.read_csv(os.path.join(data_wd, "order_products__prior.csv"))
print(order_products_prior.shape)  # (rows, columns)
order_products_prior.head(5)

(32434489, 4)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [19]:
# Load the order/product rows from the "train" file. os.path.join supplies
# the platform's path separator instead of a hard-coded Windows backslash.
order_products_train = pd.read_csv(os.path.join(data_wd, "order_products__train.csv"))
print(order_products_train.shape)  # (rows, columns)
order_products_train.head(5)

(1384617, 4)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1
