In [1]:
# Import-a-ton
import pandas as pd
import sqlite3

# Instacart

## Setting up SQL

In [2]:
# Open the SQLite database file; the cells below pass this connection to
# pandas for every to_sql / read_sql call.
connection = sqlite3.connect(
    './datasets/instacart/sql/instacart.db.sqlite'
)

## Products

In [3]:
# Read the products CSV (UTF-8 encoded) into a DataFrame
products = './datasets/instacart/products.csv'
df_product = pd.read_csv(
    filepath_or_buffer=products,
    encoding='utf8',
)

In [4]:
# Preview the first five rows to sanity-check the load
df_product.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [5]:
# Store the products frame as the `product` table, replacing any existing
# table of that name; the DataFrame index is not written.
df_product.to_sql(
    name='product',
    con=connection,
    if_exists='replace',
    index=False,
)

## Orders

In [6]:
# Read the orders CSV (UTF-8 encoded) into a DataFrame
orders = './datasets/instacart/orders.csv'
df_order = pd.read_csv(
    filepath_or_buffer=orders,
    encoding='utf8',
)

In [7]:
# Preview the first five rows to sanity-check the load
df_order.head(n=5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [8]:
# Store the orders frame as the `orders` table, replacing any existing
# table of that name; the DataFrame index is not written.
df_order.to_sql(
    name='orders',
    con=connection,
    if_exists='replace',
    index=False,
)

## Orders and Products

In [9]:
# Read the order_products CSV (UTF-8 encoded) into a DataFrame
order_products = './datasets/instacart/order_products.csv'
df_main = pd.read_csv(
    filepath_or_buffer=order_products,
    encoding='utf8',
)

In [10]:
# Preview the first five rows to sanity-check the load
df_main.head(n=5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [11]:
# Store the order_products frame as the `main` table, replacing any existing
# table of that name; the DataFrame index is not written.
df_main.to_sql(
    name='main',
    con=connection,
    if_exists='replace',
    index=False,
)

## SQL power!

We take only `user_id`, `product_id`, and `product_name` to form a new DataFrame, then save it into its own SQL table and a CSV file.

This is our `groceries` dataset.

In [12]:
# Query text for the groceries dataset: for each row of `main`, look up the
# matching product (by product_id) and the matching order (by order_id), and
# keep only the order's user_id, the product_id and the product_name.
# INNER JOIN means rows of `main` without a match in either table are dropped.
sql_query = '''
SELECT orders.user_id, main.product_id, product.product_name 
FROM main
INNER JOIN product ON product.product_id = main.product_id
INNER JOIN orders ON orders.order_id = main.order_id
'''

In [13]:
# Run the three-table join and pull the whole result set into a DataFrame.
# Expect this cell to run for a long time.
groceries = pd.read_sql(
    sql=sql_query,
    con=connection,
)

In [14]:
# Preview the first five rows of the joined result
groceries.head(n=5)

Unnamed: 0,user_id,product_id,product_name
0,202279,33120,Organic Egg Whites
1,202279,28985,Michigan Organic Kale
2,202279,9327,Garlic Powder
3,202279,45918,Coconut Butter
4,202279,30035,Natural Sweetener


In [15]:
# Summarise the joined frame: row count, column dtypes and memory footprint
groceries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 3 columns):
user_id         int64
product_id      int64
product_name    object
dtypes: int64(2), object(1)
memory usage: 742.4+ MB


In [16]:
# Store the joined frame as the `groceries` table, replacing any existing
# table of that name; the DataFrame index is not written.
#
# Expect this to take a while as well.
groceries.to_sql(
    name='groceries',
    con=connection,
    if_exists='replace',
    index=False,
)

In [17]:
# Write the groceries frame to a UTF-8 CSV file.
#
# index=False keeps the file to the three data columns (user_id, product_id,
# product_name), consistent with the to_sql calls in this notebook. Without it
# pandas writes the RangeIndex as an extra unnamed first column, which shows
# up as "Unnamed: 0" when the file is read back with pd.read_csv.
# NOTE(review): any existing reader that uses index_col=0 on this file must
# drop that argument.
#
# This will take a long time, almost as long as the inner join query.
groceries.to_csv(
    './datasets/instacart/groceries.csv',
    encoding='utf8',
    index=False,
)