# Instacart

## Imports

In [1]:
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib as plt
import time
import multiprocessing as mp
from datetime import datetime


In [2]:
pd.set_option("display.max_rows", None, "display.max_columns", None)


In [4]:
data_path = Path.cwd() / "data"
print(data_path)

c:\Users\Super\Desktop\Python-master\Python_Important\instacart\data


## First Look at Datasets

In [5]:
# datasets = []

# for i in os.listdir(data_path):
#     if i[-4:] == ".csv":
#         datasets.append(i)

# print(datasets)

datasets = sorted([i for i in  os.listdir(data_path) if i[-4:] == ".csv"])
print(datasets)

['aisles.csv', 'departments.csv', 'order_products__prior.csv', 'order_products__train.csv', 'orders.csv', 'products.csv', 'sample_submission.csv']


In [6]:
for i in datasets:
    print(i)
    
    exec(f"{i[:-4]} = pd.read_csv(data_path / i)")
    exec(f"display({i[:-4]}.head())")
    exec(f"print(len({i[:-4]}))")


aisles.csv


Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


134
departments.csv


Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


21
order_products__prior.csv


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


32434489
order_products__train.csv


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


1384617
orders.csv


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


3421083
products.csv


Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


49688
sample_submission.csv


Unnamed: 0,order_id,products
0,17,39276 29259
1,34,39276 29259
2,137,39276 29259
3,182,39276 29259
4,257,39276 29259


75000


### Understanding how the datasets fit together

In [7]:
for i in datasets:
    print("\n")
    print(i[:-4])
    print(list(eval(f"{i[:-4]}.columns")))
    # print(list(eval(f"{i[:-4]}.unique()")))
    for j in eval(f"{i[:-4]}"):
        print(j)
        #print(list(eval(f"{i[:-4]}{[j]}.unique()")))



aisles
['aisle_id', 'aisle']
aisle_id
aisle


departments
['department_id', 'department']
department_id
department


order_products__prior
['order_id', 'product_id', 'add_to_cart_order', 'reordered']
order_id
product_id
add_to_cart_order
reordered


order_products__train
['order_id', 'product_id', 'add_to_cart_order', 'reordered']
order_id
product_id
add_to_cart_order
reordered


orders
['order_id', 'user_id', 'eval_set', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']
order_id
user_id
eval_set
order_number
order_dow
order_hour_of_day
days_since_prior_order


products
['product_id', 'product_name', 'aisle_id', 'department_id']
product_id
product_name
aisle_id
department_id


sample_submission
['order_id', 'products']
order_id
products


## Join and Merge

In [7]:
order_products = order_products__prior.append(order_products__train)
order_products =order_products.sample(frac = 0.2)
display(order_products.head())
print(len(order_products))

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
22288791,2350962,4193,15,1
5283194,557619,19382,6,1
13398584,1414078,22504,1,0
9827183,1037684,13838,7,1
18105907,1909825,15290,21,1


6763821


In [8]:
orders_and_products = order_products.copy()

In [9]:
orders_and_products = orders_and_products.merge(products, how = "left",on = "product_id")

In [10]:
display(orders_and_products[orders_and_products["order_id"] == 1])
print(len(orders_and_products))

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
3832367,1,11109,2,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16
4131595,1,10246,3,0,Organic Celery Hearts,83,4
5638183,1,22035,8,1,Organic Whole String Cheese,21,16


6763821


In [11]:
orders_and_products = orders_and_products.merge(aisles, how = "left",on = "aisle_id")
orders_and_products = orders_and_products.merge(departments, how = "left",on = "department_id")


In [12]:
display(orders_and_products[orders_and_products["order_id"] == 1])
print(len(orders_and_products))

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
3832367,1,11109,2,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16,other creams cheeses,dairy eggs
4131595,1,10246,3,0,Organic Celery Hearts,83,4,fresh vegetables,produce
5638183,1,22035,8,1,Organic Whole String Cheese,21,16,packaged cheese,dairy eggs


6763821


In [13]:
orders_and_products = orders_and_products.merge(orders, how = "left",on = "order_id")

In [14]:
display(orders_and_products[orders_and_products["order_id"] == 1])
print(len(orders_and_products))

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
3832367,1,11109,2,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16,other creams cheeses,dairy eggs,112108,train,4,4,10,9.0
4131595,1,10246,3,0,Organic Celery Hearts,83,4,fresh vegetables,produce,112108,train,4,4,10,9.0
5638183,1,22035,8,1,Organic Whole String Cheese,21,16,packaged cheese,dairy eggs,112108,train,4,4,10,9.0


6763821


## Augmenting Data

### Functions

In [15]:
def find_items_in_order(product_id):
    x = order_products[product_id == order_products["product_id"]]
    orders = list(x["order_id"].unique())
    items = 0
    for i in orders:
        items += len(order_products[i == order_products["order_id"]]["product_id"].unique())
    if len(orders) == 0:
        return 0
    return items / len(orders)



In [16]:
def find_items_in_order_mp(df, data_path, counter):
    start = time.time()
    idxs = list(df.index)
    # df["counter"] = counter
    for i in idxs:
        
        product_id = df.loc[i, "product_id"]
        x = order_products[product_id == order_products["product_id"]]
        orders = list(x["order_id"].unique())
        items = 0
        for j in orders:
            items += len(order_products[j == order_products["order_id"]]["product_id"].unique())
        if len(orders) == 0:
            df.loc[i, "items_in_order_average"] = 0
        else:
            df.loc[i, "items_in_order_average"] = items / len(orders)
        
        
        

        if counter < 1 and i % 24 == 0:
            now = time.time()
            print(" \n\n\nthe program has elapsed for (mins): " + str(round(((start - time)/60), 2)))
            print("percentage completed: " + str(round(idxs.index(i) * 100 / len(df))))
            df.loc[i, "counter"] = counter
        
        counter += 1e-8
        
        
        
    # queue.put(results)
    df.to_csv(data_path / "augmented" / f"results_{int(counter)}.csv")

In [17]:
def find_order_times(products_info, product_id, index):
    x = order_products[1 == order_products["product_id"]]
    order_days = list(x["order_dow"])
    for i in order_days:
        products_info.loc[index, f"day_of_week:{i}"] += 1


    order_hours = list(x["order_hour_of_day"])
    for i in order_hours:
        products_info.loc[index, f"hour_of_day:{i}"] += 1
    
    return products_info


### Single Loop

In [18]:
order_products = order_products__prior.append(order_products__train)
order_products = order_products.sample(frac = 0.2)

products_info = products.copy()
display(products_info.head())
print(len(products_info))

display(order_products.head())
print(len(order_products))

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


49688


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
10049389,1061111,19679,6,0
8031485,847830,15290,1,0
8117045,856794,34262,29,0
20520365,2164630,42342,8,1
5925471,625380,34169,11,1


6763821


In [19]:
order_products = order_products.merge(orders, how = "left",on = "order_id")

In [20]:
display(order_products[order_products["order_id"] == 1])

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
835851,1,13176,6,0,112108,train,4,4,10,9.0
1096670,1,47209,7,0,112108,train,4,4,10,9.0
3006041,1,49683,4,0,112108,train,4,4,10,9.0


In [21]:
products_info["times_ordered_average"] = 0
products_info["add_to_cart_order_average"] = 0

products_info["days_since_prior_order_average"] = 0
products_info["times_reordered_average"] = 0
products_info["items_in_order_average"] = 0
for i in range(7):
    products_info[f"day_of_week:{i}"] = 0
for i in range(24):
    products_info[f"hour_of_day:{i}"] = 0


In [22]:
display(products_info.head(15))

Unnamed: 0,product_id,product_name,aisle_id,department_id,times_ordered_average,add_to_cart_order_average,days_since_prior_order_average,times_reordered_average,items_in_order_average,day_of_week:0,day_of_week:1,day_of_week:2,day_of_week:3,day_of_week:4,day_of_week:5,day_of_week:6,hour_of_day:0,hour_of_day:1,hour_of_day:2,hour_of_day:3,hour_of_day:4,hour_of_day:5,hour_of_day:6,hour_of_day:7,hour_of_day:8,hour_of_day:9,hour_of_day:10,hour_of_day:11,hour_of_day:12,hour_of_day:13,hour_of_day:14,hour_of_day:15,hour_of_day:16,hour_of_day:17,hour_of_day:18,hour_of_day:19,hour_of_day:20,hour_of_day:21,hour_of_day:22,hour_of_day:23
0,1,Chocolate Sandwich Cookies,61,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,All-Seasons Salt,104,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,Robust Golden Unsweetened Oolong Tea,94,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,Green Chile Anytime Sauce,5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,6,Dry Nose Oil,11,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,7,Pure Coconut Water With Orange,98,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,8,Cut Russet Potatoes Steam N' Mash,116,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,9,Light Strawberry Blueberry Yogurt,120,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [23]:
#all lines: 1 min 1 sec
# not last line: 1 min 0.4 sec
# not items in order average: 2.15 sec

In [24]:
%%time
for i in range(len(products_info)):
#     if i == 3:
#         break
    product_id = products_info["product_id"][i]
    products_info["times_ordered_average"][i] = len(order_products.loc[order_products["product_id"] == product_id]["order_id"].unique())
    products_info["add_to_cart_order_average"][i] = np.mean(order_products.loc[order_products["product_id"] == product_id]["add_to_cart_order"])
    products_info["days_since_prior_order_average"][i] = np.mean(order_products.loc[order_products["product_id"] == product_id]["days_since_prior_order"])
    products_info["times_reordered_average"][i] = np.mean(order_products[order_products["product_id"] == product_id]["reordered"])
#     products_info["items_in_order_average"][i] = find_items_in_order(product_id)
    products_info = find_order_times(products_info, product_id, i)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

In [None]:
display(products_info.head())

Unnamed: 0,product_id,product_name,aisle_id,department_id,times_ordered_average,add_to_cart_order_average,days_since_prior_order_average,times_reordered_average,items_in_order_average,day_of_week:0,day_of_week:1,day_of_week:2,day_of_week:3,day_of_week:4,day_of_week:5,day_of_week:6,hour_of_day:0,hour_of_day:1,hour_of_day:2,hour_of_day:3,hour_of_day:4,hour_of_day:5,hour_of_day:6,hour_of_day:7,hour_of_day:8,hour_of_day:9,hour_of_day:10,hour_of_day:11,hour_of_day:12,hour_of_day:13,hour_of_day:14,hour_of_day:15,hour_of_day:16,hour_of_day:17,hour_of_day:18,hour_of_day:19,hour_of_day:20,hour_of_day:21,hour_of_day:22,hour_of_day:23
0,1,Chocolate Sandwich Cookies,61,19,1928,5.0,10.0,0.0,0,206,414,285,272,308,318,125,12,12,9,5,4,7,13,39,82,146,213,175,163,175,175,130,145,142,67,55,37,48,36,38
1,2,All-Seasons Salt,104,13,94,10.0,10.0,0.0,0,206,414,285,272,308,318,125,12,12,9,5,4,7,13,39,82,146,213,175,163,175,175,130,145,142,67,55,37,48,36,38
2,3,Robust Golden Unsweetened Oolong Tea,94,7,283,6.0,10.0,0.0,0,206,414,285,272,308,318,125,12,12,9,5,4,7,13,39,82,146,213,175,163,175,175,130,145,142,67,55,37,48,36,38
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,351,9.0,14.0,0.0,0,206,414,285,272,308,318,125,12,12,9,5,4,7,13,39,82,146,213,175,163,175,175,130,145,142,67,55,37,48,36,38
4,5,Green Chile Anytime Sauce,5,13,16,6.0,13.0,0.0,0,206,414,285,272,308,318,125,12,12,9,5,4,7,13,39,82,146,213,175,163,175,175,130,145,142,67,55,37,48,36,38


In [None]:
print(len(products_info))
test = products_info.sample(frac=0.005)
print(len(test))
test["items_in_order_average"] = test["product_id"].apply(find_items_in_order)
# previous line same as: products_info["items_in_order_average"][i] = find_items_in_order(product_id)
# test["example"] = test["product_id"].apply(lambda x:x += 10)

# test["example"] = test["product_id"] + 10


49688
248


In [None]:
products_info.to_csv(data_path / "augmented" / "products_info_part1.csv", index = False)

In [None]:
products_info = pd.read_csv(data_path / "augmented" / "products_info_part1.csv")

### Multiprocessing

In [None]:
# n_jobs = 10

# for i in range(n_jobs):
#     #launch a task
#     pass

In [None]:

n_jobs = 7
# df_split is type list containg 8 pd.DFs
df_split = np.array_split(products_info, n_jobs)
# queue = mp.Queue()

counter = 0
for i in range(n_jobs):
    p = mp.Process(target=find_items_in_order_mp, args=(df_split[i], data_path, counter))
    p.start()
    p.join()
    counter += 1


NameError: name 'mp' is not defined

In [None]:

# results = [queue.get(i) for i in queue]

results = []
for i in range(n_jobs):
    results.append(pd.read_csv(data_path / "augmented" / f"results_{i}.csv"))
    # os.remove(data_path / "augmented" / f"results_{i}.csv")
df = pd.concat(results)
df.sort_values("counter", inplace =  True)

df.to_csv(data_path / "augmented" / "products_info.csv")
display(df.head())

## Kaggle Style

In [10]:
order_products = order_products__prior.append(order_products__train)
print(len(order_products))
display(order_products.head())

33819106


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [11]:
orders_df = orders.copy()
print(len(orders_df))
display(orders_df.head())

3421083


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [12]:
# If this cell errors, re run all cells below kaggle style becaue if cell is run twice, duplicate colunm names get appended with _x and _y.
order_products = order_products.merge(products,how="left", on="product_id")
order_products = order_products.merge(departments,how="left", on="department_id")
order_products = order_products.merge(aisles,how="left", on="aisle_id")


In [13]:
display(order_products.head())

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,department,aisle
0,2,33120,1,1,Organic Egg Whites,86,16,dairy eggs,eggs
1,2,28985,2,1,Michigan Organic Kale,83,4,produce,fresh vegetables
2,2,9327,3,0,Garlic Powder,104,13,pantry,spices seasonings
3,2,45918,4,1,Coconut Butter,19,13,pantry,oils vinegars
4,2,30035,5,0,Natural Sweetener,17,13,pantry,baking ingredients


In [14]:
display(order_products.loc[order_products["order_id"] == 112])

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,department,aisle
32434570,112,27104,1,1,Fresh Cauliflower,83,4,produce,fresh vegetables
32434571,112,21174,2,1,I Heart Baby Kale,123,4,produce,packaged vegetables fruits
32434572,112,41860,3,0,Sea Salt Baked Potato Chips,107,19,snacks,chips pretzels
32434573,112,38273,4,0,Marinara Pasta Sauce,9,9,dry goods pasta,pasta sauce
32434574,112,47209,5,0,Organic Hass Avocado,24,4,produce,fresh fruits
32434575,112,5876,6,1,Organic Lemon,24,4,produce,fresh fruits
32434576,112,29217,7,0,Coconut Water Kefir,31,7,beverages,refrigerated
32434577,112,9047,8,0,Premium Epsom Salt,133,11,personal care,muscles joints pain relief
32434578,112,4549,9,0,Umcka Elderberry Intensive Cold + Flu Berry Fl...,11,11,personal care,cold flu allergy
32434579,112,22425,10,0,Hickory Honey Barbeque Baked Potato Chips,107,19,snacks,chips pretzels


## Functions

In [15]:
def get_items(order_id):
    x = order_products.loc[order_products["order_id"] == order_id]
    lst = []
    for i in x.index:
        lst.append(order_products.loc[i, "product_id"])
    lst = list(sorted(set(lst)))
    lst = [str(i) for i in lst]
    return " ".join(lst)

def get_items_mp(df):
    pass


In [16]:
print(get_items(112))

4549 5876 9047 11776 21174 22425 27104 29217 38273 41860 47209


In [18]:
# for loop: 118
test = orders_df.sample(frac=0.001)
test["items"] = ""
print(len(orders_df))
print(len(test))

test["items"] = test["order_id"].apply(get_items)
# for i in test.index:
#     test.loc[i, "items"] = get_items(test.loc[i, "order_id"])



3421083
3421


In [None]:
orders_df = orders_df.sample(frac=0.5)
n_jobs = 7
df_split = np.array_split(orders_df, n_jobs)
jobs = []
counter = 0
for i in range(n_jobs):
    task = mp.Process(target = get_items_mp, args = (df_split[i]))
    jobs.append(task)
    counter += 1





for task in jobs:
    task.start()




for task in jobs:
    task.join()
    