In [2]:
# Load required libraries
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

In [3]:
# Main dataframes, loaded once here and then read as module-level globals by
# the helper functions defined in this notebook.
#   df_orders - contents of orders.csv
#   df_prod_p - contents of order_products__prior.csv
df_orders = pd.read_csv('../instacart-market-basket-analysis/orders.csv')
df_prod_p = pd.read_csv('../instacart-market-basket-analysis/order_products__prior.csv')

In [4]:
def get_uids():
    """Return the distinct user ids found in the global ``df_orders``.

    Returns:
        list: one entry per distinct ``user_id``; for repeated ids the first
        occurrence is the one kept, so the list follows the row order of
        ``df_orders``.
    """
    unique_users = df_orders['user_id'].drop_duplicates()
    return list(unique_users)

def get_chrono_oids_by_uid(uid, inc_train = False):
    """Return the order ids that belong to one user.

    Args:
        uid: value matched against the ``user_id`` column of ``df_orders``.
        inc_train: when truthy, every order of the user is returned whatever
            its ``eval_set``; otherwise only rows whose ``eval_set`` equals
            ``'prior'`` are kept.

    Returns:
        list: matching ``order_id`` values in the row order of ``df_orders``
        (no explicit sort is applied here).
    """
    selected = df_orders['user_id'] == uid
    if not inc_train:
        # Narrow the selection to the user's 'prior' orders only.
        selected = selected & (df_orders['eval_set'] == 'prior')
    return list(df_orders.loc[selected]['order_id'])

def df_orders_by_uid(uid):
    """Return the rows of the global ``df_orders`` whose ``user_id`` equals ``uid``."""
    belongs_to_user = df_orders['user_id'] == uid
    return df_orders.loc[belongs_to_user]


def df_prod_by_uid(uid, oids):
    """Collect the rows of the global ``df_prod_p`` for the given orders.

    Args:
        uid: unused by the lookup; kept so existing callers keep working.
        oids: iterable of order ids. The returned rows are grouped in the
            order in which the ids appear here.

    Returns:
        pandas.DataFrame: the ``df_prod_p`` rows whose ``order_id`` matches
        each id in turn, or an empty DataFrame when ``oids`` is empty.
    """
    # Gather one slice per order and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated frame on every call.
    per_order = [df_prod_p.loc[df_prod_p['order_id'] == oid] for oid in oids]
    if not per_order:
        # pd.concat raises on an empty list; mirror the original empty result.
        return pd.DataFrame()
    return pd.concat(per_order)




def prod_info_from_df(sdf_prod):
    """Summarise an order/product table.

    Args:
        sdf_prod: DataFrame with ``order_id`` and ``product_id`` columns. An
            empty DataFrame (including one with no columns) is accepted.

    Returns:
        tuple: ``(prod_set, ord_prod_dict)`` where ``prod_set`` is the set of
        all product ids seen and ``ord_prod_dict`` maps each order id to the
        list of its product ids, both in row order.
    """
    prod_set      = set()
    ord_prod_dict = dict()
    if sdf_prod.empty:
        # Nothing to scan; an empty frame may not even carry the two columns.
        return (prod_set, ord_prod_dict)
    # Walk the two columns positionally instead of looking rows up by index
    # label: label lookups return a Series when index labels repeat, which
    # breaks the set/dict inserts below.
    for oid, pid in zip(sdf_prod['order_id'], sdf_prod['product_id']):
        prod_set.add(pid)
        ord_prod_dict.setdefault(oid, []).append(pid)
    return (prod_set, ord_prod_dict)

def df_from_sprod_info(sprod_info):
    """Build a 0/1 order-by-product table.

    Args:
        sprod_info: pair whose first item is the collection of product ids
            and whose second item maps each order id to the list of product
            ids in that order.

    Returns:
        pandas.DataFrame: one row per order (index named ``"order_id"``) and
        one column per product id; a cell holds 1 when the order lists the
        product and 0 otherwise.
    """
    products = sprod_info[0]
    orders = sprod_info[1]
    rows = []
    for oid in orders.keys():
        # Start every order with all products flagged as absent.
        flags = dict.fromkeys(products, 0)
        for pid in orders[oid]:
            flags[pid] = 1
        rows.append(flags)
    p_info = pd.DataFrame(data = rows, index = orders.keys())
    p_info.index.name = "order_id"
    return p_info

def gen_training_data(df):
    """Turn each column of ``df`` into one (features, label) sample.

    Args:
        df: DataFrame with at least one row; each column is handled
            independently.

    Returns:
        tuple: ``(features, labels, s_pids)`` where, for the i-th column,
        ``features[i]`` is the list of its values except the last row,
        ``labels[i]`` is the value in its last row, and ``s_pids[i]`` is the
        column label.
    """
    s_pids = list(df.columns)
    features, labels = [], []
    for pid in s_pids:
        column_values = list(df[pid])
        # All rows but the last are inputs; the last row is the target.
        features.append(column_values[:-1])
        labels.append(column_values[-1])
    return features, labels, s_pids
    

def gen_prediction_result(X, y, pids, test_size=0.3, random_state=0):
    """Train a perceptron on a random split of the samples and score it.

    Args:
        X: sequence of feature vectors, one per sample.
        y: sequence of labels aligned with ``X``.
        pids: identifiers aligned with ``X``; not used by the computation,
            kept so existing callers keep working.
        test_size: fraction of the samples held out for testing
            (default 0.3, i.e. a 70/30 split).
        random_state: seed for the train/test split so that repeated runs
            return the same score; pass ``None`` for a fresh random split
            on every call.

    Returns:
        The accuracy of the perceptron's predictions on the held-out
        samples, as computed by ``accuracy_score``.
    """
    # Seed the split as well as the model: with an unseeded split the
    # reported accuracy changes from run to run.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Perceptron with at most 40 passes over the data and a learning rate of 0.1.
    ppn = Perceptron(max_iter = 40, eta0=0.1, random_state=0)
    ppn.fit(X_train, y_train)

    # Score the predictions for the held-out samples against their true labels.
    y_pred = ppn.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [9]:
def gen_user_prediction(uid):
    """Run the whole per-user pipeline and return the perceptron accuracy.

    Args:
        uid: id of the user whose order history is analysed.

    Returns:
        The score returned by ``gen_prediction_result`` for that user's
        order-by-product table.
    """
    # Use the uid argument throughout; the previous body read the global
    # s_uid, so the argument was silently ignored and the function only
    # worked after a later cell had defined that global.
    s_oids = get_chrono_oids_by_uid(uid, True)
    sdf_prod = df_prod_by_uid(uid, s_oids)

    # Summarise the order rows, then expand them into a 0/1 order-by-product table.
    sprod_info = prod_info_from_df(sdf_prod)
    ordered_df = df_from_sprod_info(sprod_info)

    X, y, pids = gen_training_data(ordered_df)
    return gen_prediction_result(X, y, pids)


In [10]:
# Collect every distinct user id, then keep the first one as the sample
# user (s_uid) for the cells below.
uids = get_uids()
s_uid = uids[0]

In [11]:
# Print the value returned by gen_user_prediction (an accuracy score).
print (gen_user_prediction(1))

0.8333333333333334


What individual factors are we going to use as the basis of our predictor?
* aisle_id
* department_id
* add_to_cart_order
* ordered
* reordered
* order_dow
* order_hour_of_day
* days_since_prior_order


In [13]:
df_orders

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
6,550135,1,prior,7,1,9,20.0
7,3108588,1,prior,8,1,14,14.0
8,2295261,1,prior,9,1,16,0.0
9,2550362,1,prior,10,4,8,30.0


In [14]:
df_prod_p

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0
5,2,17794,6,1
6,2,40141,7,1
7,2,1819,8,1
8,2,43668,9,0
9,3,33754,1,1


In [16]:
# Additional lookup tables read from the same dataset directory:
#   df_aisles - contents of aisles.csv
#   df_depart - contents of departments.csv
#   df_prods  - contents of products.csv
df_aisles = pd.read_csv('../instacart-market-basket-analysis/aisles.csv')
df_depart = pd.read_csv('../instacart-market-basket-analysis/departments.csv')
df_prods  = pd.read_csv('../instacart-market-basket-analysis/products.csv')

In [17]:
df_aisles

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation
5,6,other
6,7,packaged meat
7,8,bakery desserts
8,9,pasta sauce
9,10,kitchen supplies


In [18]:
df_prods

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13
5,6,Dry Nose Oil,11,11
6,7,Pure Coconut Water With Orange,98,7
7,8,Cut Russet Potatoes Steam N' Mash,116,1
8,9,Light Strawberry Blueberry Yogurt,120,16
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7


In [19]:
def df_user_prod_metdata(pids):
    """Collect the rows of the global ``df_prods`` for the given products.

    Args:
        pids: iterable of product ids. The returned rows are grouped in the
            order in which the ids are iterated.

    Returns:
        pandas.DataFrame: the ``df_prods`` rows whose ``product_id`` matches
        each id in turn, or an empty DataFrame when ``pids`` is empty.
    """
    # Iterate the pids argument: the previous body referenced an undefined
    # name (oids) and raised NameError on every call.
    # Concatenate once instead of DataFrame.append, which pandas 2.0 removed.
    per_product = [df_prods.loc[df_prods['product_id'] == pid] for pid in pids]
    if not per_product:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()
    return pd.concat(per_product)