In [4]:
# Load required libraries
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [5]:
# Global DataFrames loaded from the original Instacart CSVs.
# Paths are relative to the notebook's working directory; every helper below
# reads these module-level frames directly instead of taking them as arguments.
# Read below via the user_id, order_id and eval_set columns.
df_orders = pd.read_csv('../instacart-market-basket-analysis/orders.csv')
# Read below via the order_id and product_id columns.
df_prior = pd.read_csv('../instacart-market-basket-analysis/order_products__prior.csv')
# Loaded but not referenced by the functions visible in this notebook.
df_aisles = pd.read_csv('../instacart-market-basket-analysis/aisles.csv')
# Loaded but not referenced by the functions visible in this notebook.
df_depart = pd.read_csv('../instacart-market-basket-analysis/departments.csv')
# Read below via product_id; product_name is dropped before metadata is built.
df_prods  = pd.read_csv('../instacart-market-basket-analysis/products.csv')

In [241]:
# Basic Functions 
def get_uids():
    """Return the distinct user ids found in ``df_orders``.

    Ids are listed in order of first appearance in the frame.
    """
    user_ids = df_orders['user_id']
    first_seen = ~user_ids.duplicated()
    return list(user_ids[first_seen])

# User-id Specific Functions
def get_xids_by_uid(uid):
    """Return ``(order_ids, product_ids)`` for one user.

    ``order_ids`` covers every eval_set for the user (the lookup is made with
    ``inc_train=True``). ``product_ids`` are the distinct product ids found in
    the prior-order rows of those orders, in order of first appearance.
    """
    order_ids = get_oids_by_uid(uid, True)
    user_prior_rows = df_prior_by_uid(uid, order_ids)
    distinct_products = user_prior_rows['product_id'].unique()
    return order_ids, list(distinct_products)

def get_oids_by_uid(uid, inc_train = False):
    """Return the order ids that belong to ``uid`` as a list.

    Parameters:
        uid: user id matched against ``df_orders['user_id']``.
        inc_train: when truthy, orders from every ``eval_set`` are returned;
            otherwise only rows whose ``eval_set`` equals ``'prior'``.
    """
    wanted = df_orders['user_id'] == uid
    if not inc_train:
        # Narrow the selection to the user's 'prior' orders only.
        wanted = wanted & (df_orders['eval_set'] == 'prior')
    return list(df_orders.loc[wanted]['order_id'])

# Meta-Data Dictionaries from User-Specific Data Frames
def get_pid_metadata(udf):
    """Map each row's first remaining column to the rest of that row.

    ``product_name`` is dropped and missing values become -1 before the rows
    are read. The first remaining column is used as the key and the other
    values of the row (a NumPy array slice) as the value.
    """
    rows = udf.drop(['product_name'], axis=1).fillna(-1).values
    return {row[0]: row[1:] for row in rows}

def get_ord_metadata(udf):
    """Map each row's first remaining column to the rest of that row.

    ``user_id`` and ``eval_set`` are dropped and missing values become -1
    before the rows are read. The first remaining column is used as the key
    and the other values of the row (a NumPy array slice) as the value.
    """
    rows = udf.drop(['user_id', 'eval_set'], axis=1).fillna(-1).values
    return {row[0]: row[1:] for row in rows}

def ord_prod_metadata(udf_prior):
    """Group product ids by order id.

    Parameters:
        udf_prior: DataFrame with ``order_id`` and ``product_id`` columns.
            An empty frame (with or without columns) is accepted.

    Returns:
        dict mapping each order id to the list of product ids on its rows.
        Orders appear in order of first appearance and products keep row order.
    """
    ord_prod_dict = dict()
    if udf_prior.empty:
        return ord_prod_dict

    # Walk the two columns positionally instead of looking rows up by index
    # label: a frame stitched together from several slices may carry duplicate
    # labels, and a label lookup would then return a Series rather than a scalar.
    oids = udf_prior['order_id'].values
    pids = udf_prior['product_id'].values
    for oid, pid in zip(oids, pids):
        ord_prod_dict.setdefault(oid, []).append(pid)
    return ord_prod_dict

# User-Specific Dataframes
def df_orders_by_uid(uid):
    """Return the rows of ``df_orders`` whose ``user_id`` equals ``uid``."""
    belongs_to_user = df_orders['user_id'] == uid
    return df_orders.loc[belongs_to_user]

def df_prods_by_uid(uid, pids = list()):
    """Return the ``df_prods`` rows for a set of product ids.

    Parameters:
        uid: user id; only used to look the product ids up when ``pids`` is empty.
        pids: product ids to select. When empty, the user's product ids are
            taken from ``get_xids_by_uid(uid)``. The default list is never mutated.

    Returns:
        DataFrame whose rows follow the order of ``pids`` (a repeated id repeats
        its rows). If nothing is requested, an empty frame that still carries
        the ``df_prods`` columns is returned.
    """
    if len(pids) == 0:
        pids = get_xids_by_uid(uid)[1]

    # One pass over the full table narrows it to the candidate rows; the
    # per-id selections below then only touch that small subset.
    candidates = df_prods.loc[df_prods['product_id'].isin(pids)]
    pieces = [candidates.loc[candidates['product_id'] == pid] for pid in pids]
    if not pieces:
        # pd.concat rejects an empty list; hand back the empty, column-bearing frame.
        return candidates
    # A single concat replaces repeated DataFrame.append calls, which copied
    # the accumulated frame on every iteration and no longer exist in pandas 2.x.
    return pd.concat(pieces)

def df_prior_by_uid(uid, oids = list()):
    """Return the ``df_prior`` rows for a set of order ids.

    Parameters:
        uid: user id; only used to look the order ids up when ``oids`` is empty.
        oids: order ids to select. When empty, all of the user's order ids are
            fetched with ``get_oids_by_uid(uid, True)``. The default list is
            never mutated.

    Returns:
        DataFrame whose rows follow the order of ``oids`` (a repeated id repeats
        its rows). If the user has no orders, an empty frame that still carries
        the ``df_prior`` columns is returned.
    """
    if len(oids) == 0:
        # Ask for the order ids directly. Going through get_xids_by_uid would
        # call back into this function, which never terminates for a user
        # without orders and otherwise builds the prior frame twice.
        oids = get_oids_by_uid(uid, True)

    # One pass over the full table narrows it to the candidate rows; the
    # per-order selections below then only touch that small subset.
    candidates = df_prior.loc[df_prior['order_id'].isin(oids)]
    pieces = [candidates.loc[candidates['order_id'] == oid] for oid in oids]
    if not pieces:
        # pd.concat rejects an empty list; hand back the empty, column-bearing frame.
        return candidates
    # A single concat replaces repeated DataFrame.append calls, which copied
    # the accumulated frame on every iteration and no longer exist in pandas 2.x.
    return pd.concat(pieces)

#User-Specific Feature Vector Dataframes
def df_oid_fv_by_uid(uid, pids=list(),opd = dict()):
    """Build the order-by-product membership matrix for a user.

    Parameters:
        uid: user id, used to look up ``pids`` / ``opd`` when they are empty.
        pids: product ids that form the columns; fetched via
            ``get_xids_by_uid`` when empty.
        opd: mapping of order id -> product ids in that order; built from the
            user's prior rows when empty.

    Returns:
        DataFrame indexed by order id (index name ``"order_id"``) with one
        column per product id: 1 where the product is in the order, else -1.
    """
    if len(pids) == 0:
        pids = get_xids_by_uid(uid)[1]
    if len(opd.keys()) == 0:
        opd = ord_prod_metadata(df_prior_by_uid(uid))

    rows = []
    for ordered_pids in opd.values():
        # Start every product at -1, then flag the ones present in this order.
        row = dict.fromkeys(pids, -1)
        row.update(dict.fromkeys(ordered_pids, 1))
        rows.append(row)

    membership = pd.DataFrame(data=rows, index=opd.keys())
    membership.index.name = "order_id"
    return membership

def get_feat_index(feat_str):
    """Return the position of a feature inside its metadata array.

    The names 'num', 'dow', 'hod', 'dspo' map to 0..3 and 'aisle', 'dep' map
    to 0..1. Any other name yields ``None``.
    """
    positions = {}
    for group in (('num', 'dow', 'hod', 'dspo'), ('aisle', 'dep')):
        # Each group is numbered from zero independently of the other.
        positions.update((name, pos) for pos, name in enumerate(group))
    return positions.get(feat_str)

def get_feat_dict(uid, feat_str):
    """Return the metadata dictionary that holds ``feat_str`` for a user.

    For 'num', 'dow', 'hod' and 'dspo' this is the order metadata built from
    the user's orders; for 'aisle' and 'dep' it is the product metadata built
    from the user's products. Any other name yields ``None``.
    """
    order_level = ('num', 'dow', 'hod', 'dspo')
    product_level = ('aisle', 'dep')
    if feat_str in order_level:
        return get_ord_metadata(df_orders_by_uid(uid))
    if feat_str in product_level:
        return get_pid_metadata(df_prods_by_uid(uid))
    return None

def df_oxx_fv_by_uid(uid, feat,pids=list(),opd = dict()):
    """Build an order-by-product matrix filled with one metadata feature.

    Parameters:
        uid: user id, used to look up the metadata and, when they are empty,
            ``pids`` / ``opd``.
        feat: feature name; one of 'num', 'dow', 'hod', 'dspo' (read from the
            order metadata) or 'aisle', 'dep' (read from the product metadata).
        pids: product ids that form the columns; fetched via
            ``get_xids_by_uid`` when empty.
        opd: mapping of order id -> product ids in that order; built from the
            user's prior rows when empty.

    Returns:
        DataFrame with one row per order, labelled ``"<order_id>_<feat>"``
        (index name ``"order_<feat>"``), and one column per product id. A cell
        holds the feature value where the product is in the order, else -1.

    Raises:
        ValueError: if ``feat`` is not a known feature name.
    """
    index = get_feat_index(feat)
    if index is None:
        raise ValueError("unknown feature name: %r" % (feat,))
    if len(pids) == 0:
        pids = get_xids_by_uid(uid)[1]
    if len(opd.keys()) == 0:
        opd = ord_prod_metadata(df_prior_by_uid(uid))

    # 'aisle' and 'dep' come from the product metadata, which is keyed by
    # product id; every other feature comes from the order metadata, which is
    # keyed by order id. Using the order id for both raised a KeyError.
    keyed_by_product = feat in ('aisle', 'dep')
    omd = get_feat_dict(uid, feat)

    d = list()
    xxs = list()
    for order, ordered_pids in opd.items():
        o_dict = dict.fromkeys(pids, -1)
        for pid in ordered_pids:
            lookup = pid if keyed_by_product else order
            o_dict[pid] = omd[lookup][index]
        d.append(o_dict)
        xxs.append(str(order) + "_" + feat)

    p_info = pd.DataFrame(data = d, index = xxs)
    p_info.index.name = "order_"+feat
    return p_info

def df_onum_fv_by_uid(uid, pids=list(),opd = dict()):
    """Return the 'num' feature matrix for a user.

    ``pids`` and ``opd`` are forwarded to ``df_oxx_fv_by_uid`` so precomputed
    values are reused; when left empty they are looked up from ``uid``.
    """
    return df_oxx_fv_by_uid(uid, 'num', pids, opd)

def df_odow_fv_by_uid(uid, pids=list(),opd = dict()):
    """Return the 'dow' feature matrix for a user.

    ``pids`` and ``opd`` are forwarded to ``df_oxx_fv_by_uid`` so precomputed
    values are reused; when left empty they are looked up from ``uid``.
    """
    return df_oxx_fv_by_uid(uid, 'dow', pids, opd)

def df_ohod_fv_by_uid(uid, pids=list(),opd = dict()):
    """Return the 'hod' feature matrix for a user.

    ``pids`` and ``opd`` are forwarded to ``df_oxx_fv_by_uid`` so precomputed
    values are reused; when left empty they are looked up from ``uid``.
    """
    return df_oxx_fv_by_uid(uid, 'hod', pids, opd)

def df_odspo_fv_by_uid(uid, pids=list(),opd = dict()):
    """Return the 'dspo' feature matrix for a user.

    ``pids`` and ``opd`` are forwarded to ``df_oxx_fv_by_uid`` so precomputed
    values are reused; when left empty they are looked up from ``uid``.
    """
    return df_oxx_fv_by_uid(uid, 'dspo', pids, opd)

def df_paisle_fv_by_uid(uid):
    """Return a one-row frame (``p_aisle``) with one column per product of ``uid``.

    The product metadata is laid out as two rows labelled ``p_aisle`` and
    ``p_dep``; the ``p_dep`` row is dropped before returning.
    """
    # Use the requested user's products; the lookup was previously pinned to user 1.
    df_fv = pd.DataFrame(get_pid_metadata(df_prods_by_uid(uid)), index=['p_aisle', 'p_dep'])
    return df_fv.drop(['p_dep'])

def df_pdep_fv_by_uid(uid):
    """Return a one-row frame (``p_dep``) with one column per product of ``uid``.

    The product metadata is laid out as two rows labelled ``p_aisle`` and
    ``p_dep``; the ``p_aisle`` row is dropped before returning.
    """
    # Use the requested user's products; the lookup was previously pinned to user 1.
    df_fv = pd.DataFrame(get_pid_metadata(df_prods_by_uid(uid)), index=['p_aisle', 'p_dep'])
    return df_fv.drop(['p_aisle'])

In [183]:
# 'dow' feature matrix for user 1: one row per order, one column per product id;
# -1 marks a product that is not part of that order.
df_odow_fv_by_uid(1)

Unnamed: 0_level_0,196,10258,10326,12427,13032,13176,14084,17122,25133,26088,26405,30450,35951,38928,39657,41787,46149,49235
order_dow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2539329_dow,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2398795_dow,3.0,3.0,-1.0,3.0,3.0,3.0,-1.0,-1.0,-1.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
473747_dow,3.0,3.0,-1.0,3.0,-1.0,-1.0,-1.0,-1.0,3.0,-1.0,-1.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2254736_dow,4.0,4.0,-1.0,4.0,-1.0,-1.0,-1.0,-1.0,4.0,-1.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
431534_dow,4.0,4.0,4.0,4.0,-1.0,4.0,-1.0,4.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,4.0,-1.0,-1.0
3367565_dow,2.0,2.0,-1.0,2.0,-1.0,-1.0,-1.0,-1.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
550135_dow,1.0,1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3108588_dow,1.0,1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0
2295261_dow,1.0,1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0
2550362_dow,4.0,4.0,-1.0,4.0,4.0,-1.0,-1.0,-1.0,4.0,-1.0,-1.0,-1.0,4.0,4.0,4.0,-1.0,4.0,-1.0


In [167]:
# 'dspo' feature matrix for user 1, same layout as the other order-feature matrices.
# Missing metadata values are stored as -1, the same value used for "product not in order".
df_oxx_fv_by_uid(1, 'dspo')

Unnamed: 0_level_0,196,10258,10326,12427,13032,13176,14084,17122,25133,26088,26405,30450,35951,38928,39657,41787,46149,49235
order_dspo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2539329_dspo,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2398795_dspo,15.0,15.0,-1.0,15.0,15.0,15.0,-1.0,-1.0,-1.0,15.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
473747_dspo,21.0,21.0,-1.0,21.0,-1.0,-1.0,-1.0,-1.0,21.0,-1.0,-1.0,21.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2254736_dspo,29.0,29.0,-1.0,29.0,-1.0,-1.0,-1.0,-1.0,29.0,-1.0,29.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
431534_dspo,28.0,28.0,28.0,28.0,-1.0,28.0,-1.0,28.0,28.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,28.0,-1.0,-1.0
3367565_dspo,19.0,19.0,-1.0,19.0,-1.0,-1.0,-1.0,-1.0,19.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
550135_dspo,20.0,20.0,-1.0,20.0,20.0,-1.0,-1.0,-1.0,20.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3108588_dspo,14.0,14.0,-1.0,14.0,-1.0,-1.0,-1.0,-1.0,14.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,14.0,14.0
2295261_dspo,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0
2550362_dspo,30.0,30.0,-1.0,30.0,30.0,-1.0,-1.0,-1.0,30.0,-1.0,-1.0,-1.0,30.0,30.0,30.0,-1.0,30.0,-1.0


In [136]:
# Order metadata for user 1, keyed by order id; each value is the array that
# get_feat_index positions 0..3 ('num', 'dow', 'hod', 'dspo') index into.
get_ord_metadata(df_orders_by_uid(1))

{431534.0: array([ 5.,  4., 15., 28.]),
 473747.0: array([ 3.,  3., 12., 21.]),
 550135.0: array([ 7.,  1.,  9., 20.]),
 1187899.0: array([11.,  4.,  8., 14.]),
 2254736.0: array([ 4.,  4.,  7., 29.]),
 2295261.0: array([ 9.,  1., 16.,  0.]),
 2398795.0: array([ 2.,  3.,  7., 15.]),
 2539329.0: array([ 1.,  2.,  8., -1.]),
 2550362.0: array([10.,  4.,  8., 30.]),
 3108588.0: array([ 8.,  1., 14., 14.]),
 3367565.0: array([ 6.,  2.,  7., 19.])}

In [242]:
def gen_training_data(uid,df):
    """Turn a feature frame into per-product samples and labels.

    Parameters:
        uid: user id, used to build the order-membership matrix.
        df: feature frame with one column per product id.

    Returns:
        ``(features, labels, pids)`` where ``pids`` is the column list of
        ``df``, ``features[i]`` is column ``pids[i]`` of ``df`` without its last
        row, and ``labels[i]`` is the last-row value of the same column in the
        membership matrix from ``df_oid_fv_by_uid(uid)``.
    """
    membership = df_oid_fv_by_uid(uid)
    s_pids = list(df.columns)
    # Every row except the last one forms the sample for a product.
    features = [list(df[pid])[0:-1] for pid in s_pids]
    # The label is the product's flag in the final row of the membership matrix.
    o_labels = [list(membership[pid])[-1] for pid in s_pids]
    return features, o_labels, s_pids
    

def gen_prediction_result(X, y, pids, test_size=0.1, random_state=None):
    """Train a perceptron on a random split and return its test accuracy.

    Parameters:
        X: list of feature vectors, one per sample.
        y: list of labels aligned with ``X``.
        pids: product ids aligned with ``X``; accepted for symmetry with
            ``gen_training_data`` but not used here.
        test_size: share of the samples held out for evaluation. The default
            of 0.1 gives a 90% / 10% train/test split.
        random_state: seed forwarded to ``train_test_split``. The default
            ``None`` keeps the split random on every call; pass an int to make
            the reported accuracy reproducible.

    Returns:
        Accuracy of the fitted model on the held-out samples.
    """
    # Hold out `test_size` of the samples for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Perceptron capped at 40 passes over the data, with a learning rate of 0.1.
    model = Perceptron(max_iter = 40, eta0=0.1, random_state=0)
    model.fit(X_train, y_train)

    # Score the predictions for the held-out samples against their true labels.
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

def gen_user_prediction(uid,feats):
    """Build the requested feature frames for a user and score a perceptron on them.

    Parameters:
        uid: user id the features are built for.
        feats: iterable of feature names. Recognised names are 'ordered',
            'num', 'dow', 'hod', 'dspo', 'aisle' and 'dep'; any other name is
            skipped.

    Returns:
        The accuracy reported by ``gen_prediction_result``.
    """
    # Feature name -> function that builds its frame for a user.
    builders = (
        ('ordered', df_oid_fv_by_uid),
        ('num', df_onum_fv_by_uid),
        ('dow', df_odow_fv_by_uid),
        ('hod', df_ohod_fv_by_uid),
        ('dspo', df_odspo_fv_by_uid),
        ('aisle', df_paisle_fv_by_uid),
        ('dep', df_pdep_fv_by_uid),
    )

    df_feat_vecs = []
    for feat in feats:
        for name, build in builders:
            if feat == name:
                df_feat_vecs.append(build(uid))
                break

    # Stack the frames row-wise so each product column holds all its features.
    X, y, pids = gen_training_data(uid, pd.concat(df_feat_vecs))
    return gen_prediction_result(X, y, pids)

In [218]:
# Perceptron accuracy for user 13 using only the 'num' feature.
# The train/test split is not seeded, so this value can change between runs.
gen_user_prediction(13,['num'])

0.6666666666666666

In [208]:
# Perceptron accuracy for user 1 using only the 'ordered' membership feature.
# The train/test split is not seeded, so this value can change between runs.
gen_user_prediction(1,['ordered'])

1.0

In [209]:
# Inspect the (features, labels, product ids) triple built from user 1's 'dow' matrix:
# each feature vector is one product column without the final order's row.
gen_training_data(1,df_odow_fv_by_uid(1))

([[2.0, 3.0, 3.0, 4.0, 4.0, 2.0, 1.0, 1.0, 1.0],
  [-1.0, 3.0, 3.0, 4.0, 4.0, 2.0, 1.0, 1.0, 1.0],
  [-1.0, -1.0, -1.0, -1.0, 4.0, -1.0, -1.0, -1.0, -1.0],
  [2.0, 3.0, 3.0, 4.0, 4.0, 2.0, 1.0, 1.0, 1.0],
  [-1.0, 3.0, -1.0, -1.0, -1.0, -1.0, 1.0, -1.0, -1.0],
  [-1.0, 3.0, -1.0, -1.0, 4.0, -1.0, -1.0, -1.0, -1.0],
  [2.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0],
  [-1.0, -1.0, -1.0, -1.0, 4.0, -1.0, -1.0, -1.0, -1.0],
  [-1.0, -1.0, 3.0, 4.0, 4.0, 2.0, 1.0, 1.0, 1.0],
  [2.0, 3.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0],
  [2.0, -1.0, -1.0, 4.0, -1.0, -1.0, -1.0, -1.0, -1.0],
  [-1.0, -1.0, 3.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0],
  [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0],
  [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0],
  [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0],
  [-1.0, -1.0, -1.0, -1.0, 4.0, -1.0, -1.0, -1.0, -1.0],
  [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0],
  [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0]],
 [1, 

In [223]:
# Product metadata for user 1 as a frame: one column per product id, one row per
# metadata position (0 and 1, the positions get_feat_index gives 'aisle' and 'dep').
pd.DataFrame(get_pid_metadata(df_prods_by_uid(1)))

Unnamed: 0,196,10258,10326,12427,13032,13176,14084,17122,25133,26088,26405,30450,35951,38928,39657,41787,46149,49235
0,77,117,24,23,121,24,91,24,21,23,54,88,91,120,45,24,77,53
1,7,19,4,19,14,4,16,4,16,19,17,13,16,16,19,4,7,16


In [240]:


# 'p_aisle' row only for user 1: one column per product id.
df_paisle_fv_by_uid(1)

Unnamed: 0,196,10258,10326,12427,13032,13176,14084,17122,25133,26088,26405,30450,35951,38928,39657,41787,46149,49235
p_aisle,77,117,24,23,121,24,91,24,21,23,54,88,91,120,45,24,77,53
