In [3]:
import pandas as pd
import numpy as np

# Load required libraries
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
#Main Dataframes
df_orders = pd.read_csv('../instacart-market-basket-analysis/orders.csv')
df_prod_p = pd.read_csv('../instacart-market-basket-analysis/order_products__prior.csv')

In [5]:
def get_uids():
    return list(df_orders['user_id'].drop_duplicates())

uids = get_uids()

In [6]:
print('Total number of users = {}\n'.format(len(uids)))
print('The first ten user ids (uid) = {}\n'.format(uids[:10]))

Total number of users = 206209

The first ten user ids (uid) = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]



In [7]:
def get_chrono_oids_by_uid(uid, inc_train = False):
    if inc_train:
        associated_oid = list(df_orders.loc[(df_orders['user_id'] == uid)]['order_id'])
    else:
        associated_oid = list(df_orders.loc[(df_orders['user_id'] == uid) & (df_orders['eval_set'] == 'prior')]['order_id'])
    return list(associated_oid)

In [8]:
s_uid = uids[0]
s_oids = get_chrono_oids_by_uid(s_uid, True)
print("User {} has placed {} orders with the following oids:\n\t {}".format(s_uid,len(s_oids), s_oids))

User 1 has placed 11 orders with the following oids:
	 [2539329, 2398795, 473747, 2254736, 431534, 3367565, 550135, 3108588, 2295261, 2550362, 1187899]


In [9]:
def df_orders_by_uid(uid):
    return df_orders.loc[(df_orders['user_id'] == uid)]

In [29]:
sdf_orders = df_orders_by_uid(s_uid)
sdf_orders

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
6,550135,1,prior,7,1,9,20.0
7,3108588,1,prior,8,1,14,14.0
8,2295261,1,prior,9,1,16,0.0
9,2550362,1,prior,10,4,8,30.0


In [44]:
xdf = sdf_orders.loc[sdf_orders['order_id'] == 2539329]
xdf

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,


In [59]:
def get_ord_metdata(udf):
    ord_metdat = dict()
    for oid in udf.drop(['user_id', 'eval_set'], axis=1).fillna(-1).values:
        ord_metdat[oid[0]] = oid[1:]
    return ord_metdat

In [65]:
def df_prod_by_uid(uid, oids):
    associated_oid = oids
    master = pd.DataFrame()
    for oid in associated_oid:
        master = master.append(df_prod_p.loc[df_prod_p['order_id'] == oid])
    return master

In [66]:
sdf_prod = df_prod_by_uid(s_uid, s_oids)
sdf_prod

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
24076664,2539329,196,1,0
24076665,2539329,14084,2,0
24076666,2539329,12427,3,0
24076667,2539329,26088,4,0
24076668,2539329,26405,5,0
22742744,2398795,196,1,1
22742745,2398795,10258,2,0
22742746,2398795,12427,3,1
22742747,2398795,13176,4,0
22742748,2398795,26088,5,1


In [13]:
def prod_info_from_df(sdf_prod):
    prod_set      = set()
    ord_prod_dict = dict()
    for i, rows in sdf_prod.iterrows():
        oid  = sdf_prod.loc[i,'order_id']
        pid  = sdf_prod.loc[i,'product_id']
        prod_set.add(pid)
        if ord_prod_dict.get(oid) == None:
            ord_prod_dict[oid] = [pid]
        else:
            ord_prod_dict[oid] = ord_prod_dict.get(oid) + [pid]
    return (prod_set, ord_prod_dict)

In [14]:
def df_from_sprod_info(sprod_info):
    key = sprod_info[0]
    val = [0 for x in range(len(key))]
    d = list()
    for order in sprod_info[1].keys():
        o_dict = dict(zip(key,val))
        pids = sprod_info[1][order]
        for pid in pids:
            o_dict[pid] = 1
        d.append(o_dict)
    p_info = pd.DataFrame(data = d, index = sprod_info[1].keys())
    p_info.index.name = "order_id"
    return p_info
            
        

In [15]:
sprod_info = prod_info_from_df(sdf_prod)
print("User {} has ordered each of the following {} products at least once:\n{}".format(s_uid, len(sprod_info[0]), sprod_info[0]))

User 1 has ordered each of the following 18 products at least once:
{17122, 196, 26405, 14084, 46149, 26088, 13032, 39657, 12427, 25133, 35951, 38928, 10258, 30450, 49235, 10326, 13176, 41787}


In [16]:
ordered_df = df_from_sprod_info(sprod_info)
ordered_df

Unnamed: 0_level_0,196,10258,10326,12427,13032,13176,14084,17122,25133,26088,26405,30450,35951,38928,39657,41787,46149,49235
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2539329,1,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0
2398795,1,1,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0
473747,1,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0
2254736,1,1,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0
431534,1,1,1,1,0,1,0,1,1,0,0,0,0,0,0,1,0,0
3367565,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
550135,1,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0
3108588,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1
2295261,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1
2550362,1,1,0,1,1,0,0,0,1,0,0,0,1,1,1,0,1,0


In [17]:
list(ordered_df.columns)
ordered_df.iloc[0,0:2]

196      1
10258    0
Name: 2539329, dtype: int64

In [18]:
def gen_training_data(df):
    s_pids = list(df.columns)
    features = list()
    labels = list()
    for i in range(len(s_pids)):
        feat = list(df[s_pids[i]])
        features.append(feat[0:-1])
        labels.append(feat[-1])
    return features, labels, s_pids
        
    
    

In [19]:
X, y, pids = gen_training_data(ordered_df)

In [22]:
print(pids)

[196, 10258, 10326, 12427, 13032, 13176, 14084, 17122, 25133, 26088, 26405, 30450, 35951, 38928, 39657, 41787, 46149, 49235]


In [60]:
# Split the data into 70% training data and 30% test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01)

# Create a perceptron object with the parameters: 40 iterations (epochs) over the data, and a learning rate of 0.1
ppn = Perceptron(max_iter=40, eta0=0.1, random_state=0)

# Train the perceptron
ppn.fit(X_train, y_train)

# Apply the trained perceptron on the X data to make predicts for the y test data
y_pred = ppn.predict(X_test)

# View the predicted y test data
y_pred

# View the true y test data
y_test

# View the accuracy of the model, which is: 1 - (observations predicted wrong / total observations)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

Accuracy: 0.00


In [62]:
df_aisles = pd.read_csv('../instacart-market-basket-analysis/aisles.csv')
df_depart = pd.read_csv('../instacart-market-basket-analysis/departments.csv')
df_prods  = pd.read_csv('../instacart-market-basket-analysis/products.csv')

In [78]:
#New Function Definitions for Week 9
def get_ord_metdata(udf):
    ord_metdat = dict()
    for oid in udf.drop(['user_id', 'eval_set'], axis=1).fillna(-1).values:
        ord_metdat[oid[0]] = oid[1:]
    return ord_metdat

def df_user_prod(pids):
    associated_pid = pids
    master = pd.DataFrame()
    for pid in associated_pid:
        master = master.append(df_prods.loc[df_prods['product_id'] == pid])
    return master

def get_pid_metdata(udf):
    pid_metdat = dict()
    for pid in udf.drop(['product_name'], axis=1).fillna(-1).values:
        pid_metdat[pid[0]] = pid[1:]
    return pid_metdat

def get_xids_by_uid(uid):
    oids = get_chrono_oids_by_uid(s_uid, True)
    df_op = df_prod_by_uid(uid, oids)
    pids = list(df_op['product_id'].unique())
    return oids, pids

In [81]:
o , p = get_xids_by_uid(0)
df_user_prod(p)

Unnamed: 0,product_id,product_name,aisle_id,department_id
195,196,Soda,77,7
14083,14084,Organic Unsweetened Vanilla Almond Milk,91,16
12426,12427,Original Beef Jerky,23,19
26087,26088,Aged White Cheddar Popcorn,23,19
26404,26405,XL Pick-A-Size Paper Towel Rolls,54,17
10257,10258,Pistachios,117,19
13175,13176,Bag of Organic Bananas,24,4
13031,13032,Cinnamon Toast Crunch,121,14
25132,25133,Organic String Cheese,21,16
30449,30450,Creamy Almond Butter,88,13
