In [1]:
%matplotlib inline 

# Regression
***

## Table of Contents
***
* [Aim](#aim)
* [Code Setup](#code-setup)
* [Data Import](#data-import)

## Aim <a class="anchor" id="aim"></a>
***

The aim of this notebook is to perform regression on the Instacart data; the target column built below is `days_to_next_order`.

## Code Setup <a class="anchor" id="code-setup"></a>
***

### Constants

In [2]:
# Directory prefix that read_csv_file() prepends to every file name below
# (plain string concatenation, so the trailing slash is required).
instacart_dir = "instacart_2017_05_01/"
# Orders table; the train/test splits are built from this file.
my_orders_file = "my_orders.csv"
# Product table, merged onto the order lines via product_id.
products_file = "products.csv"
# Aisle table, merged via aisle_id to roll products up by aisle.
aisles_file = "aisles.csv"
# Not read anywhere in the visible part of this notebook.
departments_file = "departments.csv"
# Order-line files; both are loaded and concatenated into a single frame.
orders_products_prior_file = "my_order_products__prior.csv"
orders_products_train_file = "my_order_products__train.csv"

### Import Libraries

In [3]:
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 6]
import seaborn as sns
sns.set_style("darkgrid")
sns.set_context("paper")
from sklearn.model_selection import train_test_split

In [4]:
def read_csv_file(file_name):
    """Load one CSV file from the Instacart data directory.

    Parameters
    ----------
    file_name : str
        Name of the file, relative to ``instacart_dir``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    # instacart_dir already ends with a path separator, so plain concatenation is enough
    csv_path = instacart_dir + file_name
    frame = pd.read_csv(csv_path)
    return frame

In [5]:
def replace_ws(the_string):
    """Return ``the_string`` converted to ``str`` with every space replaced by ``_``.

    Only the space character is replaced; tabs and newlines are left untouched.
    """
    text = str(the_string)
    # Splitting on a single space and re-joining swaps each space one-for-one
    pieces = text.split(" ")
    return "_".join(pieces)

In [6]:
def add_days_to_next_order(order_df):
    """Return a copy of ``order_df`` with a ``days_to_next_order`` column added.

    For every order the "next order" is the row with the same ``user_id`` and
    ``order_number + 1``. When exactly one such row exists and its
    ``days_since_prior_order`` is finite, that value becomes this order's
    ``days_to_next_order``; otherwise the value is NaN.

    Parameters
    ----------
    order_df : pandas.DataFrame
        Must contain ``user_id``, ``order_number`` and
        ``days_since_prior_order`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``order_df`` (same rows, same index) plus the float column
        ``days_to_next_order``.
    """
    orders = order_df.copy()
    key_columns = ["user_id", "order_number"]

    # Rows that can act as somebody's "next order". Missing keys can never be
    # matched, and keys that occur more than once are ambiguous, so both are dropped.
    candidates = orders[key_columns + ["days_since_prior_order"]].dropna(subset=key_columns)
    unambiguous = candidates[~candidates.duplicated(subset=key_columns, keep=False)]

    # Shift each candidate back by one order so it lines up with its predecessor.
    next_order_lookup = (
        unambiguous
        .assign(order_number=unambiguous["order_number"] - 1)
        .rename(columns={"days_since_prior_order": "days_to_next_order"})
    )

    # A single left merge replaces the per-row query over the whole frame.
    # The lookup keys are unique, so the result has one row per order, in order.
    matched = orders[key_columns].merge(next_order_lookup, on=key_columns, how="left")
    days = matched["days_to_next_order"].to_numpy(dtype=float)

    # Assign positionally so repeated index labels cannot touch the wrong rows;
    # non-finite gaps (NaN / inf) are reported as NaN.
    orders["days_to_next_order"] = np.where(np.isfinite(days), days, np.nan)
    return orders

In [7]:
def add_each_aisle_to_df(df, aisles_df=None):
    """Attach the aisle name to every row and add one zero-filled column per aisle.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``aisle_id`` column. It is not modified.
    aisles_df : pandas.DataFrame, optional
        Aisle lookup table with ``aisle_id`` and ``aisle`` columns. Defaults to
        the notebook-level ``aisles`` frame, which must already be loaded when
        the function is called without this argument.

    Returns
    -------
    pandas.DataFrame
        ``df`` inner-merged with the aisle table on ``aisle_id``, plus one
        column per aisle (named via ``replace_ws``) initialised to 0.
    """
    if aisles_df is None:
        # Backward-compatible fallback to the global frame loaded later in the notebook
        aisles_df = aisles
    # merge() already returns a new frame, so no defensive copy of df is needed
    df_aisles = df.merge(aisles_df, on="aisle_id")
    for aisle in aisles_df["aisle"]:
        aisle_name = replace_ws(aisle)
        df_aisles[aisle_name] = 0
    return df_aisles

In [8]:
def group_aisle_in_df(df_aisles):
    """Collapse the order lines into one row per order with aisle presence flags.

    For each ``order_id`` (in order of first appearance) the first row of that
    order is kept, and the column of every aisle that appears anywhere in the
    order is set to 1. The ``aisle`` and ``aisle_id`` columns are then dropped.

    Parameters
    ----------
    df_aisles : pandas.DataFrame
        Frame as produced by ``add_each_aisle_to_df``: it needs ``order_id``,
        ``aisle`` and ``aisle_id`` columns plus one column per aisle name.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per order, carrying the index label of that order's first row.
    """
    aggregated_rows = []
    # sort=False keeps the orders in order of first appearance
    for _, order_rows in df_aisles.groupby("order_id", sort=False):
        # Work on a copy so the frame being iterated over is never modified
        first_row = order_rows.head(1).copy()
        for aisle in order_rows["aisle"].unique():
            # If the aisle is present in this order, set its flag to 1
            first_row[replace_ws(aisle)] = 1
        aggregated_rows.append(first_row)

    if aggregated_rows:
        # Concatenate once instead of growing the frame inside the loop
        df = pd.concat(aggregated_rows)
    else:
        # No orders: keep the column layout but no rows
        df = df_aisles.iloc[0:0].copy()
    return df.drop(columns=["aisle", "aisle_id"])

# Note: use the aisle as the product roll-up, and use regularisation / PCA reduction to remove aisles that are strongly correlated with each other. This should be enough.

### Import Data <a class="anchor" id="data-import"></a>

#### Test and training split for all users

In [9]:
test_percentage = 0.2
# Fixed seed so the split is identical on every Restart & Run All
split_random_state = 42
orders = read_csv_file(my_orders_file)
orders = add_days_to_next_order(orders)
# Orders with a known gap to the following order have a regression target
has_next_order = orders['days_to_next_order'].notna()
# Can use this to estimate what would be ordered
orders_with_no_next_order = orders[~has_next_order]
# Can split this into train and test
orders_with_next_order = orders[has_next_order]
train_orders, test_orders = train_test_split(
    orders_with_next_order,
    test_size=test_percentage,
    random_state=split_random_state,
)
print("Training size is :", train_orders.shape[0])
print("Testing size is :", test_orders.shape[0])

Training size is : 228
Testing size is : 58


#### Test and training split for specific user

In [10]:
# Number of orders per user, largest count first
count_orders = (orders
                .groupby("user_id")
                .size()
                .reset_index(name="count")
                .sort_values(by=['count'], ascending=False)
               )
# Get the most frequent user as this will have the most data associated with it
most_frequeny_user_id =  count_orders.iloc[0].user_id
# NOTE(review): this selects from `orders`, so rows whose days_to_next_order is NaN
# are still included in the split - confirm whether `orders_with_next_order` was intended.
single_user_orders = orders[orders["user_id"] == most_frequeny_user_id]
# Fixed seed so the split is identical on every Restart & Run All
user_train_orders, user_test_orders = train_test_split(
    single_user_orders,
    test_size=test_percentage,
    random_state=42,
)
print("User id for single investigation :", most_frequeny_user_id)
print("Training size for single user is :", user_train_orders.shape[0])
print("Testing size for single user is is :", user_test_orders.shape[0])

User id for single investigation : 47562
Training size for single user is : 71
Testing size for single user is is : 18


#### Import the Other Data

In [11]:
# Lookup tables: products (carries aisle_id) and aisles (aisle_id -> aisle name)
products, aisles = (
    read_csv_file(csv_name) for csv_name in (products_file, aisles_file)
)
# Order lines come in two files; load both, then stack them into one frame
orders_products_prior, orders_products_train = (
    read_csv_file(csv_name)
    for csv_name in (orders_products_prior_file, orders_products_train_file)
)
order_line_frames = [orders_products_prior, orders_products_train]
orders_products = pd.concat(order_line_frames)

## Start

In [12]:
# Training orders -> their order lines -> the product details of each line
train_orders_prod = (
    train_orders
    .merge(orders_products, on="order_id")
    .merge(products, on="product_id")
)
train_orders_prod.head()

Unnamed: 0,Unnamed: 0_x,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,days_to_next_order,Unnamed: 0_y,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,790836,2581720,47562,prior,39,4,8,2.0,4.0,24479261,37203,1,0,Flexible Straws,111,17
1,790836,2581720,47562,prior,39,4,8,2.0,4.0,24479262,40174,2,1,2% Reduced Fat Organic Milk,84,16
2,790839,3024335,47562,prior,42,3,8,2.0,5.0,28676712,40174,9,1,2% Reduced Fat Organic Milk,84,16
3,790816,2606575,47562,prior,19,1,8,5.0,2.0,24715429,40174,1,1,2% Reduced Fat Organic Milk,84,16
4,790800,213516,47562,prior,3,4,7,2.0,5.0,2024963,40174,2,1,2% Reduced Fat Organic Milk,84,16


In [13]:
# NOW WE WANT TO ROLL UP BY AISLE

In [14]:
# Keep only the order/user keys, the regression target and the aisle key
wanted_columns = ["order_id", "user_id", "days_to_next_order", "aisle_id"]
wanted_info = train_orders_prod.loc[:, wanted_columns]

In [15]:
# Attach the aisle name to each order line and add one zero-filled column per aisle
df_with_aisle_info = add_each_aisle_to_df(wanted_info)
# Preview the lines of the highest order ids
df_with_aisle_info.sort_values(by="order_id", ascending=False).head(5)

Unnamed: 0,order_id,user_id,days_to_next_order,aisle_id,aisle,prepared_soups_salads,specialty_cheeses,energy_granola_bars,instant_foods,marinades_meat_preparation,...,trail_mix_snack_mix,feminine_care,body_lotions_soap,tortillas_flat_bread,frozen_appetizers_sides,hot_cereal_pancake_mixes,dry_pasta,beauty,muscles_joints_pain_relief,specialty_wines_champagnes
761,3411808,70894,8.0,83,fresh vegetables,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
816,3411808,70894,8.0,83,fresh vegetables,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1318,3411808,70894,8.0,49,packaged poultry,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
293,3411808,70894,8.0,24,fresh fruits,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1489,3411808,70894,8.0,108,other creams cheeses,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Collapse the order lines to one row per order with a 0/1 flag per aisle
df_aisle_group = group_aisle_in_df(df_with_aisle_info)
df_aisle_group.head(n=5)

Unnamed: 0,order_id,user_id,days_to_next_order,prepared_soups_salads,specialty_cheeses,energy_granola_bars,instant_foods,marinades_meat_preparation,other,packaged_meat,...,trail_mix_snack_mix,feminine_care,body_lotions_soap,tortillas_flat_bread,frozen_appetizers_sides,hot_cereal_pancake_mixes,dry_pasta,beauty,muscles_joints_pain_relief,specialty_wines_champagnes
0,2581720,47562,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,291531,83908,9.0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
3,3024335,47562,5.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2606575,47562,2.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,213516,47562,5.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# NOW WE WANT TO USE PCA/REGULARISATION AND REMOVE SOME UNNEEDED INFO/AISLES