# Table of contents

## 01 Importing libraries

## 02 Importing data

## 03 Checking data consistency
### 03.1 Addressing mixed-type variables
### 03.2 Addressing missing values
### 03.3 Addressing duplicates

## 04 Exporting checked dataframe

## 05 Task for Exercise 4.5: Data consistency checks
### Step 1: Checking df_prods_clean_no_dups
### Step 2: Exploring df_ords
### Step 3: Checking for mixed-type columns in df_ords
### Step 5: Checking for missing values in df_ords
### Step 6: Addressing missing values in df_ords
### Step 7: Checking for duplicate values in df_ords
### Step 9: Exporting dataframes

# 01 Importing libraries

In [1]:
# Importing pandas, NumPy and os
import pandas as pd
import numpy as np
import os

# 02 Importing data

In [2]:
# Defining path variable
# Project root folder; the import and export cells in this notebook build their
# file paths from it with os.path.join.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
path = r'/Users/DanielaDietmayr/Library/CloudStorage/OneDrive-Personal/2023-01-18 Instacart basket analysis'

In [3]:
# Importing products data
# Read the original products CSV from the project folder.
# index_col=False keeps pandas from using the first CSV column as the index.
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', '01 Original data', 'products.csv'),
    index_col=False,
)

In [4]:
# Importing orders data
# Read the already-wrangled orders CSV from the prepared-data folder.
# index_col=False keeps pandas from using the first CSV column as the index.
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', '02 Prepared data', 'orders_wrangled.csv'),
    index_col=False,
)

In [5]:
# Checking products data import
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [6]:
# Checking orders data import
df_ords.head()

Unnamed: 0,order_id,user_id,order_sequence_per_user,order_day_of_the_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


# 03 Checking data consistency

In [7]:
# Exploring df_ords
df_ords.describe()

Unnamed: 0,order_id,user_id,order_sequence_per_user,order_day_of_the_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


## 03.1 Addressing mixed-type variables

In [8]:
# Creating a dataframe
df_test = pd.DataFrame()

In [9]:
# Creating a mixed-type column
# The four values are two strings, an int and a bool, so the column holds
# three different Python types on purpose.
df_test['mix'] = ('a', 'b', 1, True)

In [10]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [11]:
# Checking for mixed-type columns
# A column is reported when its cells hold more than one Python type.
# Series.map(type) is used instead of DataFrame.applymap, which is deprecated
# since pandas 2.1; counting the distinct types also avoids building a
# filtered copy of the dataframe just to test whether it is empty.
for col in df_test.columns.tolist():
    if df_test[col].map(type).nunique() > 1:
        print (col)

mix


In [12]:
# Addressing mixed type columns
# Cast every value in 'mix' to a string so the column holds a single type.
df_test['mix'] = df_test['mix'].astype(str)

In [13]:
# Double-checking for mixed-type columns
# Prints nothing when every column holds a single Python type.
# Series.map(type) is used instead of DataFrame.applymap, which is deprecated
# since pandas 2.1.
for col in df_test.columns.tolist():
    if df_test[col].map(type).nunique() > 1:
        print (col)

## 03.2 Addressing missing values

In [14]:
# Checking for missing values in df_prods
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [15]:
# Creating subset with missing values in product_name
# isnull() already returns a boolean mask, so it is used directly as the row
# filter (no comparison against True needed).
df_nan = df_prods[df_prods['product_name'].isnull()]

In [16]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [17]:
# Counting number of rows before removing missing values
df_prods.shape

(49693, 5)

In [18]:
# Creating new dataframe with missing values removed
# notnull() is the direct complement of isnull(); it keeps only the rows that
# have a product_name.
df_prods_clean = df_prods[df_prods['product_name'].notnull()]

In [19]:
# Counting number of rows in new dataframe
df_prods_clean.shape

(49677, 5)

## 03.3 Addressing duplicates

In [20]:
# Locating duplicate values in df_prods
# duplicated() marks every row that fully repeats an earlier row; selecting
# with that mask collects the repeats for inspection.
df_prods_dups = df_prods.loc[df_prods.duplicated()]

In [21]:
# Printing duplicate values in df_prods
df_prods_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [22]:
# Checking shape of df_prods_clean with duplicates
df_prods_clean.shape

(49677, 5)

In [23]:
# Creating duplicate-free dataframe
# Keep the first occurrence of each fully identical row and drop the repeats.
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep='first')

In [24]:
#Checking shape of duplicate-free dataframe
df_prods_clean_no_dups.shape

(49672, 5)

# 04 Exporting checked dataframe

In [25]:
# Exporting df_prods_clean_no_dups
# Write the cleaned products data to the prepared-data folder without the
# dataframe index.
df_prods_clean_no_dups.to_csv(
    os.path.join(path, '02 Data', '02 Prepared data', 'products_checked.csv'),
    index=False,
)

# 05 Task for Exercise 4.5: Data consistency checks

## Step 1: Checking df_prods_clean_no_dups

In [26]:
# Exploring df_prods_clean_no_dups
df_prods_clean_no_dups.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49672.0,49672.0,49672.0,49672.0
mean,24850.349775,67.762442,11.728942,9.993282
std,14340.705287,38.315784,5.850779,453.615536
min,1.0,1.0,1.0,1.0
25%,12432.75,35.0,7.0,4.1
50%,24850.5,69.0,13.0,7.1
75%,37268.25,100.0,17.0,11.1
max,49688.0,134.0,21.0,99999.0


In [27]:
# Inspecting the products priced at exactly 1 (the minimum reported by describe())
df_prods_clean_no_dups[df_prods_clean_no_dups.prices == 1]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
235,236,Chicken Meatballs Dog Treats,40,8,1.0
1150,1150,"AFC Sushi Spicy Salmon Roll Prepared In Store,...",13,20,1.0
1330,1330,Extra Soft Double Roll Bathroom Tissue,54,17,1.0
1532,1532,Deluxe Macaroni & Cheese Dinner,131,9,1.0
1702,1702,Original Coconut Milk,91,16,1.0
...,...,...,...,...,...
46591,46587,Pearl Couscous Natural,63,9,1.0
46627,46623,"Pie Pans, Large",10,17,1.0
47294,47290,Honey Goat Milk Cheese,21,16,1.0
48158,48154,Unsweetened Almond Coconut Milk Blend,91,16,1.0


In [28]:
# Inspecting the products priced at 99999 (the maximum reported by describe())
df_prods_clean_no_dups[df_prods_clean_no_dups.prices == 99999]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33666,33664,2 % Reduced Fat Milk,84,16,99999.0


From comparing the count for all columns, I can see that there are no missing values in any column as they have been removed in the steps carried out above. 
The minimum and the maximum in the prices column are interesting. On the one hand, I would expect at least one product that costs less than $1. On the other hand, the maximum price of $99,999 looks far too high for an online grocery company.
If I were able to reach out to Instacart, I would export the observations with the minimum prices and ask them for further explanation. As only one observation has the maximum price and it is safe to assume that reduced fat milk does not cost $99,999, I delete this observation by creating a new dataframe df_prods_check_complete.

In [29]:
# Checking for other observations with identical product_id
# Select every row whose product_id equals 33664.
df_prods_clean_no_dups[df_prods_clean_no_dups['product_id'] == 33664]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33666,33664,2 % Reduced Fat Milk,84,16,99999.0


In [30]:
# Checking dimensions of df_prods_clean_no_dups
df_prods_clean_no_dups.shape

(49672, 5)

In [31]:
# Deleting inaccurate observation
# Keep every row whose price differs from 99999; the result is a new
# dataframe, df_prods_clean_no_dups itself is left unchanged.
df_prods_check_complete = df_prods_clean_no_dups[df_prods_clean_no_dups['prices'] != 99999]

In [32]:
# Checking dimensions of dataframe without inaccurate observation
df_prods_check_complete.shape

(49671, 5)

In [33]:
# Exploring dataframe without inaccurate observation
df_prods_check_complete.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49671.0,49671.0,49671.0,49671.0
mean,24850.172334,67.762115,11.728856,7.980256
std,14340.795118,38.3161,5.850806,66.952504
min,1.0,1.0,1.0,1.0
25%,12432.5,35.0,7.0,4.1
50%,24850.0,69.0,13.0,7.1
75%,37268.5,100.0,17.0,11.1
max,49688.0,134.0,21.0,14900.0


In [34]:
# Checking product with maximum price
# Select the row(s) priced at 14900, the new maximum after removing 99999.
df_prods_check_complete[df_prods_check_complete['prices'] == 14900]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
21554,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0


Looking at the product that now has the highest price in the dataset, I begin to wonder whether there's a problem with the prices of lowfat products. I refrain from deleting further observations. Instead, I would reach out to Instacart to inquire whether there is something wrong with the prices for certain products in the dataset.
As I cannot actually reach out to Instacart, I decide not to export the dataframe df_prods_check_complete with the deleted observation, but rather to retain the export of df_prods_clean_no_dups, which still includes the observation with a price of $99,999.

## Step 2: Exploring df_ords

In [35]:
# Exploring df_ords
df_ords.describe()

Unnamed: 0,order_id,user_id,order_sequence_per_user,order_day_of_the_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


From comparing the count for all columns, I can deduce that there are missing values in days_since_prior_order.
The maximum for order_sequence_per_user is 100. While it could be that some users have ordered as often as 100 times, this could be worth investigating further to ensure the data is accurate.
If I was able to reach out to Instacart, I would export the observations for the user or users with very high order_sequence_per_user and ask them for further explanation.

## Step 3: Checking for mixed-type columns in df_ords

In [36]:
# Printing all mixed-type columns in df_ords
# A column is reported when its cells hold more than one Python type.
# Series.map(type) is used instead of DataFrame.applymap, which is deprecated
# since pandas 2.1; counting the distinct types also avoids building a
# filtered copy of the dataframe for every column.
for col in df_ords.columns.tolist():
    if df_ords[col].map(type).nunique() > 1:
        print(col)

No mixed-type columns found in df_ords, step 4 can therefore be omitted.

## Step 5: Checking for missing values in df_ords

In [37]:
# Finding missing values
df_ords.isnull().sum()

order_id                        0
user_id                         0
order_sequence_per_user         0
order_day_of_the_week           0
order_hour_of_day               0
days_since_prior_order     206209
dtype: int64

In [38]:
# Creating a subset with observations with missing values in days_since_prior_order
# isnull() already returns a boolean mask, so it is used directly as the row
# filter (no comparison against True needed).
df_ords_missing_prior_order = df_ords[df_ords['days_since_prior_order'].isnull()]

In [39]:
# Printing subset
df_ords_missing_prior_order

Unnamed: 0,order_id,user_id,order_sequence_per_user,order_day_of_the_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
11,2168274,2,1,2,11,
26,1374495,3,1,1,14,
39,3343014,4,1,6,11,
45,2717275,5,1,3,12,
...,...,...,...,...,...,...
3420930,969311,206205,1,4,12,
3420934,3189322,206206,1,3,18,
3421002,2166133,206207,1,6,19,
3421019,2227043,206208,1,1,15,


In [40]:
# Exploring subset
df_ords_missing_prior_order.describe()

Unnamed: 0,order_id,user_id,order_sequence_per_user,order_day_of_the_week,order_hour_of_day,days_since_prior_order
count,206209.0,206209.0,206209.0,206209.0,206209.0,0.0
mean,1708462.0,103105.0,1.0,2.754118,13.626597,
std,988129.9,59527.555167,0.0,2.076205,4.223769,
min,20.0,1.0,1.0,0.0,0.0,
25%,850730.0,51553.0,1.0,1.0,11.0,
50%,1706246.0,103105.0,1.0,3.0,14.0,
75%,2564292.0,154657.0,1.0,5.0,17.0,
max,3421081.0,206209.0,1.0,6.0,23.0,


Seeing that all observations in the subset with missing data in days_since_prior_order have a value of 1 in order_sequence_per_user, I assume that the observations are valid. The order processing system simply recorded no value (NaN) for the first order of each user, because there is no prior order to count from. The data dictionary available on GitHub confirms this by outlining that days_since_prior are the days since the last order, capped at 30 (with NAs for order_number = 1).

## Step 6: Addressing missing values in df_ords

In [41]:
# Imputing missing values in df_ords
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place call triggers a FutureWarning in
# pandas 2.x and, with Copy-on-Write enabled, no longer modifies df_ords.
df_ords['days_since_prior_order'] = df_ords['days_since_prior_order'].fillna(0)

Against the background of the information given in the data dictionary, the most adequate method to address the missing values is imputation of a value of 0 in the column days_since_prior_order.

In [42]:
# Double-checking df_ords after imputation
df_ords.describe()

Unnamed: 0,order_id,user_id,order_sequence_per_user,order_day_of_the_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,10.44488
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.308727
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [43]:
# Further double-checking df_ords after imputation
df_ords.isnull().sum()

order_id                   0
user_id                    0
order_sequence_per_user    0
order_day_of_the_week      0
order_hour_of_day          0
days_since_prior_order     0
dtype: int64

Checks confirm that imputation was successful

## Step 7: Checking for duplicate values in df_ords

In [44]:
# Creating subset of duplicate values in df_ords
# duplicated() marks every row that fully repeats an earlier row; selecting
# with that mask collects the repeats for inspection.
df_ords_dups = df_ords.loc[df_ords.duplicated()]

In [45]:
# Printing subset of duplicate values in df_ords
df_ords_dups

Unnamed: 0,order_id,user_id,order_sequence_per_user,order_day_of_the_week,order_hour_of_day,days_since_prior_order


There are no full duplicates in df_ords, hence no need to address any duplicate values. Step 8 of the task is therefore omitted.

## Step 9: Exporting dataframes

In [46]:
# Exporting checked orders data
# Write the checked orders data to the prepared-data folder without the
# dataframe index.
df_ords.to_csv(
    os.path.join(path, '02 Data', '02 Prepared data', 'orders_checked.csv'),
    index=False,
)

I decided not to export a new dataframe of the products data. That means my exported products dataframe was cleaned of duplicates, but it still contains observations with unrealistically high prices. I would be grateful for advice: should I investigate further and drop observations until the maximum price seems realistic? Or should I rather keep the unrealistically high prices in the dataset, but keep them in mind for my further analysis?