# 01. Importing libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02. Importing data

In [2]:
# Main project folder. Defaults to the original local path, but can be overridden
# through the INSTACART_PROJECT_PATH environment variable so the notebook can run
# on another machine without editing this cell.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'/Users/dariaperestiuk/Documents/02_02_24 Instacart Basket Analysis',
)

In [3]:
# Load the orders dataset; os.path.join assembles the file location from the main folder path
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', '4', 'orders.csv'),
    index_col=False,
)

In [27]:
# Display the orders dataframe to inspect its columns and values
df_ords

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0


In [4]:
# Load the products dataset from the same 'Original Data' folder as the orders file
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', '4', 'products.csv'),
    index_col=False,
)

# 03. Data Consistency Checks

In [6]:
# Summary statistics of the numeric columns in df_ords (count, mean, std, min, quartiles, max)
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


## Mixed-Type Data

In [7]:
# Create an empty dataframe that is used to demonstrate a mixed-type column
df_test = pd.DataFrame()

In [9]:
# Create a mixed type column: the values are two strings, an integer and a boolean
df_test['mix'] = ['a', 'b', 1, True]

In [10]:
# Show the first rows of the test dataframe
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [31]:
# Check for mixed types: print the name of every column in which at least one
# value has a different Python type than the value in the column's first row.
# Series.map(type) is used because DataFrame.applymap is deprecated in recent pandas.
for col in df_test.columns.tolist():
    # Boolean mask: True for rows whose type differs from the first row's type
    weird = df_test[col].map(type) != type(df_test[col].iloc[0])
    if weird.any():
        print(col)

In [13]:
# Convert every value of the 'mix' column to a string so the column has a single type
df_test['mix'] = df_test['mix'].astype(str)

## Missing Values

In [14]:
# Count the missing values in each column of df_prods
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [15]:
# Create a subset containing only the rows whose product_name is missing
# (isnull() already returns a boolean mask, so no comparison with True is needed)
df_nan = df_prods[df_prods['product_name'].isnull()]

In [17]:
# Display the products whose product_name is missing
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


## Addressing Missing Values

In [18]:
# Number of rows and columns in df_prods before the missing values are addressed
df_prods.shape

(49693, 5)

In [19]:
# 1st way. Create a subset containing only non-missing values
# (notnull() gives the boolean mask directly instead of comparing isnull() with False)
df_prods_clean = df_prods[df_prods['product_name'].notnull()]

In [20]:
# Display the products dataframe after the rows with a missing product_name were filtered out
df_prods_clean

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


## Duplicate values

In [21]:
# Collect the rows of df_prods_clean that are exact repeats of an earlier row
df_dups = df_prods_clean.loc[df_prods_clean.duplicated(keep='first')]

In [22]:
# Display the fully duplicated product rows
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [23]:
# Build a new dataframe that keeps only the first occurrence of each fully duplicated row
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep='first')

In [24]:
# Number of rows and columns left after removing missing values and duplicates
df_prods_clean_no_dups.shape

(49672, 5)

# 04. Task Procedures

In [25]:
# 2. Check df_ords dataframe for inconsistencies using the summary statistics of its numeric columns
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


#### order_number has a min value of 1 and a max value of 100, while the 25% and 75% values are 5 and 23.
#### days_since_prior_order column has a min of 0 and a max of 30, whereas the median is 7. The max value stands out as a possible discrepancy here.

In [29]:
# 3. Check for mixed-type data in df_ords dataframe: print the name of every column
# in which at least one value has a different Python type than the first row's value.
# Series.map(type) is used because DataFrame.applymap is deprecated in recent pandas.
for col in df_ords.columns.tolist():
    # Boolean mask: True for rows whose type differs from the first row's type
    weird = df_ords[col].map(type) != type(df_ords[col].iloc[0])
    if weird.any():
        print(col)

#### No mixed-type data was found.

In [32]:
# 5. Count the missing values in each column of the df_ords dataframe
df_ords.isna().sum()

order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

#### There are 206,209 missing values in the 'days_since_prior_order' column. However, these are most likely each customer's first order (order_number = 1), for which there is no prior order to count days from.
#### It's not safe to delete such a large number of values.
#### Here are the steps I plan to take to address the issue:
1. Check whether the frequency of missing values is the same as the frequency of order_number=1.
2. Create a new column that flags first orders.

In [35]:
# Create a subset containing only the orders whose days_since_prior_order is missing
# (isnull() already returns a boolean mask, so no comparison with True is needed)
df_nan_ords = df_ords[df_ords['days_since_prior_order'].isnull()]

In [36]:
# Display the orders whose days_since_prior_order is missing
df_nan_ords

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
11,2168274,2,prior,1,2,11,
26,1374495,3,prior,1,1,14,
39,3343014,4,prior,1,6,11,
45,2717275,5,prior,1,3,12,
...,...,...,...,...,...,...,...
3420930,969311,206205,prior,1,4,12,
3420934,3189322,206206,prior,1,3,18,
3421002,2166133,206207,prior,1,6,19,
3421019,2227043,206208,prior,1,1,15,


In [37]:
# Check the frequency of order_number=1 among the orders with a missing days_since_prior_order.
# This must use df_nan_ords (the orders subset); df_nan is the products subset and has
# no 'order_number' column, so it would raise a KeyError on a fresh run.
df_nan_ords['order_number'].value_counts()

1    206209
Name: order_number, dtype: int64

In [38]:
# Check the frequency of order_number=1 in df_ords
# (value_counts lists how often every order_number occurs; the count for 1 is the one to compare)
df_ords['order_number'].value_counts()

1      206209
2      206209
3      206209
4      206209
5      182223
        ...  
96       1592
97       1525
98       1471
99       1421
100      1374
Name: order_number, Length: 100, dtype: int64

#### The frequency of missing values (206,209) is the same as the frequency of order_number=1. Create a new column that flags first orders.


In [41]:
# 7. Collect the rows of df_ords that are exact repeats of an earlier row
df_ords_dups = df_ords.loc[df_ords.duplicated(keep='first')]

In [42]:
# Display the fully duplicated order rows (if any)
df_ords_dups

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order


In [43]:
# The first number of the shape is the number of duplicated rows found
df_ords_dups.shape

(0, 7)

#### No duplicates found.

# 05. Exporting Dataframes

In [48]:
# Save the df_ords dataframe to the 'Prepared Data' folder as "orders_cleaned.csv"
df_ords.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_cleaned.csv')
)

In [49]:
# Exporting the cleaned products dataframe as "prods_cleaned.csv".
# df_prods_clean_no_dups is exported instead of df_prods, because df_prods still
# contains the rows with a missing product_name and the full duplicates.
df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'prods_cleaned.csv'))