# Contents
## Importing Libraries and Data (products.csv)
## Fixing Mixed-Type Data
## Dealing with Missing Values
## Checking for Duplicates
## Exporting Changes
## Task 4.5 (repeating the steps above on orders_wrangled.csv)

# 1. Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import os

# 2. Importing Data

In [2]:
#Defining folder location as 'path'
path = r'C:\Users\davau\OneDrive - College of the Sequoias\Career Foundry\Data Immersion\Achievement 4 (Python)\Instacart Basket Analysis'

In [3]:
#Importing products.csv
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'), index_col = False)

In [4]:
#Importing orders_wrangled.csv
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'), index_col = False)

In [5]:
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [6]:
df_ords.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,prior,1,2,8,
1,1,2398795,1,prior,2,3,7,15.0
2,2,473747,1,prior,3,3,12,21.0
3,3,2254736,1,prior,4,4,7,29.0
4,4,431534,1,prior,5,4,15,28.0


# 3. Fixing Mixed-Type Data

In [7]:
# Create a dataframe
df_test = pd.DataFrame()

In [8]:
# Create a mixed type column

df_test['mix'] = ['a', 'b', 1, True]

In [9]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [10]:
# Checking df_test for columns with mixed data

for col in df_test.columns.tolist():
  weird = (df_test[[col]].applymap(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_test[weird]) > 0:
    print (col)

mix


In [11]:
# Assigning the 'mix' column the string variable type

df_test['mix'] = df_test['mix'].astype('str')

# 4. Missing Values

In [12]:
# Checking df_prods for missing values

df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [13]:
# Note that 'product_name' has 16 missing values.
# Creating a subset of df_prods containing only rows with missing product names

df_nan = df_prods[df_prods['product_name'].isnull() == True]

In [14]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [15]:
# Since the missing values are product names, cannot impute.
# Instead, we will create a new subset dataframe that filters out these rows.
# First, let's see how many rows and columns we have in the current df.

df_prods.shape

(49693, 5)

In [16]:
# Next, create the new subset dataframe

df_prods_clean = df_prods[df_prods['product_name'].isnull() == False]

In [17]:
# Now check the number of rows and columns in the subset dataframe

df_prods_clean.shape

(49677, 5)

# 5. Duplicates

In [18]:
# Looking for full duplicates

df_dups = df_prods_clean[df_prods_clean.duplicated()]

In [19]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [20]:
# Checking number of rows and columns in original df

df_prods_clean.shape

(49677, 5)

In [21]:
# Creating a new df that doesn't include duplicates

df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [22]:
# Checking number of rows and columns in unduplicated df

df_prods_clean_no_dups.shape

(49672, 5)

# 6. Exporting Changes

In [23]:
# Exporting df_prods_clean_no_dups

df_prods_clean_no_dups.to_csv(os.path.join(path,'02 Data','Prepared Data','products_checked.csv'))

# Task 4.5

## Step 2

In [24]:
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


The first thing I notice is that 'days_since_prior_order' has some missing values (its count is lower than the other columns).  Otherwise, everything seems to check out.

## Step 3

In [25]:
# Checking df_ords for mixed data

for col in df_ords.columns.tolist():
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_ords[weird]) > 0:
    print (col)

No mixed data was found!

## Step 5

In [26]:
# Checking df_ords for missing values

df_ords.isnull().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
eval_set                       0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

As noted above, 'days_since_prior_order' has missing values (206,209 to be exact).  One explanation for this is that these may be the first order that the customer has placed, and therefore, the customer has no prior order.  To double-check this assumption, I can take a closer look.

In [27]:
# Creating a subset of df_ords containing only rows with missing product names

df_nan_ords = df_ords[df_ords['days_since_prior_order'].isnull() == True]

In [28]:
df_nan_ords

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,prior,1,2,8,
11,11,2168274,2,prior,1,2,11,
26,26,1374495,3,prior,1,1,14,
39,39,3343014,4,prior,1,6,11,
45,45,2717275,5,prior,1,3,12,
...,...,...,...,...,...,...,...,...
3420930,3420930,969311,206205,prior,1,4,12,
3420934,3420934,3189322,206206,prior,1,3,18,
3421002,3421002,2166133,206207,prior,1,6,19,
3421019,3421019,2227043,206208,prior,1,1,15,


Note that the order number for each of these is 1, confirming my suspicion.

## Step 6

I will leave the missing values as they are.  I thought about imputing 0 for these rows, but that wouldn't be appropriate, as that would indicate that the customer had placed an order earlier that day, which isn't true.  Best to just leave the dataframe as is.  An order_number of 1 will serve as an appropriate flag for rows containing missing values (new customers).

## Step 7

In [29]:
# Checking df_ords for duplicates

df_dups_ords = df_ords[df_ords.duplicated()]

In [30]:
df_dups_ords

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


No duplicate rows were found!

## Step 9

In [31]:
# Exporting df_ords
df_ords.to_csv(os.path.join(path,'02 Data','Prepared Data','orders_checked.csv'))