# 4.5: Data Consistency Checks

 01. Importing Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

02. Importing Data

In [2]:
path = r'C:\Users\Frederick\Documents\CareerFoundry\CF - Data Analytics Immersion\Achievement4\04-2024 Instacart Basket Analysis'

In [3]:
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'), index_col = False)

In [4]:
df_ords = pd.read_csv(os.path.join(path,'02 Data', 'Prepared Data', 'orders_wrangled.csv'), index_col = False)

In [53]:
# Begin data consistency check by reviewing descriptive statistics

In [54]:
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,10.86681
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,8.978521
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,5.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [57]:
df_ords.drop(columns=['Unnamed: 0'], inplace =True)

In [58]:
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,10.86681
std,987581.7,59533.72,17.73316,2.046829,4.226088,8.978521
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,5.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [56]:
df_prods.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49693.0,49693.0,49693.0,49693.0
mean,24844.345139,67.770249,11.728433,9.994136
std,14343.717401,38.316774,5.850282,453.519686
min,1.0,1.0,1.0,1.0
25%,12423.0,35.0,7.0,4.1
50%,24845.0,69.0,13.0,7.1
75%,37265.0,100.0,17.0,11.2
max,49688.0,134.0,21.0,99999.0


In [59]:
# Carefully analyzing your data and knowing what the columns are supposed to contain can help greatly when it comes to verifying your data’s consistency

03. Mixed-Type Data

In [5]:
# Instacart data is already preped so we will practice fixing mixed-type data by creating a small test dataframe

In [6]:
# Create a dataframe

In [7]:
df_test = pd.DataFrame()

In [8]:
#Create a mixed type column

In [9]:
df_test['mix'] = ['a', 'b', 1, True]

In [10]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [11]:
# Check for any mixed-type columns
for col in df_test.columns.tolist():
    weird = (df_test[col].map(type) != df_test[col].iloc[0].__class__).any()
    if weird:
        print(col)

mix


In [12]:
# Convert column's data from numeric to string
df_test['mix'] = df_test['mix'].astype('str')

In [13]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   mix     4 non-null      object
dtypes: object(1)
memory usage: 164.0+ bytes


In [14]:
df_test['mix'].dtype

dtype('O')

04. Missing Values

In [15]:
# Find missing values (takes sum of Trues in a  column)
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [16]:
# Create subset of the dataframe that contains the nulls
df_nan = df_prods[df_prods['product_name'].isnull() == True]

In [17]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [20]:
# Addressing Missing Values
There a few ways to deal with missing data:

1. Create a new variable that acts like a flag based on the missing value.
2. Impute the value with the mean or median of the column (if the variable is numeric).
3. Remove or filter out the missing data.

In [21]:
df_nan.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,16.0,16.0,16.0,16.0
mean,6684.0,89.9375,10.9375,13.0125
std,12836.665242,33.731229,4.639953,3.881731
min,34.0,26.0,1.0,1.2
25%,459.25,70.75,7.75,12.175
50%,2413.0,98.5,11.5,13.65
75%,3872.75,120.0,14.5,14.425
max,40440.0,126.0,16.0,20.9


In [22]:
# Ex: When using the mean, df['column with missings'].fillna(mean value, inplace=True)

In [23]:
df_nan.median()

product_id       2413.0
product_name        NaN
aisle_id           98.5
department_id      11.5
prices            13.65
dtype: object

In [24]:
# EX: when using the median, df['column with missings'].fillna(median value, inplace=True)

In [25]:
# Because the missing values are strings there's not much you can do other than remove/filter the data

In [26]:
# Find shape of df
df_prods.shape

(49693, 5)

In [27]:
# Create new df without the nulls
df_prods_clean = df_prods[df_prods['product_name'].isnull() == False]

In [28]:
# Should have exactly 16 less rows
df_prods_clean.shape

(49677, 5)

In [29]:
#To drop all missing values --> df_prods.dropna(inplace = True)

In [31]:
#To drop only the NaNs from a particular column --> df_prods.dropna(subset = [‘product_name’], inplace = True)

05. Duplicates

In [32]:
# Create subset of df_prods_clean that contains only rows of duplicates
df_dups = df_prods_clean[df_prods_clean.duplicated()]

In [33]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [34]:
# Check current number of rowns in df_prods_clean
df_prods_clean.shape

(49677, 5)

In [35]:
# Create datafram that doesn't include the duplicates
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [36]:
df_prods_clean_no_dups.shape

(49672, 5)

06. Exporting Changes

In [37]:
df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data','Prepared Data', 'products_checked.csv'))

# 4.5 Task

## Step 2

In [40]:
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


'days_since prior order': Has a count less than all other rows inndicating missing values. It also has a measure of zero indicating that some orders could be places within the same day. The max value is 30, which may indicate that orders are capped at 30 days or 1 month.
'order_number': Has a max value of 100, which may mean orders are capped at 100 items.
'order_day of the week': Has min value of 0 and max value of 6 meaning the seven numbers (0-6) represent days of the week.
'order_hour_of_day': Has a min value of 0 and max value of 23 meaning the twenty-four numbers (0-23) represent 24 hrs of the day.

## Step 3

In [42]:
for col in df_ords.columns.tolist():
    weird = (df_ords[col].map(type) != df_ords[col].iloc[0].__class__).any()
    if weird:
        print(col)

Unnamed: 0
order_id
user_id
order_number
orders_day_of_week
order_hour_of_day
days_since_prior_order


In [43]:
df_ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Unnamed: 0              int64  
 1   order_id                int64  
 2   user_id                 int64  
 3   order_number            int64  
 4   orders_day_of_week      int64  
 5   order_hour_of_day       int64  
 6   days_since_prior_order  float64
dtypes: float64(1), int64(6)
memory usage: 182.7 MB


## Step 5

In [44]:
# Find missing values (takes sum of Trues in a  column)
df_ords.isnull().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

As suspected, the variable that has missing values is the 'days_since_prior_order' which has 206209 values missing.

## Step 6

In [61]:
# In order to determine what course of action to take regarding the missing values, we must separate the data from the original dataframe
df_ords_nan = df_ords[df_ords['days_since_prior_order'].isnull() == True]

In [65]:
df_ords_nan

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


## Step 7

In [49]:
# Create subset of df_ords that contains only rows of duplicates
df_ords_dups = df_ords[df_ords.duplicated()]

In [50]:
df_ords_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


There are no duplicates. There are no duplicates in this set because it returned empty. however, if there were duplicates I would execute '.drop_duplicates()' to creat a new datafreame without duplicates.

## Step 9

In [51]:
df_ords.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_checked.csv'))