# 5.5 Data Consistency Checks

### This script contains the following points:

### 1. Mixed-Type Data
### 2. Missing Values
### 3. Duplicates
### 4. Tidying Up and Exporting Changes

In [1]:
#Import Libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Project root folder. Set the INSTACART_PROJECT_PATH environment variable to run
# this notebook on another machine; otherwise the original local folder is used.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\Thor\OneDrive\Desktop\Career Foundry Tools\Projects\Project 5\09-01-2021 Instacart Basket Analysis',
)

In [3]:
# Load the original products data set
prods_file = os.path.join(path, '02 Data', 'Original Data', 'products.csv')
df_prods = pd.read_csv(prods_file, index_col = False)

In [5]:
# Load the orders data set produced by the earlier wrangling step
ords_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
df_ords = pd.read_csv(ords_file, index_col = False)

In [6]:
# Summary statistics for the orders data as loaded (still includes the 'Unnamed: 0' column).
# NOTE(review): user_id ranges from -32768 to 32767, exactly the int16 range, so it was
# presumably downcast and overflowed in the wrangling step - confirm against the original orders.csv.
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,260.4052,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,18496.1,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,-32768.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,-15537.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,1261.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,15616.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,32767.0,100.0,6.0,23.0,30.0


In [7]:
# Drop the leftover index column that was written into orders_wrangled.csv.
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
df_ords = df_ords.drop(columns = ['Unnamed: 0'], errors = 'ignore')

In [8]:
# Re-run the summary to confirm the 'Unnamed: 0' column is no longer present
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,260.4052,17.15486,2.776219,13.45202,11.11484
std,987581.7,18496.1,17.73316,2.046829,4.226088,9.206737
min,1.0,-32768.0,1.0,0.0,0.0,0.0
25%,855271.5,-15537.0,5.0,1.0,10.0,4.0
50%,1710542.0,1261.0,11.0,3.0,13.0,7.0
75%,2565812.0,15616.0,23.0,5.0,16.0,15.0
max,3421083.0,32767.0,100.0,6.0,23.0,30.0


# Mixed-Type Data

In [9]:
# Create an empty dataframe to demonstrate mixed-type detection
df_test = pd.DataFrame()

In [10]:
# Create a mixed-type column holding str, int and bool values
df_test['mix'] = ['a', 'b', 1, True]

In [11]:
# Preview the test dataframe
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [14]:
# The mixed column is reported with the generic 'object' dtype
df_test.dtypes

mix    object
dtype: object

In [15]:
# Check for mixed type data: print the name of every column whose values
# are not all of the same Python type.
for col in df_test.columns.tolist():
    # Series.map(type) replaces DataFrame.applymap (deprecated since pandas 2.1).
    # Counting distinct types also avoids indexing the first row, so an empty
    # dataframe no longer raises an IndexError.
    if df_test[col].map(type).nunique() > 1:
        print (col)

mix


In [19]:
# Convert every value in the mixed column to a string so the column holds a single type
df_test['mix'] = df_test['mix'].astype('str')

# Missing Values

In [25]:
# Count missing values per column: .isnull() flags each missing cell, .sum() totals the flags per column
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [26]:
# Subset of the products whose product_name is missing
name_missing = df_prods['product_name'].isna()
df_nan = df_prods.loc[name_missing]

In [27]:
# Inspect the rows with a missing product name
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [29]:
# Row and column count before the rows with missing product names are removed
df_prods.shape

(49693, 5)

In [30]:
# Build a new dataframe that keeps only the products with a name; df_prods is left unchanged
name_present = df_prods['product_name'].notna()
df_prods_clean = df_prods.loc[name_present]

In [31]:
# Check that the rows with missing product names were removed (row count should fall by the number of missing names)
df_prods_clean.shape

(49677, 5)

# Duplicates

In [32]:
# Collect the rows that are full duplicates of another row in the cleaned products data
is_duplicate_row = df_prods_clean.duplicated()
df_dups = df_prods_clean.loc[is_duplicate_row]

In [33]:
# Inspect the duplicated product rows
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [34]:
# Row and column count before the duplicate rows are removed
df_prods_clean.shape

(49677, 5)

In [35]:
# Remove the duplicate rows found above with drop_duplicates(); the result is a new dataframe
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [36]:
# Confirm the row count fell by the number of duplicate rows
df_prods_clean_no_dups.shape

(49672, 5)

# Tidying Up and Exporting Changes

In [37]:
# Save the products data that has been cleaned and checked.
# index = False keeps the row labels out of the file; without it they are written as an
# unnamed first column that comes back as 'Unnamed: 0' when the CSV is read again.
# NOTE(review): any later notebook that drops 'Unnamed: 0' from this file no longer needs to.
df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data','Prepared Data', 'products_checked.csv'), index = False)

# Task 5.5

## Question 3: Check for mixed-type data in your df_ords dataframe

In [42]:
# List the dtype of every column; a mixed-type column would be reported as 'object'
df_ords.dtypes

order_id                    int64
user_id                     int64
order_number                int64
orders_day_of_week          int64
order_time_of_day           int64
days_since_prior_order    float64
dtype: object

### No mixed-type data: every column has a single numeric dtype (int64 or float64).

## Question 5: Run a check for missing values in your df_ords dataframe. In a markdown cell, report your findings and propose an explanation for any missing values you find.

In [43]:
# Count missing values per column in the orders data
df_ords.isnull().sum()

order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_time_of_day              0
days_since_prior_order    206209
dtype: int64

### There are 206,209 missing values in the days_since_prior_order column. These are most likely customers' first orders, for which there is no prior order to measure from.

## Question 6: Address the missing values using an appropriate method. In a markdown cell, explain why you used your method of choice.

In [44]:
# Build a new dataframe that keeps only the orders with a days_since_prior_order value
has_prior_order = df_ords['days_since_prior_order'].notna()
df_ords_clean = df_ords.loc[has_prior_order]

In [45]:
# Row and column count after the rows with missing values are removed
df_ords_clean.shape

(3214874, 6)

In [46]:
# Row and column count of the unfiltered orders data, for comparison
df_ords.shape

(3421083, 6)

In [48]:
# Check to see if number of values removed from orders matches the number of null values.
# The counts are taken from the dataframes instead of being typed in by hand,
# so the check stays correct if the input data changes.
len(df_ords) - len(df_ords_clean)

206209

### I chose to filter out the missing data by creating a new dataframe that excludes it. This project focuses on targeting different customers based on their purchasing behaviors, which requires data about repeat customers. In this particular case, data about new customers isn't important. Because the rows with missing values made up only about 6% of the data, I thought it made the most sense to filter them out.

## Question 7: Run a check for duplicate values in your df_ords data. In a markdown cell, report your findings and propose an explanation for any duplicate values you find.

In [49]:
# Collect the rows that are full duplicates of another row in the cleaned orders data
is_duplicate_order = df_ords_clean.duplicated()
df_ords_dups_clean = df_ords_clean.loc[is_duplicate_order]

In [50]:
# Inspect the duplicated order rows (an empty dataframe means none were found)
df_ords_dups_clean

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time_of_day,days_since_prior_order


### No duplicates found: the check for fully duplicated rows returned an empty dataframe.

In [51]:
# Save the orders data that has been cleaned and checked.
# index = False keeps the row labels out of the file; without it they are written as an
# unnamed first column that comes back as 'Unnamed: 0' when the CSV is read again.
# NOTE(review): any later notebook that drops 'Unnamed: 0' from this file no longer needs to.
df_ords_clean.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_checked.csv'), index = False)