### 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import time

# Record the wall-clock start time; the last cell subtracts it from the
# current time to report how long the whole notebook took to run.
start_time = time.time()

### 2. Create paths to folders

In [2]:
# Create path to Prepared data folder (the merged orders/products pickle is
# read from here and the final data set is written back to it).
# NOTE(review): absolute, machine-specific Windows path - it has to be edited
# before the notebook can run on another machine.
path_prepared = r'C:\Users\danie\Desktop\career Foundry\31-03-2023 Instacart Basket Analysis\02 Data\Prepared data'

In [3]:
# Create path to 'Visualizations' folder
# NOTE(review): absolute, machine-specific Windows path - it has to be edited
# before the notebook can run on another machine.
path_visualizations = r'C:\Users\danie\Desktop\career Foundry\31-03-2023 Instacart Basket Analysis\04 Analysis\Visualizations'

In [4]:
# Create path to Original data folder (customers.csv is read from here).
# NOTE(review): absolute, machine-specific Windows path - it has to be edited
# before the notebook can run on another machine.
path_original = r'C:\Users\danie\Desktop\career Foundry\31-03-2023 Instacart Basket Analysis\02 Data\Original data'

### 3. Import Customer Data

In [5]:
# Load the raw customer data from the Original data folder
customers_csv = os.path.join(path_original, 'customers.csv')
df_customer = pd.read_csv(customers_csv)

In [6]:
# Preview the first rows of the customer data to see the raw column names and values
df_customer.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [7]:
# Row and column count of the customer data
df_customer.shape

(206209, 10)

### 4. Wrangle the data

#### Rename the columns

In [8]:
# Normalise every column name to lower case
lowercase_names = df_customer.columns.str.lower()
df_customer.columns = lowercase_names

In [9]:
# Give the remaining columns self-descriptive, snake_case names
column_renames = {
    'first name' : 'first_name',
    'n_dependants' : 'number_of_kids',
    'surnam' : 'surname',
    'fam_status' : 'family_status',
}
df_customer.rename(columns = column_renames, inplace = True)

### 5. Data quality and consistency check

In [10]:
# Count the missing values in each column
df_customer.isna().sum()

user_id               0
first_name        11259
surname               0
gender                0
state                 0
age                   0
date_joined           0
number_of_kids        0
family_status         0
income                0
dtype: int64

***NULL values found in the 'first_name' column (11,259 rows). They do not influence our analysis.***

In [11]:
# Collect the rows that are exact copies of an earlier row
duplicate_mask = df_customer.duplicated()
df_dups = df_customer[duplicate_mask]

In [12]:
# Display the full-duplicate rows (an empty frame means there are none)
df_dups

Unnamed: 0,user_id,first_name,surname,gender,state,age,date_joined,number_of_kids,family_status,income


***No full duplicates found in our data set***

In [13]:
# Search for odd values (implausible min / max / mean) in the numeric columns using '.describe()'
df_customer.describe()

Unnamed: 0,user_id,age,number_of_kids,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [14]:
# Check for mixed types in df_customer.
# A column counts as "mixed" when its values are not all of the same Python
# type (for example str names next to float NaN for missing entries).
for col in df_customer.columns.tolist():
    # Series.map(type) replaces the deprecated DataFrame.applymap, and counting
    # the distinct types avoids indexing row 0, so an empty frame no longer raises.
    if df_customer[col].map(type).nunique() > 1:
        print(col)

first_name


***Mixed data types found in column 'first_name'***

In [15]:
# Fix 'Mixed types' in column 'first_name'.
# The column holds str names plus float NaN for the missing entries. A bare
# .astype('str') turns each NaN into the literal text 'nan', which then looks
# like a real name and hides the gaps from every later missing-value check.
# Fill the gaps with an explicit placeholder first, so the column is purely
# str and the missing names remain recognisable.
df_customer['first_name'] = df_customer['first_name'].fillna('Unknown').astype('str')

In [16]:
# Re-check for mixed types in df_customer after the fix; no printed column
# name means every column now holds a single Python type.
for col in df_customer.columns.tolist():
    # Series.map(type) replaces the deprecated DataFrame.applymap, and counting
    # the distinct types avoids indexing row 0, so an empty frame no longer raises.
    if df_customer[col].map(type).nunique() > 1:
        print(col)

### 5. Import Data frame And Merge

In [17]:
# Load the merged orders/products data frame that will be joined to the customer data
ords_prods_pkl = os.path.join(path_prepared, 'orders_products_merged.pkl')
df_ords_prods = pd.read_pickle(ords_prods_pkl)

In [18]:
# Merge df_ords_prods & df_customer on 'user_id'.
# how='inner' (the pandas default, now spelled out) keeps only the user_ids
# present in both frames, so the '_merge' column added by indicator=True can
# only ever contain 'both' for this merge.
# validate='many_to_one' makes pandas raise a MergeError if 'user_id' is not
# unique in df_customer, which would otherwise silently duplicate order rows.
df_completed_instacart = df_ords_prods.merge(
    df_customer,
    on = 'user_id',
    how = 'inner',
    validate = 'many_to_one',
    indicator = True,
)

### 6. Merged Data frame Integrity and Consistency check

In [19]:
# Row and column count of the merged data frame
df_completed_instacart.shape

(32399732, 33)

In [20]:
# List the column names of the merged data frame
df_completed_instacart.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_the_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'price_range_loc', 'busiest_day',
       'busiest_days', 'busiest_period_of_day', 'loyalty_flag', 'max_orders',
       'average_spend', 'spender_flag', 'user_median', 'frequency_flag',
       'first_name', 'surname', 'gender', 'state', 'age', 'date_joined',
       'number_of_kids', 'family_status', 'income', '_merge'],
      dtype='object')

***Column names follow a logical and self-descriptive format***

In [21]:
# Merge check: count the rows in each '_merge' category.
# '_merge' is a categorical column, so observed=False is passed explicitly to
# keep the empty 'left_only' / 'right_only' categories in the result; the zero
# counts then stay visible whatever the installed pandas version defaults to.
df_completed_instacart.groupby('_merge', observed = False).agg({'user_id': ['count']})

Unnamed: 0_level_0,user_id
Unnamed: 0_level_1,count
_merge,Unnamed: 1_level_2
left_only,0
right_only,0
both,32399732


***Complete merge between both Data sets***

In [22]:
# Check for mixed types in df_completed_instacart (the merged data frame);
# no printed column name means every column holds a single Python type.
for col in df_completed_instacart.columns.tolist():
    # Series.map(type) replaces the deprecated DataFrame.applymap, and counting
    # the distinct types avoids indexing row 0, so an empty frame no longer raises.
    if df_completed_instacart[col].map(type).nunique() > 1:
        print(col)

***No mixed data types found within any column of the new dataframe***

In [23]:
# Count the missing values in each column of the merged data frame
df_completed_instacart.isna().sum()

order_id                  0
user_id                   0
order_number              0
orders_day_of_the_week    0
order_hour_of_day         0
days_since_prior_order    0
product_id                0
add_to_cart_order         0
reordered                 0
product_name              0
aisle_id                  0
department_id             0
prices                    0
price_range_loc           0
busiest_day               0
busiest_days              0
busiest_period_of_day     0
loyalty_flag              0
max_orders                0
average_spend             0
spender_flag              0
user_median               0
frequency_flag            0
first_name                0
surname                   0
gender                    0
state                     0
age                       0
date_joined               0
number_of_kids            0
family_status             0
income                    0
_merge                    0
dtype: int64

***No missing or NULL values found in the df_completed_instacart data set***

In [24]:
# Drop the '_merge' indicator column now that the merge has been checked.
# errors='ignore' keeps the cell safe to re-run: without it a second run
# raises KeyError because '_merge' has already been removed.
df_completed_instacart.drop(columns = ['_merge'], errors = 'ignore', inplace = True)

### 7. Export the new Data Frame

In [25]:
# Save the merged data frame as 'instacart_data_set.pkl' in the Prepared data folder
export_pkl = os.path.join(path_prepared, 'instacart_data_set.pkl')
df_completed_instacart.to_pickle(export_pkl)

In [26]:
# Report the total notebook run time, measured from start_time
elapsed = time.time() - start_time
print([elapsed, ' seconds---'])

[210.27458453178406, ' seconds---']
