# 01. Importing libraries

In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import os

# 02 Importing data

In [4]:
# Establish path
path = r'C:\Users\elena\Documents\02.2025 Instacart Basket Analysis'

In [7]:
# Import Orders datafile
df = pd.read_csv(os.path.join(path, '02 Data', 'Original data', 'customers.csv'), index_col = False)

In [9]:
# Checking the data imported correctly
df.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [11]:
# Import the big merged datafile
ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data','Prepared Data', 'ords_prods_merge_4.8.pkl'))

# 03 Correcting columns in the customers dataframe

In [13]:
df.dtypes

user_id          int64
First Name      object
Surnam          object
Gender          object
STATE           object
Age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

'First Name' column is named differently from others and the two separate words might cause problems. It should be renamed. 'Surnam' column should also be renamed, the shortened name is not convenient.

In [17]:
# Renaming 'First Name' column
df.rename(columns = {'First Name' : 'name'}, inplace = True)

In [19]:
# Renaming 'Surnam' column
df.rename(columns = {'Surnam' : 'surname'}, inplace = True)

In [21]:
# Rename other columns, so they are all named in the same style
df.rename(columns = {'Gender' : 'gender'}, inplace = True)
df.rename(columns = {'STATE' : 'state'}, inplace = True)
df.rename(columns = {'Age' : 'age'}, inplace = True)

In [23]:
# Rename fam_status as marital_status
df.rename(columns = {'fam_status' : 'marital_status'}, inplace = True)

In [25]:
df.dtypes

user_id            int64
name              object
surname           object
gender            object
state             object
age                int64
date_joined       object
n_dependants       int64
marital_status    object
income             int64
dtype: object

Columns are named uniformly and logically. No cloumns need to be dropped at this time. Next step is to check for variables with mixed data type.

# 04 Data type of variables in customers dataframe

There is no need for calucating statistics for user_id, it is not a continuous variable. Thus it is best to convert it to a string variable.

In [30]:
# Changing user_id into string variable
df['user_id'] = df['user_id'].astype('str')

In [32]:
df['user_id'].dtype

dtype('O')

In [34]:
# Checking for columns with mixed data type
for col in df.columns.tolist():
  weird = (df[[col]].map(type) != df[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df[weird]) > 0:
    print (col)

name


The 'name' column has mixed data type, which needs to be corrected.

In [37]:
# Changing all values in 'name' column to string data type
df['name'] = df['name'].astype('str')

In [39]:
df['name'].dtype

dtype('O')

# 05 Missing values in customers dataframe 

In [42]:
# Descriptive statistics for customers data to get an idea of the variables
df.describe()

Unnamed: 0,age,n_dependants,income
count,206209.0,206209.0,206209.0
mean,49.501646,1.499823,94632.852548
std,18.480962,1.118433,42473.786988
min,18.0,0.0,25903.0
25%,33.0,0.0,59874.0
50%,49.0,1.0,93547.0
75%,66.0,3.0,124244.0
max,81.0,3.0,593901.0


All the values make logical sense, there are no obvious abnormalities. Count being the same for all numeric variables suggests there are no missing values in these variables. 

In [45]:
# Checking for missing values in customers dataframe
df.isnull().sum()

user_id           0
name              0
surname           0
gender            0
state             0
age               0
date_joined       0
n_dependants      0
marital_status    0
income            0
dtype: int64

There are no missing values in any of the variables

# 06 Duplicates in the customers dataframe

In [49]:
# Creating a separate dataframe for duplicate values
df_dups = df[df.duplicated()]

In [51]:
df_dups

Unnamed: 0,user_id,name,surname,gender,state,age,date_joined,n_dependants,marital_status,income


No duplicates present in the customers dataframe

# 07 Merging customers dataframe with the orders-products combined datafile

In [65]:
ords_prods_merge.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_name,...,_merge,price_range_loc,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_price,spending_flag,prior_order_days_median,frequentcustomer_flag
0,2,33120,1,1,202279,3,5,9,8.0,Organic Egg Whites,...,both,Mid-range product,Regularly busy,Most orders,8,New customer,8.618889,Low spender,24.0,Non-frequent customer
1,2,28985,2,1,202279,3,5,9,8.0,Michigan Organic Kale,...,both,Mid-range product,Regularly busy,Most orders,8,New customer,8.618889,Low spender,24.0,Non-frequent customer
2,2,9327,3,0,202279,3,5,9,8.0,Garlic Powder,...,both,Low-range product,Regularly busy,Most orders,8,New customer,8.618889,Low spender,24.0,Non-frequent customer
3,2,45918,4,1,202279,3,5,9,8.0,Coconut Butter,...,both,Mid-range product,Regularly busy,Most orders,8,New customer,8.618889,Low spender,24.0,Non-frequent customer
4,2,30035,5,0,202279,3,5,9,8.0,Natural Sweetener,...,both,Mid-range product,Regularly busy,Most orders,8,New customer,8.618889,Low spender,24.0,Non-frequent customer


In [55]:
ords_prods_merge['user_id'].dtype

dtype('int64')

Data type of 'user_id' needs to be converted to string, so it is the same datatype as in customers dataframe

In [58]:
# Changing user_id into string variable
ords_prods_merge['user_id'] = ords_prods_merge['user_id'].astype('str')

In [59]:
ords_prods_merge['user_id'].dtype

dtype('O')

In [77]:
df.head()

Unnamed: 0,user_id,name,surname,gender,state,age,date_joined,n_dependants,marital_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [62]:
# Check the shape of the dataframes to be combined
ords_prods_merge.shape

(32399732, 23)

In [64]:
df.shape

(206209, 10)

In [66]:
# Drop the existing '_merge' column from the previous exercises
ords_prods_merge1 = ords_prods_merge.drop(['_merge'], axis =1)

In [68]:
ords_prods_merge1.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_name,...,prices,price_range_loc,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_price,spending_flag,prior_order_days_median,frequentcustomer_flag
0,2,33120,1,1,202279,3,5,9,8.0,Organic Egg Whites,...,11.3,Mid-range product,Regularly busy,Most orders,8,New customer,8.618889,Low spender,24.0,Non-frequent customer
1,2,28985,2,1,202279,3,5,9,8.0,Michigan Organic Kale,...,13.4,Mid-range product,Regularly busy,Most orders,8,New customer,8.618889,Low spender,24.0,Non-frequent customer
2,2,9327,3,0,202279,3,5,9,8.0,Garlic Powder,...,3.6,Low-range product,Regularly busy,Most orders,8,New customer,8.618889,Low spender,24.0,Non-frequent customer
3,2,45918,4,1,202279,3,5,9,8.0,Coconut Butter,...,8.4,Mid-range product,Regularly busy,Most orders,8,New customer,8.618889,Low spender,24.0,Non-frequent customer
4,2,30035,5,0,202279,3,5,9,8.0,Natural Sweetener,...,13.7,Mid-range product,Regularly busy,Most orders,8,New customer,8.618889,Low spender,24.0,Non-frequent customer


In [70]:
# Merging the two dataframes
df_merged = ords_prods_merge1.merge(df, on = 'user_id', indicator = True)

In [71]:
#Checking the number of rows results from the inner merge
df_merged['_merge'].value_counts()

_merge
both          32399732
left_only            0
right_only           0
Name: count, dtype: int64

In [72]:
# Checking the merged dataframe
df_merged.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_name,...,name,surname,gender,state,age,date_joined,n_dependants,marital_status,income,_merge
0,2,33120,1,1,202279,3,5,9,8.0,Organic Egg Whites,...,Paul,Coleman,Male,Idaho,57,2/6/2020,3,married,98119,both
1,2,28985,2,1,202279,3,5,9,8.0,Michigan Organic Kale,...,Paul,Coleman,Male,Idaho,57,2/6/2020,3,married,98119,both
2,2,9327,3,0,202279,3,5,9,8.0,Garlic Powder,...,Paul,Coleman,Male,Idaho,57,2/6/2020,3,married,98119,both
3,2,45918,4,1,202279,3,5,9,8.0,Coconut Butter,...,Paul,Coleman,Male,Idaho,57,2/6/2020,3,married,98119,both
4,2,30035,5,0,202279,3,5,9,8.0,Natural Sweetener,...,Paul,Coleman,Male,Idaho,57,2/6/2020,3,married,98119,both


# 08 Exporting the merged dataframe as a pickle file

In [None]:
df_merged.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'ords_prods_merge4.9.pkl'))