# Import libraries and load the customer data

In [4]:
# Import required libraries
import pandas as pd
import os

# Folder that holds the raw source files.
path = '/Users/dela/Documents/15-01-2025 Instacart Basket Analysis/02 Data/Original Data'

# Read customers.csv from that folder into a DataFrame.
customers_csv = os.path.join(path, 'customers.csv')
customer_data = pd.read_csv(customers_csv)

# Preview the top rows to sanity-check the import.
print(customer_data.head())


   user_id First Name    Surnam  Gender       STATE  Age date_joined  \
0    26711    Deborah  Esquivel  Female    Missouri   48    1/1/2017   
1    33890   Patricia      Hart  Female  New Mexico   36    1/1/2017   
2    65803    Kenneth    Farley    Male       Idaho   35    1/1/2017   
3   125935   Michelle     Hicks  Female        Iowa   40    1/1/2017   
4   130797        Ann   Gilmore  Female    Maryland   26    1/1/2017   

   n_dependants fam_status  income  
0             3    married  165665  
1             0     single   59285  
2             2    married   99568  
3             0     single   42049  
4             1    married   40374  


In [6]:
# Count the missing values in each column
missing_per_column = customer_data.isnull().sum()
print(missing_per_column)


user_id             0
First Name      11259
Surnam              0
Gender              0
STATE               0
Age                 0
date_joined         0
n_dependants        0
fam_status          0
income              0
dtype: int64


# Drop the `First Name` column (it has 11,259 missing values)

In [8]:
# Remove the 'First Name' column. Assigning the result back (instead of using
# inplace=True) together with errors='ignore' keeps this cell safe to re-run:
# a second run no longer raises KeyError because the column is already gone.
customer_data = customer_data.drop(columns=['First Name'], errors='ignore')


In [10]:
# Null count per column after the drop
remaining_nulls = customer_data.isnull().sum()
print(remaining_nulls)

# Column labels that are still present
print(customer_data.columns)


user_id         0
Surnam          0
Gender          0
STATE           0
Age             0
date_joined     0
n_dependants    0
fam_status      0
income          0
dtype: int64
Index(['user_id', 'Surnam', 'Gender', 'STATE', 'Age', 'date_joined',
       'n_dependants', 'fam_status', 'income'],
      dtype='object')


In [12]:
# Flag rows that are exact copies of an earlier row, then report how many there are
duplicates = customer_data.duplicated()
duplicate_row_count = duplicates.sum()
print(f"Number of duplicate rows: {duplicate_row_count}")


Number of duplicate rows: 0


In [14]:
# Show the dtype pandas assigned to each column
column_dtypes = customer_data.dtypes
print(column_dtypes)


user_id          int64
Surnam          object
Gender          object
STATE           object
Age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object


In [16]:
print(customer_data.head())  # Check first few rows
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
customer_data.info()  # Review data summary


   user_id    Surnam  Gender       STATE  Age date_joined  n_dependants  \
0    26711  Esquivel  Female    Missouri   48    1/1/2017             3   
1    33890      Hart  Female  New Mexico   36    1/1/2017             0   
2    65803    Farley    Male       Idaho   35    1/1/2017             2   
3   125935     Hicks  Female        Iowa   40    1/1/2017             0   
4   130797   Gilmore  Female    Maryland   26    1/1/2017             1   

  fam_status  income  
0    married  165665  
1     single   59285  
2    married   99568  
3     single   42049  
4    married   40374  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   Surnam        206209 non-null  object
 2   Gender        206209 non-null  object
 3   STATE         206209 non-null  object
 4   Age           206209 non-null  int64 


In [18]:
# Parse date_joined with an explicit month/day/year format (raw values look
# like "1/1/2017"). This pins the month-first reading that pandas otherwise has
# to guess, so day and month cannot be swapped silently.
# errors='coerce' is kept: values that do not match the format become NaT and
# surface as missing values in a later isnull() check instead of raising here.
customer_data['date_joined'] = pd.to_datetime(
    customer_data['date_joined'], format='%m/%d/%Y', errors='coerce'
)


In [20]:
# Convert the text columns with a small set of repeating values to the category dtype
for column in ('STATE', 'Gender', 'fam_status'):
    customer_data[column] = customer_data[column].astype('category')


In [22]:
# Fix the misspelled header 'Surnam' and keep the column as plain Python objects
column_fixes = {'Surnam': 'Surname'}
customer_data.rename(columns=column_fixes, inplace=True)
customer_data['Surname'] = customer_data['Surname'].astype(object)


In [24]:
# Confirm the dtype conversions took effect
converted_dtypes = customer_data.dtypes
print(converted_dtypes)


user_id                  int64
Surname                 object
Gender                category
STATE                 category
Age                      int64
date_joined     datetime64[ns]
n_dependants             int64
fam_status            category
income                   int64
dtype: object


# Recheck for missing values

In [26]:
# Per-column null count after the type conversions (coerced dates would appear here as NaT)
nulls_after_conversion = customer_data.isnull().sum()
print(nulls_after_conversion)


user_id         0
Surname         0
Gender          0
STATE           0
Age             0
date_joined     0
n_dependants    0
fam_status      0
income          0
dtype: int64


In [28]:
# Print the frequency of every distinct value in each categorical column
for column in ('Gender', 'STATE', 'fam_status'):
    print(customer_data[column].value_counts())


Gender
Male      104067
Female    102142
Name: count, dtype: int64
STATE
Alabama                 4044
District of Columbia    4044
Iowa                    4044
Indiana                 4044
Illinois                4044
Idaho                   4044
Georgia                 4044
Florida                 4044
Hawaii                  4044
Delaware                4044
Connecticut             4044
Colorado                4044
California              4044
Arkansas                4044
Arizona                 4044
Alaska                  4044
South Dakota            4043
Ohio                    4043
Oklahoma                4043
Oregon                  4043
Pennsylvania            4043
Rhode Island            4043
South Carolina          4043
Wisconsin               4043
Tennessee               4043
Texas                   4043
Utah                    4043
Vermont                 4043
Virginia                4043
Washington              4043
West Virginia           4043
North Carolina          4043

In [30]:
# Earliest and latest values found in date_joined
first_joined = customer_data['date_joined'].min()
last_joined = customer_data['date_joined'].max()
print(first_joined, last_joined)


2017-01-01 00:00:00 2020-04-01 00:00:00


# Save the Cleaned Data

In [32]:
import os

# Output folder for processed files (this rebinds `path`, which previously
# pointed at the 'Original Data' folder)
path = '/Users/dela/Documents/15-01-2025 Instacart Basket Analysis/02 Data/Prepared Data'

# Write the cleaned customer frame to disk in pickle format
cleaned_pickle = os.path.join(path, 'customer_data_cleaned.pkl')
customer_data.to_pickle(cleaned_pickle)


# Combine Data

In [34]:
# Load the Instacart data pickle from the folder currently held in `path`
instacart_data = pd.read_pickle(os.path.join(path, 'ords_prods_merge_final.pkl'))

# Left-join the customer columns onto every Instacart row by user_id.
# validate='many_to_one' makes pandas raise MergeError if user_id is not unique
# in customer_data; without it, a repeated customer would silently duplicate
# that customer's rows in the result. The earlier duplicated() check only
# catches rows identical in every column, not repeated user_ids.
combined_data = instacart_data.merge(
    customer_data, on='user_id', how='left', validate='many_to_one'
)

# Save the combined data
combined_data.to_pickle(os.path.join(path, 'instacart_customer_combined.pkl'))


In [36]:
# info() prints its own report and returns None; calling it directly avoids
# the extra "None" line that print(combined_data.info()) emitted.
combined_data.info()
print(combined_data.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404289 entries, 0 to 32404288
Data columns (total 27 columns):
 #   Column                   Dtype         
---  ------                   -----         
 0   order_id                 int64         
 1   user_id                  int64         
 2   order_number             int64         
 3   order_day_of_week        int64         
 4   order_hour_of_day        int64         
 5   days_since_prior_order   float64       
 6   product_id               object        
 7   add_to_cart_order        int64         
 8   reordered                int64         
 9   product_name             object        
 10  aisle_id                 int64         
 11  department_id            int64         
 12  prices                   float64       
 13  max_order                int64         
 14  loyalty_flag             object        
 15  avg_spending             float64       
 16  spending_flag            object        
 17  median_days_since_order  