# Table of Contents
### 1. Import Libraries
### 2. Import Data
### 3. Clean the customers data set
### 4. Merge the data sets
### 5. Export Data

# 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# 2. Import Data

In [2]:
#Create path to the project folder.
#The folder can be overridden with the INSTACART_PROJECT_DIR environment variable so the notebook
#also runs on machines where the project lives elsewhere; when the variable is not set, the
#original local folder is used unchanged.
path = os.environ.get('INSTACART_PROJECT_DIR', r'C:\Users\manev\Documents\Agentür für Arbeit\CareerFoundry\Data Immersion\Achievement 4\Instacart Basket Analysis')

In [3]:
#Load the prepared, aggregated orders/products pickle into the "ords_prods_merge" data frame
ords_prods_file = os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_aggregated.pkl')
ords_prods_merge = pd.read_pickle(ords_prods_file)

In [4]:
#Load the original customers CSV into the "df_customers" data frame
customers_csv_file = os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
df_customers = pd.read_csv(customers_csv_file)

In [5]:
#Check the dimensions (rows, columns) of the freshly loaded customers data frame
df_customers.shape

(206209, 10)

# 3. Clean the customers data set

In [6]:
#Preview the first rows of df_customers to inspect its column names and sample values
df_customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [7]:
#Give the customer columns consistent, readable names (old name -> new name)
renamed_columns = {
    'First Name': 'First_Name',
    'Surnam': 'Last_Name',
    'STATE': 'State',
    'n_dependants': 'Number_of_Dependants',
    'fam_status': 'Family_Status',
    'income': 'Income',
}
#Apply the mapping directly on df_customers (the frame is modified in place)
df_customers.rename(columns=renamed_columns, inplace=True)

In [8]:
#Preview df_customers again to confirm the column headers carry the new names
df_customers.head()

Unnamed: 0,user_id,First_Name,Last_Name,Gender,State,Age,date_joined,Number_of_Dependants,Family_Status,Income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [9]:
#Save the renamed customers data (all columns still present) before any columns are dropped
renamed_customers_file = os.path.join(path, '02 Data','Prepared Data', 'customers_renamed.csv')
df_customers.to_csv(renamed_customers_file, index=False)

In [10]:
#Build the working "customers" frame without the name columns, which the analysis does not use
name_columns = ['First_Name', 'Last_Name']
customers = df_customers.drop(columns=name_columns)

In [11]:
#Preview customers to confirm the name columns are gone
customers.head()

Unnamed: 0,user_id,Gender,State,Age,date_joined,Number_of_Dependants,Family_Status,Income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374


In [12]:
#Count the missing values in each column of customers
missing_mask = customers.isna()
missing_mask.sum()

user_id                 0
Gender                  0
State                   0
Age                     0
date_joined             0
Number_of_Dependants    0
Family_Status           0
Income                  0
dtype: int64

In [13]:
#Collect every row that is an exact repeat of an earlier row in customers
duplicate_mask = customers.duplicated()
df_dups = customers.loc[duplicate_mask]

In [14]:
#Preview df_dups; an empty result means no fully duplicated rows were found
df_dups.head()

Unnamed: 0,user_id,Gender,State,Age,date_joined,Number_of_Dependants,Family_Status,Income


In [15]:
#Record the shape of customers before the drop_duplicates step, for comparison afterwards
customers.shape

(206209, 8)

In [16]:
#Remove fully duplicated rows from customers.
#drop_duplicates() returns a new data frame and leaves the original untouched, so the result has to
#be assigned back; otherwise the de-duplication is silently discarded.
customers = customers.drop_duplicates()
#Display the (truncated) result
customers

Unnamed: 0,user_id,Gender,State,Age,date_joined,Number_of_Dependants,Family_Status,Income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374
...,...,...,...,...,...,...,...,...
206204,168073,Female,North Carolina,44,4/1/2020,1,married,148828
206205,49635,Male,Hawaii,62,4/1/2020,3,married,168639
206206,135902,Female,Missouri,66,4/1/2020,2,married,53374
206207,81095,Female,California,27,4/1/2020,1,married,99799


In [17]:
#Re-check the shape of customers after the drop_duplicates step to compare with the earlier shape
customers.shape

(206209, 8)

In [18]:
#Report every column of customers whose values do not all share the Python type of the column's first value
for column_name in customers.columns.tolist():
    column_frame = customers[[column_name]]
    #Type of the first value in this column, used as the reference type
    first_value_type = column_frame.iloc[0].apply(type)
    #True for each row whose value has a different type than the reference
    has_other_type = (column_frame.map(type) != first_value_type).any(axis = 1)
    #Print the column name only when at least one row deviates
    if has_other_type.any():
        print(column_name)

There is no mixed-type data in the dataframe: the check above printed no column names.

# 4. Merge the data sets

In [19]:
#Preview the first rows of ords_prods_merge before merging it with the customers data
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,average_item_price_per_customer,spending_flag,median_days_in_between_orders,frequency_flag
0,2539329,1,1,2,8,,196,1,0,Soda,...,Mid-range product,Regularly busy,Regularly busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Mid-range product,Regularly busy,Least busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Mid-range product,Regularly busy,Least busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Mid-range product,Least busy,Least busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Mid-range product,Least busy,Least busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer


In [20]:
#Check the shape of the ords_prods_merge dataframe; its row count is the reference for the merged result
ords_prods_merge.shape

(32404859, 23)

In [21]:
#Inspect the column dtypes of ords_prods_merge before downcasting
ords_prods_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 23 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   order_id                         int64  
 1   user_id                          int64  
 2   order_number                     int64  
 3   orders_day_of_week               int64  
 4   order_hour_of_day                int64  
 5   days_since_prior_order           float64
 6   product_id                       int64  
 7   add_to_cart_order                int64  
 8   reordered                        int64  
 9   product_name                     object 
 10  aisle_id                         int64  
 11  department_id                    int64  
 12  prices                           float64
 13  price_range_loc                  object 
 14  busiest_day                      object 
 15  busiest_days                     object 
 16  busiest_period_of_day            object 
 17  max_or

In [22]:
## reducing the size of ords_prods_merge via downcasting
#Target dtype for each numeric column that is stored wider than needed
ords_prods_downcast_dtypes = {
    'order_id': 'int32',
    'user_id': 'int32',
    'order_number': 'int8',
    'orders_day_of_week': 'int8',
    'order_hour_of_day': 'int8',
    'days_since_prior_order': 'float32',
    'product_id': 'int32',
    'add_to_cart_order': 'int16',
    'reordered': 'int8',
    'aisle_id': 'int16',
    'department_id': 'int8',
    'prices': 'float32',
    'max_order': 'int8',
}
#Cast one column at a time, in the listed order, writing each result back into the same frame
for column_name, target_dtype in ords_prods_downcast_dtypes.items():
    ords_prods_merge[column_name] = ords_prods_merge[column_name].astype(target_dtype)

In [23]:
#Re-inspect the column dtypes of ords_prods_merge to confirm the downcasting took effect
ords_prods_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 23 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   order_id                         int32  
 1   user_id                          int32  
 2   order_number                     int8   
 3   orders_day_of_week               int8   
 4   order_hour_of_day                int8   
 5   days_since_prior_order           float32
 6   product_id                       int32  
 7   add_to_cart_order                int16  
 8   reordered                        int8   
 9   product_name                     object 
 10  aisle_id                         int16  
 11  department_id                    int8   
 12  prices                           float32
 13  price_range_loc                  object 
 14  busiest_day                      object 
 15  busiest_days                     object 
 16  busiest_period_of_day            object 
 17  max_or

In [24]:
#Inspect the column dtypes and memory usage of customers before downcasting
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 8 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   user_id               206209 non-null  int64 
 1   Gender                206209 non-null  object
 2   State                 206209 non-null  object
 3   Age                   206209 non-null  int64 
 4   date_joined           206209 non-null  object
 5   Number_of_Dependants  206209 non-null  int64 
 6   Family_Status         206209 non-null  object
 7   Income                206209 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 12.6+ MB


In [25]:
## reducing the size of customers via downcasting
#Target dtype for each integer column of customers
customers_downcast_dtypes = {
    'user_id': 'int32',
    'Age': 'int8',
    'Number_of_Dependants': 'int8',
    'Income': 'int32',
}
#Cast one column at a time, in the listed order, writing each result back into the same frame
for column_name, target_dtype in customers_downcast_dtypes.items():
    customers[column_name] = customers[column_name].astype(target_dtype)

In [26]:
#Re-inspect customers to confirm the new dtypes and the memory usage after downcasting
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 8 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   user_id               206209 non-null  int32 
 1   Gender                206209 non-null  object
 2   State                 206209 non-null  object
 3   Age                   206209 non-null  int8  
 4   date_joined           206209 non-null  object
 5   Number_of_Dependants  206209 non-null  int8  
 6   Family_Status         206209 non-null  object
 7   Income                206209 non-null  int32 
dtypes: int32(2), int8(2), object(4)
memory usage: 8.3+ MB


In [27]:
#Check the dtype of the merge key in customers
customers['user_id'].dtype

dtype('int32')

In [28]:
#Check the dtype of the merge key in ords_prods_merge so it can be compared with the customers key
ords_prods_merge['user_id'].dtype

dtype('int32')

In [29]:
#Merge the customer attributes onto every order/product row, matching on user_id.
#how='left' keeps every row of ords_prods_merge, so the '_merge' indicator can actually reveal order
#rows without a customer record ('left_only'); with the default inner join such rows are dropped
#before the indicator is computed and '_merge' can only ever be 'both'.
#validate='many_to_one' raises a MergeError if user_id is not unique in customers, which would
#otherwise silently duplicate order rows.
df_ords_prods_cust_merge = ords_prods_merge.merge(customers, on = 'user_id', how = 'left', validate = 'many_to_one', indicator = True)

In [30]:
#Preview the merged data frame, including the customer columns and the '_merge' indicator
df_ords_prods_cust_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,median_days_in_between_orders,frequency_flag,Gender,State,Age,date_joined,Number_of_Dependants,Family_Status,Income,_merge
0,2539329,1,1,2,8,,196,1,0,Soda,...,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both


In [31]:
#Count how many rows fall into each '_merge' category (both / left_only / right_only)
df_ords_prods_cust_merge['_merge'].value_counts()

_merge
both          32404859
left_only            0
right_only           0
Name: count, dtype: int64

In [32]:
#Check the shape of the merged data frame (the '_merge' column is still included here)
df_ords_prods_cust_merge.shape

(32404859, 31)

This new dataframe has the same number of rows as the ords_prods_merge dataframe, and every row is flagged `both`, so every order row was matched to a customer record (a full match).

In [33]:
#Remove the "_merge" indicator column now that the merge has been checked
df_ords_prods_cust_merge = df_ords_prods_cust_merge.drop('_merge', axis = 1)

In [34]:
#Checking that the "_merge" column has been dropped
df_ords_prods_cust_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,spending_flag,median_days_in_between_orders,frequency_flag,Gender,State,Age,date_joined,Number_of_Dependants,Family_Status,Income
0,2539329,1,1,2,8,,196,1,0,Soda,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423


In [35]:
#Checking the shape (rows, columns) of the final data set, so it can be used as a reference when importing this data set in another notebook
df_ords_prods_cust_merge.shape

(32404859, 30)

# 5. Export Data

In [36]:
#Write the merged orders/products/customers data frame to a pickle file in the Prepared Data folder
merged_pickle_file = os.path.join(path, '02 Data','Prepared Data', 'ords_prods_cust_merge.pkl')
df_ords_prods_cust_merge.to_pickle(merged_pickle_file)