## Data Visualization (B)

In [1]:
# Importing libraries

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# Render floating-point values with two decimal places in DataFrame/Series output

pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
# Remove the limit on displayed columns (None = show every column instead of truncating)

pd.options.display.max_columns = None

In [5]:
# Importing project data
# NOTE: 'path' is an absolute, machine-specific project folder; adjust it when running elsewhere.

path = r'C:\Users\efens\cf_tasks\2023-07 Instacard Basket Analysis'
customers_file = os.path.join(path, '02 Data', '021 Original Data', 'customers.csv')
customers_raw = pd.read_csv(customers_file)

## 01. Wrangling the data and completing the data quality check

In [9]:
# Exploring the data set: preview the first five rows of the customer table

customers_raw.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [55]:
# Dimensions of the customer table as (rows, columns)
customers_raw.shape

(206209, 10)

In [56]:
# Column names, non-null counts and dtypes of the customer table.
# NOTE(review): the saved output already lists the renamed columns, so this cell
# appears to have been executed after the renaming cell further down; re-run
# the notebook top to bottom to confirm that outputs and code line up.
customers_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   first_name    194950 non-null  object
 2   last_name     206209 non-null  object
 3   gender        206209 non-null  object
 4   state         206209 non-null  object
 5   age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
customers_raw.describe()

Unnamed: 0,user_id,Age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.5,1.5,94632.85
std,59527.56,18.48,1.12,42473.79
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [16]:
# Renaming columns to a consistent lower-case snake_case scheme.
# All renames are applied in a single call and the result is assigned back,
# rather than mutating the frame in place five separate times.

column_renames = {
    'First Name' : 'first_name',
    'Surnam' : 'last_name',
    'STATE' : 'state',
    'Age' : 'age',
    'Gender' : 'gender',
}
customers_raw = customers_raw.rename(columns = column_renames)

In [17]:
# Testing changes: the header should now show the renamed columns
customers_raw.head()

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [25]:
# Counting the missing values in each column

missing_values = customers_raw.isna().sum()
missing_values

user_id             0
first_name      11259
last_name           0
gender              0
state               0
age                 0
date_joined         0
n_dependants        0
fam_status          0
income              0
dtype: int64

In [24]:
# Using a boolean mask with .loc to select the rows where 'first_name' is null

first_name_is_null = customers_raw['first_name'].isnull()
null_first_names_df = customers_raw.loc[first_name_is_null]

# Keep only the 'user_id', 'last_name' and 'first_name' columns of those rows

id_and_name_cols = ['user_id', 'last_name', 'first_name']
result_table = null_first_names_df[id_and_name_cols]

print(result_table)

        user_id last_name first_name
53        76659   Gilbert        NaN
73        13738     Frost        NaN
82        89996    Dawson        NaN
99        96166   Oconnor        NaN
105       29778    Dawson        NaN
...         ...       ...        ...
206038   121317    Melton        NaN
206044   200799  Copeland        NaN
206090   167394     Frost        NaN
206162   187532     Floyd        NaN
206171   116898   Delgado        NaN

[11259 rows x 3 columns]


- The 'first_name' column has 11259 missing values.
- However, as long as user_id and last_name still uniquely identify each customer despite the missing names, it is possible to continue working with the data frame without taking any additional action.

In [26]:
# Looking for full duplicates (rows that repeat an earlier row in every column)

is_full_duplicate = customers_raw.duplicated()
df_dups = customers_raw[is_full_duplicate]
df_dups

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income


- No duplicates found.

In [28]:
# Looking for mixed-type data: print every column in which at least one value
# has a different Python type than the column's first value.
# Series.map is used because DataFrame.applymap is deprecated in recent pandas.

for col in customers_raw.columns:
  # Type of every value, taken after the same conversion for all rows so that
  # the first element is directly comparable with the rest
  col_types = customers_raw[col].map(type)
  mix_test = col_types != col_types.iloc[0]
  # .any() answers "is there a mismatch?" without building a filtered copy of the frame
  if mix_test.any():
    print (col)

first_name


In [30]:
# Changing the data type in the column 'first_name' to string.
# astype() returns a new Series, so the result must be assigned back;
# without the assignment the column keeps its mixed contents.
# NOTE(review): with object-dtype strings, missing names are expected to become
# the literal text 'nan' here; confirm this is acceptable (the column is
# dropped later in this notebook).

customers_raw['first_name'] = customers_raw['first_name'].astype('str')
customers_raw['first_name']

0          Deborah
1         Patricia
2          Kenneth
3         Michelle
4              Ann
            ...   
206204        Lisa
206205      Jeremy
206206       Doris
206207        Rose
206208     Cynthia
Name: first_name, Length: 206209, dtype: object

In [31]:
# Checking the data type of the 'first_name' column

customers_raw['first_name'].dtype

dtype('O')

#### Are there any columns that do not add anything to further analysis?

In [32]:
# Looking at the data to judge which columns are needed for further analysis

customers_raw.head()

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [35]:
# Explore the column 'date_joined': number of customers per join date

customers_raw['date_joined'].value_counts()

9/17/2018     213
2/10/2018     212
4/1/2019      211
9/21/2019     211
12/19/2017    210
             ... 
9/1/2018      141
1/22/2018     140
11/24/2017    139
7/18/2019     138
8/6/2018      128
Name: date_joined, Length: 1187, dtype: int64

In [36]:
# Count, number of distinct values, most frequent value and its frequency for 'date_joined'
customers_raw['date_joined'].describe()

count        206209
unique         1187
top       9/17/2018
freq            213
Name: date_joined, dtype: object

#### Conclusion

- For ethical reasons, clients' first and last names will not be used in the data analysis itself. It is assumed that the names are kept for personalized mailings to clients, so there is no need to remove them from the table for that reason alone. However, the 'first_name' column contains a large number of missing values, so it is removed from this dataset to keep the data clean.

In [39]:
# Removing the 'first_name' column; the result is a new frame and customers_raw is left as it is

customers_raw_new = customers_raw.drop(columns = ['first_name'])

# Checking the changes

customers_raw_new.head()

Unnamed: 0,user_id,last_name,gender,state,age,date_joined,n_dependants,fam_status,income
0,26711,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


## 02. Combining customer data with the latest Instacart data

In [37]:
# Import Instacart data from the '022 Prepared Data' folder
# NOTE: pickle files should only be loaded from trusted sources.

ords_prods_file = os.path.join(path, '02 Data', '022 Prepared Data', 'ords_prods_merge_agg.pkl')
ords_prods_merge = pd.read_pickle(ords_prods_file)

In [57]:
# Checking the data: preview the first five rows of the orders/products table

ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_price,spenders,median_days,frequency_flag
0,2539329,1,1,2,8,7.0,196,1,0,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Least busiest days,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Least busiest days,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Least busiest days,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Least busiest days,Most orders,10,New customer,6.37,Low spender,20.0,Regular customer


In [59]:
# Dropping the existing '_merge' column so that the customer merge below,
# which uses indicator = True, can create its own '_merge' column

df_ords_prods_merge = ords_prods_merge.drop('_merge', axis = 1)

# Checking the changes
df_ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_price,spenders,median_days,frequency_flag
0,2539329,1,1,2,8,7.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busiest days,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busiest days,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busiest days,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busiest days,Most orders,10,New customer,6.37,Low spender,20.0,Regular customer


In [64]:
# Checking the data type of the merge key 'user_id' in the orders/products dataset

df_ords_prods_merge['user_id'].dtype

dtype('int64')

In [61]:
# Checking the data type of the merge key 'user_id' in the customers dataset
# (it should match the dtype of the key in the orders/products dataset)

customers_raw_new['user_id'].dtype

dtype('int64')

In [62]:
# Checking the shape of the customers table after dropping 'first_name'

customers_raw_new.shape

(206209, 9)

In [65]:
# Checking whether both datasets contain the same number of distinct user_ids:
# value_counts() returns one entry per distinct user_id, so its length can be
# compared with the number of rows in the customers table

df_ords_prods_merge['user_id'].value_counts()

201268    3704
129928    3637
164055    3061
186704    2936
176478    2921
          ... 
188345       3
70320        3
203875       2
124615       2
91567        1
Name: user_id, Length: 206209, dtype: int64

In [66]:
# Merging the dfs: inner join on 'user_id', with '_merge' recording the source of each row

df_merged_all = pd.merge(
    df_ords_prods_merge,
    customers_raw_new,
    how = 'inner',
    on = 'user_id',
    indicator = True,
)
df_merged_all.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_price,spenders,median_days,frequency_flag,last_name,gender,state,age,date_joined,n_dependants,fam_status,income,_merge
0,2539329,1,1,2,8,7.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busiest days,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busiest days,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busiest days,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busiest days,Most orders,10,New customer,6.37,Low spender,20.0,Regular customer,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both


In [68]:
# Checking the shape of the merged data frame

df_merged_all.shape

(32404859, 32)

## 03. Additional testing

In [69]:
# Outer merging the dfs and counting the '_merge' flags:
# any 'left_only' / 'right_only' rows would reveal user_ids missing from one of the two tables

df_merged_all_outer = pd.merge(
    df_ords_prods_merge,
    customers_raw_new,
    how = 'outer',
    on = 'user_id',
    indicator = True,
)
df_merged_all_outer['_merge'].value_counts()

both          32404859
left_only            0
right_only           0
Name: _merge, dtype: int64

--> Full match: every row is flagged `both`, so each user_id is present in both data sets.

## 04. Exporting the new data frame

In [70]:
# Export the merged data frame to the prepared-data folder
# NOTE(review): df_merged_all still contains the '_merge' indicator column
# created by the merge; confirm that it is wanted in the exported file.

merged_all_file = os.path.join(path, '02 Data','022 Prepared Data', 'merged_all.pkl')
df_merged_all.to_pickle(merged_all_file)