### Create a new notebook, import the necessary analysis and visualization libraries, then import your most up-to-date project data

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# Define the project root folder.
# When the INSTACART_PROJECT_DIR environment variable is set it takes
# precedence, so the notebook can run on another machine without editing this
# cell; otherwise the original local folder is used unchanged.
path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'C:\Users\Lukman\OneDrive\Documents\FEMINA CF\Data Immerssion\Achievement4\Instacart Basket Analysis',
)

In [3]:
# Load the most recent merged project dataset (a pickle in the Prepared Data folder)
merged_pickle_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_customers_merged.pkl')
instacart = pd.read_pickle(merged_pickle_file)

###  Provide the Instacart senior stakeholders with descriptive findings about sales. Create a histogram of the “order_hour_of_day” column

In [None]:

# Histogram of how many rows fall in each hour of the day.
# Bin edges sit at -0.5, 0.5, ..., 23.5 so every bar is centred on its integer
# hour tick. Passing bins=24 instead splits the observed range into 24 equal
# slices whose edges drift away from the ticks.
# NOTE(review): assumes 'order_hour_of_day' holds integer hours 0-23, as the
# tick range below implies; values outside that range would not be drawn.
hour_bin_edges = np.arange(25) - 0.5

plt.figure(figsize=(10, 6))
plt.hist(instacart['order_hour_of_day'], bins=hour_bin_edges, edgecolor='black')
plt.title('Histogram of Order Hour of Day')
plt.xlabel('Hour of the Day')
plt.ylabel('Number of Orders')
plt.xticks(range(24))
plt.grid(True)

# Save while the figure is still current, i.e. before plt.show()
plt.savefig('order_hour_of_day_histogram.png')
plt.show()

### The marketing team is curious about the distribution of orders among customers in terms of loyalty. Create a bar chart from the “loyalty_flag” column

In [None]:

# Count how many rows carry each distinct 'loyalty_flag' value
loyalty_flag_counts = instacart['loyalty_flag'].value_counts()

# Draw the bar chart on an explicit Axes and keep that Axes under the name
# `bar`: the export cell at the end of the notebook calls
# bar.figure.savefig(...), which raises NameError unless `bar` exists.
fig_loyalty, bar = plt.subplots(figsize=(10, 6))
bar.bar(loyalty_flag_counts.index, loyalty_flag_counts.values, edgecolor='black')
bar.set_title('Loyalty Flag Distribution')
bar.set_xlabel('Loyalty Flag')
bar.set_ylabel('Count')
bar.grid(True)

# Save the figure
fig_loyalty.savefig('loyalty_flag_distribution.png')

# Display the plot
plt.show()

### Check whether there’s a difference in expenditure (the “prices” column) depending on the hour of the day

In [4]:
# Fix NumPy's global random state so the random split that follows is repeatable
np.random.seed(seed=4)

In [5]:
# Boolean mask for the 70/30 split: one uniform draw per row,
# True wherever the draw is at most 0.7
row_count = len(instacart)
dev = np.random.rand(row_count) <= 0.7

In [6]:
# Partition the rows with the mask: True rows form `big`, the remaining rows form `small`
big, small = instacart[dev], instacart[~dev]

In [7]:
# Row count of the full dataframe, to be compared with the two samples combined
instacart.shape[0]

32404859

In [8]:
# Combined row count of both samples; should equal the full dataframe's row count
big.shape[0] + small.shape[0]

32404859

In [9]:
# From the small sample, keep just the two columns the price-by-hour chart uses
hour_price_columns = ['order_hour_of_day', 'prices']
instacart_2 = small[hour_price_columns]

In [10]:
# Preview the first five rows (head's default) to confirm the subset looks right
instacart_2.head()

Unnamed: 0,order_hour_of_day,prices
0,11,5.800781
2,21,5.800781
3,21,9.0
6,17,5.800781
10,20,5.800781


In [None]:
# Line chart of prices across the hours of the day. With seaborn's default
# settings the line is the mean of 'prices' at each hour, with a shaded
# confidence band around it.
line = sns.lineplot(data=instacart_2, x='order_hour_of_day', y='prices')

# Title, axis labels, one tick per hour and a grid so the chart can be read on
# its own, matching the other charts in this notebook.
line.set_title('Prices by Order Hour of Day')
line.set_xlabel('Hour of the Day')
line.set_ylabel('Average Price')
line.set_xticks(range(24))
line.grid(True)


### Determine whether there’s a connection between age and family situation by creating a line chart exploring the connections between age and number of dependents

In [None]:
# From the small sample, keep just the two columns the age/dependants chart uses
age_dependant_columns = ['age', 'n_dependants']
instacart_3 = small[age_dependant_columns]

In [None]:
# Preview the first five rows (head's default) to confirm the subset looks right
instacart_3.head()

In [None]:
# Create a line chart of number of dependants against age
plt.figure(figsize=(12, 6))
line_2 = sns.lineplot(data=instacart_3, x='age', y='n_dependants')
line_2.set_title('age_vs_n_dependants')
line_2.set_xlabel('age')

# Label and grid THIS chart's Axes (line_2). The price-by-hour chart lives in
# `line`; styling that object here would relabel the price chart's y-axis as
# 'n_dependants' and leave this chart without a y-label or grid.
line_2.set_ylabel('n_dependants')
line_2.grid(True)
# No set_xticks(range(24)) here: 0-23 are hour-of-day positions and do not
# describe an age axis, so the tick locations are left to matplotlib.

# Save the figure (before plt.show(), while it is still the current figure)
plt.savefig('age_vs_n_dependants')

# Display the plot
plt.show()

The line chart shows no clear relationship between age and number of dependants.

### Explore whether there’s a connection between age and spending power (income) using a scatterplot

In [None]:
# From the small sample, keep just the two columns the age/income scatterplot uses
age_income_columns = ['age', 'income']
instacart_4 = small[age_income_columns]

In [None]:
# Scatterplot of income (y) against age (x) for the sampled rows
scatterplot = sns.scatterplot(data=instacart_4, x='age', y='income')

### Export your visualizations as “.png” files in your relevant project folder

In [None]:
# Export the order-hour histogram to the project's Visualizations folder.
# A bare plt.savefig() in a cell of its own writes an empty canvas: the
# histogram figure was already shown in its original cell, so pyplot's
# "current figure" here is a new blank one. The chart is therefore redrawn on
# an explicit figure and that figure is saved.
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
# Half-integer bin edges keep each bar centred on its hour tick.
hist_ax.hist(instacart['order_hour_of_day'], bins=np.arange(25) - 0.5, edgecolor='black')
hist_ax.set_title('Histogram of Order Hour of Day')
hist_ax.set_xlabel('Hour of the Day')
hist_ax.set_ylabel('Number of Orders')
hist_ax.set_xticks(range(24))
hist_ax.grid(True)
hist_fig.savefig(os.path.join(path, '04 Analysis','Visualizations', 'hist_order_hour_of_day_frequency.png'))
# The histogram is already displayed earlier in the notebook; close this copy
# so it is not rendered a second time.
plt.close(hist_fig)

In [None]:
# Write the loyalty bar chart (held in `bar`) into the project's Visualizations folder
loyalty_png = os.path.join(path, '04 Analysis','Visualizations', 'loyalty_flag_distribution.png')
bar.figure.savefig(loyalty_png)

In [None]:
# Write the price-by-hour line chart (held in `line`) into the project's Visualizations folder
price_by_hour_png = os.path.join(path, '04 Analysis','Visualizations', 'change_in_price_over_hours_of_the_day.png')
line.figure.savefig(price_by_hour_png)

In [None]:
# Write the age/dependants line chart (held in `line_2`) into the project's Visualizations folder
age_dependants_png = os.path.join(path, '04 Analysis','Visualizations', 'age_vs_n_dependants.png')
line_2.figure.savefig(age_dependants_png)

In [None]:
# Write the age/income scatterplot (held in `scatterplot`) into the project's Visualizations folder
age_income_png = os.path.join(path, '04 Analysis','Visualizations', 'change_in_spending_power_over_age.png')
scatterplot.figure.savefig(age_income_png)