In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# Set path
# The project root can be overridden with the INSTACART_PATH environment
# variable so the notebook runs on other machines; the fallback is the
# original local folder, so existing runs behave exactly as before.
path = os.environ.get('INSTACART_PATH', '/Users/josephadamski/Instacart Basket Analysis')

# Import merged data from Part 1
# NOTE(review): read_pickle can execute arbitrary code while unpickling —
# only load files produced by this project's own Part 1 notebook.
prepared_file = os.path.join(path, 'Data', 'Prepared Data', 'ords_prods_customers.pkl')
df = pd.read_pickle(prepared_file)

# Verify data
print("Dataframe shape:", df.shape)
print("\nColumns:", df.columns.tolist())
# NOTE(review): the preview includes the 'First Name' and 'Surnam' columns —
# clear this output before sharing if the customer data is not synthetic.
df.head()

Matplotlib is building the font cache; this may take a moment.


Dataframe shape: (32404859, 34)

Columns: ['product_id', 'product_name', 'aisle_id', 'department_id', 'prices', 'order_id', 'user_id', 'order_number', 'orders_day_of_week', 'order_hour_of_day', 'days_since_prior_order', 'first_order_flag', 'add_to_cart_order', 'reordered', '_merge', 'price_label', 'busiest_day', 'busiest_days', 'busiest_period_of_day', 'max_order', 'loyalty_flag', 'avg_price', 'spending_flag', 'median_days_prior', 'order_frequency_flag', 'First Name', 'Surnam', 'Gender', 'STATE', 'Age', 'date_joined', 'n_dependants', 'fam_status', 'income']


Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,...,order_frequency_flag,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,...,Frequent customer,Charles,Cox,Male,Minnesota,81,8/1/2019,1,married,49620
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,...,Frequent customer,Charles,Cox,Male,Minnesota,81,8/1/2019,1,married,49620
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,...,Frequent customer,Deborah,Glass,Female,Vermont,66,6/16/2018,2,married,158302
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,...,Frequent customer,Heather,Myers,Female,Wisconsin,40,2/9/2020,3,married,31308
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,...,Frequent customer,Heather,Myers,Female,Wisconsin,40,2/9/2020,3,married,31308


In [3]:
# Folder that receives every exported figure from this notebook
viz_path = os.path.join(path, 'Analysis', 'Visualizations')

# exist_ok=True makes this a no-op when the folder is already present
os.makedirs(viz_path, exist_ok=True)

save_location_msg = f"Visualizations will be saved to: {viz_path}"
print(save_location_msg)

Visualizations will be saved to: /Users/josephadamski/Instacart Basket Analysis/Analysis/Visualizations


In [4]:
# Create histogram
# Draw on an explicit figure/axes so that labelling, export and display all
# act on the same object instead of relying on pyplot's "current figure".
fig, ax = plt.subplots()
hour_hist = df['order_hour_of_day'].plot.hist(
    bins=24, color='steelblue', edgecolor='black', ax=ax
)

# Customize
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Orders by Hour of Day')

# Export
fig.savefig(os.path.join(viz_path, 'order_hour_histogram.png'), dpi=300, bbox_inches='tight')

# Show the chart in the notebook, then release it. The previous plt.clf()
# wiped the figure before it was rendered, so the cell displayed an empty
# "<Figure ... with 0 Axes>" instead of the histogram.
plt.show()
plt.close(fig)

print("✓ Order hour histogram created and saved")

✓ Order hour histogram created and saved


<Figure size 640x480 with 0 Axes>

### Histogram Analysis: Order Hour Distribution

The histogram reveals a clear pattern in customer ordering behavior. Orders are heavily 
concentrated during daytime hours, with peak activity occurring between 10 AM and 3 PM 
and the highest frequency around 10-11 AM. Order volume drops dramatically during the 
early morning hours (midnight to 7 AM), with minimal activity between 2 AM and 6 AM. This 
suggests customers primarily shop during traditional waking hours, with mid-morning being 
the most popular time for grocery ordering.

In [5]:
# Create bar chart
# value_counts() tallies dataframe rows per loyalty segment.
# NOTE(review): each row looks like one product within an order (the frame has
# product_id / add_to_cart_order columns), so the bars count ordered items
# rather than distinct orders — confirm the 'Number of Orders' label is intended.
loyalty_counts = df['loyalty_flag'].value_counts()

# Explicit figure/axes keep labelling, export and display on the same object
fig, ax = plt.subplots()
loyalty_bar = loyalty_counts.plot.bar(color='coral', edgecolor='black', ax=ax)

# Customize
ax.set_xlabel('Customer Type')
ax.set_ylabel('Number of Orders')
ax.set_title('Distribution of Orders by Customer Loyalty')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Export
fig.savefig(os.path.join(viz_path, 'loyalty_distribution_bar.png'), dpi=300, bbox_inches='tight')

# Show the chart in the notebook, then release it. The previous plt.clf()
# cleared the figure before it was rendered, leaving an empty figure as output.
plt.show()
plt.close(fig)

print("✓ Loyalty bar chart created and saved")

✓ Loyalty bar chart created and saved


<Figure size 640x480 with 0 Axes>

### Customer Loyalty Distribution

Regular customers represent the largest segment with approximately 16 million rows, 
followed by Loyal customers with about 10 million and New customers with roughly 
6 million. Each row of the merged data is one product within an order, so these counts 
(which sum to the 32.4 million rows in the dataframe) are ordered items rather than 
distinct orders. This distribution indicates that the majority of Instacart's business 
comes from established, returning customers rather than new acquisitions.

In [None]:
# Reproducible 70/30 random split of the full dataframe.
# Seeding the global NumPy generator fixes which rows land in each part.
np.random.seed(4)

# Boolean mask: True (~70% of rows) -> `big`, False (~30%) -> `small`
dev = np.random.rand(df.shape[0]) <= 0.7

# Partition the rows with the mask and its complement
big, small = df[dev], df[~dev]

# Report the size of each part and confirm together they cover every row
n_big, n_small = len(big), len(small)
print(f"Big sample: {n_big:,} rows")
print(f"Small sample: {n_small:,} rows")
print(f"Total: {n_big + n_small:,} rows")

# Create subset with needed columns
# Only the 30% sample is plotted: seaborn aggregates y per x value and draws a
# confidence band, which is costly on the full dataframe.
df_sample = small[['order_hour_of_day', 'prices']]

# Create line chart on its own figure/axes so it cannot share a canvas with
# any other chart drawn in this cell
fig, ax = plt.subplots()
price_hour_line = sns.lineplot(data=df_sample, x='order_hour_of_day', y='prices', ax=ax)

# Customize
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Average Price ($)')
ax.set_title('Price Trends by Hour of Day')

# Export
fig.savefig(os.path.join(viz_path, 'price_by_hour_line.png'), dpi=300, bbox_inches='tight')

# Show the chart in the notebook, then release it. plt.clf() would blank the
# figure before it is rendered, so the notebook would display an empty figure.
plt.show()
plt.close(fig)

print("✓ Price by hour line chart created and saved")

# Subset of the 30% sample with the two columns for the age/dependants chart
# NOTE(review): rows are order lines, so customers with more purchases weigh
# more heavily in the per-age average — confirm that weighting is intended.
df_sample2 = small[['Age', 'n_dependants']]

# Create line chart on its own figure/axes so it does not reuse a canvas
# left over from an earlier chart in this cell
fig, ax = plt.subplots()
age_dep_line = sns.lineplot(data=df_sample2, x='Age', y='n_dependants', ax=ax)

# Customize
ax.set_xlabel('Age')
ax.set_ylabel('Number of Dependents')
ax.set_title('Relationship Between Age and Number of Dependents')

# Export
fig.savefig(os.path.join(viz_path, 'age_dependents_line.png'), dpi=300, bbox_inches='tight')

# Show the chart in the notebook, then release it. plt.clf() would blank the
# figure before it is rendered, so the notebook would display an empty figure.
plt.show()
plt.close(fig)

print("✓ Age vs dependents line chart created and saved")

### Price Trends by Hour of Day

The line chart shows interesting pricing patterns throughout the day. There's a notable 
spike in average prices during the early morning hours (around 2-3 AM), reaching over $14, 
followed by a sharp decline. Prices then stabilize around $11-13 throughout the rest of 
the day with relatively consistent patterns. The high variability (shown by the shaded 
confidence interval) in early morning hours may be due to fewer orders during that time, 
making averages less stable.

In [10]:
# Age and income describe the customer, not the individual order line, so keep
# one row per user_id before plotting. The set of (Age, income) positions is
# unchanged, but the chart no longer stacks one marker per purchased item
# across ~32M rows, which is slow and makes alpha blending meaningless.
# Assumes Age/income are constant within a user_id (they come from the
# customer table merged in Part 1) — TODO confirm.
customers = df[['user_id', 'Age', 'income']].drop_duplicates(subset='user_id')

# Explicit figure/axes keep labelling, export and display on the same object
fig, ax = plt.subplots()
age_income_scatter = sns.scatterplot(data=customers, x='Age', y='income', alpha=0.5, ax=ax)

# Customize
ax.set_xlabel('Age')
ax.set_ylabel('Income ($)')
ax.set_title('Relationship Between Age and Income')

# Export
fig.savefig(os.path.join(viz_path, 'age_income_scatter.png'), dpi=300, bbox_inches='tight')

# Show the chart in the notebook, then release it. The previous plt.clf()
# cleared the figure before it was rendered, leaving an empty figure as output.
plt.show()
plt.close(fig)

print("✓ Age vs income scatterplot created and saved")

✓ Age vs income scatterplot created and saved


<Figure size 640x480 with 0 Axes>

### Relationship Between Age and Number of Dependents

The line chart (exported as `age_dependents_line.png`) shows that the average number of 
dependents remains relatively stable across all age groups, hovering around 1.5 dependents 
with minor fluctuations. There is no clear linear relationship between age and number of 
dependents. The data suggests that the number of dependents is fairly consistent regardless 
of customer age, though there appear to be slight peaks around ages 20, 60, and 65, which 
may correspond to different family life stages.

### Relationship Between Age and Income

The scatterplot demonstrates no strong correlation between age and income levels. Income 
appears widely distributed across all age ranges, from approximately $25,000 to $600,000. 
The densest concentration of data points falls in the lower income ranges ($25,000-$300,000) 
across all ages. While there's a slight tendency for higher income values to appear in 
middle age ranges (40-65), the relationship is weak and inconsistent. This suggests that 
age alone is not a strong predictor of income for Instacart customers.