# Exploratory Data Analysis

This notebook explores the cleaned e‑commerce dataset. It computes descriptive statistics, visualises relationships between variables and annotates observations. Run the cells in order.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Ensure charts render inline
%matplotlib inline

# Location of the cleaned customer table (relative path)
CLEAN_DATA_PATH = '../data/processed/customers_clean.csv'

# Read the CSV, converting purchase_date to datetimes while loading
df = pd.read_csv(
    CLEAN_DATA_PATH,
    parse_dates=['purchase_date'],
)

# Preview the top of the frame as a quick sanity check
df.head()

In [None]:
# Descriptive statistics across every column, not only the numeric ones
summary_stats = df.describe(include='all')
summary_stats

In [None]:
# Correlation heatmap
import numpy as np

metrics = ['total_spend','items_purchased','avg_rating','age','days_since_last_purchase']
# Pairwise correlation between the selected metric columns
corr = df[metrics].corr()

fig, ax = plt.subplots(figsize=(6, 4))
# Pin the colour scale to the full [-1, 1] range and use a diverging colormap so
# that zero correlation sits at the neutral midpoint and colours stay comparable
# from one run to the next (an auto-fitted scale stretches to the data range).
im = ax.imshow(corr.to_numpy(), interpolation='nearest', cmap='coolwarm', vmin=-1, vmax=1)
fig.colorbar(im, ax=ax)

ax.set_xticks(range(len(metrics)))
ax.set_xticklabels(metrics, rotation=45, ha='right')
ax.set_yticks(range(len(metrics)))
ax.set_yticklabels(metrics)

# Write each coefficient into its cell so exact values can be read off directly
for (row, col), value in np.ndenumerate(corr.to_numpy()):
    ax.text(col, row, f'{value:.2f}', ha='center', va='center', fontsize=8)

ax.set_title('Correlation Heatmap')
fig.tight_layout()
plt.show()

In [None]:
# Average spend by membership type
# Mean total_spend per membership_type; sort_index() orders the bars by tier label
avg_spend = df.groupby('membership_type')['total_spend'].mean().sort_index()

# Draw on a dedicated figure/axes, as the other plotting cells do, so the bars
# never land on axes left over from an earlier cell.
fig, ax = plt.subplots()
avg_spend.plot(kind='bar', ax=ax, rot=0)  # rot=0 keeps the tier labels horizontal
ax.set_ylabel('Average Spend')
ax.set_title('Average Spend by Membership Type')
fig.tight_layout()
plt.show()

On average, Gold members spend roughly 2–3× as much as Bronze members, as the bar chart above shows.

In [None]:
# Scatter of age against total spend, with one colour series per membership type
fig, ax = plt.subplots()
for membership, members in df.groupby('membership_type'):
    ax.scatter(
        members['age'],
        members['total_spend'],
        alpha=0.6,  # partial transparency so overlapping points remain visible
        label=membership,
    )
ax.set_xlabel('Age')
ax.set_ylabel('Total Spend')
ax.legend()
ax.set_title('Age vs Total Spend by Membership Type')
fig.tight_layout()
plt.show()

In [None]:
# Discount effect on spending and rating
# Mean spend and mean rating for each value of discount_applied
by_discount = df.groupby('discount_applied')
avg_spend_discount = by_discount['total_spend'].mean()
avg_rating_discount = by_discount['avg_rating'].mean()

# One bar chart per metric, each on its own figure: spend first, then rating
for series, y_label, chart_title in (
    (avg_spend_discount, 'Average Spend', 'Average Spend by Discount Status'),
    (avg_rating_discount, 'Average Rating', 'Average Rating by Discount Status'),
):
    fig, ax = plt.subplots()
    series.plot.bar(ax=ax, rot=0)  # rot=0 keeps the category labels horizontal
    ax.set_ylabel(y_label)
    ax.set_title(chart_title)
    fig.tight_layout()
    plt.show()

In [None]:
# Distribution of days since last purchase
# Drop missing values explicitly rather than relying on how the histogram
# routine treats NaN.
days_since = df['days_since_last_purchase'].dropna()

# Use a dedicated figure/axes, as the other plotting cells do, so the histogram
# is never drawn on axes left over from an earlier cell.
fig, ax = plt.subplots()
ax.hist(days_since, bins=20)
ax.set_title('Distribution of Days Since Last Purchase')
ax.set_xlabel('Days')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

In [None]:
# Boxplot of total spend by satisfaction level
# Use sorted, non-null levels for a stable left-to-right order. unique() alone
# returns first-appearance order and may include NaN, which matches no row
# under == and would produce an empty box labelled "nan".
levels = sorted(df['satisfaction_level'].dropna().unique())

# One spend sample per level. Missing spends are dropped because the percentiles
# of a sample containing NaN are themselves NaN, which leaves the box undrawn.
data = [
    df.loc[df['satisfaction_level'] == level, 'total_spend'].dropna()
    for level in levels
]

fig, ax = plt.subplots()
ax.boxplot(data)
# Label boxes through the axis ticks instead of boxplot(labels=...), which is
# deprecated in Matplotlib 3.9+; boxes are drawn at positions 1..N by default.
ax.set_xticks(range(1, len(levels) + 1))
ax.set_xticklabels(levels)
ax.set_title('Total Spend by Satisfaction Level')
ax.set_ylabel('Total Spend')
fig.tight_layout()
plt.show()

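The histogram of days since last purchase shows that many customers have not purchased recently, and the boxplot shows that dissatisfaction corresponds with higher variability in spending. Satisfied customers tend to repurchase sooner; note that none of the plots above shows this directly, so confirm it by comparing `days_since_last_purchase` across satisfaction levels.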