# Advanced Assignment: E-commerce Data Analysis Project

This notebook contains the solution to the E-commerce Data Analysis Project. It demonstrates advanced pandas operations on synthetic, randomly generated data that models a real-world e-commerce scenario.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools  # Added import for itertools

# Set random seed for reproducibility
np.random.seed(42)

# Set plot style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8' and
# later releases dropped the old 'seaborn' alias, so use the new name when it
# is available and fall back to the legacy name on older installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

## 1. Data Loading and Cleaning

In [None]:
# Generate sample data.
# The generator is re-seeded here so that re-running this cell on its own
# rebuilds exactly the same tables. The random draws below happen in a fixed
# order (category, price, customer_id, product_id, quantity, page_views,
# unique_visitors); keep that order to keep the data stable.
np.random.seed(42)

N_CUSTOMERS = 1000
N_PRODUCTS = 100
N_ORDERS = 10000

customer_ids = range(1, N_CUSTOMERS + 1)
customers = pd.DataFrame({
    'customer_id': customer_ids,
    'name': [f'Customer_{i}' for i in customer_ids],
    'email': [f'customer_{i}@example.com' for i in customer_ids],
    'registration_date': pd.date_range(start='2022-01-01', periods=N_CUSTOMERS)
})

product_ids = range(1, N_PRODUCTS + 1)
products = pd.DataFrame({
    'product_id': product_ids,
    'name': [f'Product_{i}' for i in product_ids],
    'category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Home & Garden'], N_PRODUCTS),
    'price': np.random.uniform(10, 1000, N_PRODUCTS).round(2)
})

orders = pd.DataFrame({
    'order_id': range(1, N_ORDERS + 1),
    'customer_id': np.random.choice(customers['customer_id'], N_ORDERS),
    # Evenly spaced timestamps across the two years, so values carry a time of day
    'order_date': pd.date_range(start='2022-01-01', end='2023-12-31', periods=N_ORDERS),
    'product_id': np.random.choice(products['product_id'], N_ORDERS),
    'quantity': np.random.randint(1, 10, N_ORDERS)
})

# One traffic row per calendar day; the row count is taken from the date index
# so the random columns can never fall out of step with the date range.
traffic_dates = pd.date_range(start='2022-01-01', end='2023-12-31')
website_traffic = pd.DataFrame({
    'date': traffic_dates,
    'page_views': np.random.randint(1000, 10000, len(traffic_dates)),
    'unique_visitors': np.random.randint(500, 5000, len(traffic_dates))
})

# Preview the first rows of every table, each under its own heading
sample_previews = [
    ("Customers:", customers),
    ("\nProducts:", products),
    ("\nOrders:", orders),
    ("\nWebsite Traffic:", website_traffic),
]
for heading, table in sample_previews:
    print(heading)
    print(table.head())

# Report per-column null counts, one table after another in the same order
print("\nMissing values:")
for _, table in sample_previews:
    print(table.isnull().sum())

# Make sure every date column holds pandas datetime values
date_columns = [
    (customers, 'registration_date'),
    (orders, 'order_date'),
    (website_traffic, 'date'),
]
for table, column in date_columns:
    table[column] = pd.to_datetime(table[column])

## 2. Customer Analysis

In [None]:
# Customer lifetime value: price each order line, then total the revenue per customer
order_values = (
    orders
    .merge(products[['product_id', 'price']], on='product_id')
    .assign(total_value=lambda df: df['quantity'] * df['price'])
)
customer_ltv = (
    order_values
    .groupby('customer_id', as_index=False)['total_value']
    .sum()
    .rename(columns={'total_value': 'lifetime_value'})
)

# Top 10% of customers, ranked by how many orders they placed
purchase_frequency = orders.groupby('customer_id').size().reset_index(name='order_count')
top_decile_size = int(len(purchase_frequency) * 0.1)
top_customers = purchase_frequency.nlargest(n=top_decile_size, columns='order_count')

# New registrations per calendar month, keyed by the month's starting timestamp
registration_month = customers['registration_date'].dt.to_period('M')
registration_trends = (
    customers
    .groupby(registration_month)
    .size()
    .reset_index(name='new_customers')
    .assign(registration_date=lambda df: df['registration_date'].dt.to_timestamp())
)

# Line chart of monthly registrations
registration_fig, registration_ax = plt.subplots(figsize=(12, 4))
registration_ax.plot(
    registration_trends['registration_date'],
    registration_trends['new_customers'],
)
registration_ax.set_title('Customer Registration Trends')
registration_ax.set_xlabel('Date')
registration_ax.set_ylabel('New Customers')
plt.show()

# Ten highest lifetime values
highest_ltv = customer_ltv.nlargest(10, 'lifetime_value')
print("Top 10 customers by lifetime value:")
print(highest_ltv)

# First ten rows of the top-decile frequency table
most_frequent = top_customers.head(10)
print("\nTop 10 customers by purchase frequency:")
print(most_frequent)

## 3. Product Analysis

In [None]:
# Attach product attributes to every order line and price it
product_sales = (
    orders
    .merge(products, on='product_id')
    .assign(revenue=lambda df: df['quantity'] * df['price'])
)

# Ten best sellers, once by units sold and once by revenue
sales_by_product = product_sales.groupby('name')
best_selling_quantity = sales_by_product['quantity'].sum().nlargest(10)
best_selling_revenue = sales_by_product['revenue'].sum().nlargest(10)

# Mean revenue per order line within each category, highest first
avg_order_value = (
    product_sales
    .groupby('category')['revenue']
    .mean()
    .sort_values(ascending=False)
)

# Market basket analysis (simplified)
def get_product_pairs(group):
    """Return every unordered pair of distinct products present in ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one basket; must contain a ``product_id`` column.

    Returns
    -------
    list of tuple
        All 2-combinations of the distinct ``product_id`` values. The ids are
        sorted first, so a given pair always appears as ``(smaller, larger)``
        and is counted once no matter in which order the rows arrive. The list
        is empty when the group holds fewer than two distinct products.
    """
    distinct_products = sorted(group['product_id'].unique())
    return list(itertools.combinations(distinct_products, 2))

# Count how often each product pair shares an order and keep the ten most common.
# Only 'product_id' is passed to the helper, which keeps the grouping column out
# of apply() (pandas >= 2.2 warns when apply operates on grouping columns).
# NOTE: every order_id in the generated `orders` table is a single row, so each
# group holds one product and this ranking is empty for the sample data.
product_pairs = (
    orders
    .groupby('order_id')[['product_id']]
    .apply(get_product_pairs)
    .explode()
    .value_counts()
    .nlargest(10)
)

# Side-by-side bar charts of the two best-seller rankings
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
ranking_panels = [
    (ax1, best_selling_quantity, 'Best-selling Products by Quantity', 'Total Quantity Sold'),
    (ax2, best_selling_revenue, 'Best-selling Products by Revenue', 'Total Revenue'),
]
for axis, ranking, panel_title, y_label in ranking_panels:
    ranking.plot(kind='bar', ax=axis)
    axis.set_title(panel_title)
    axis.set_ylabel(y_label)
plt.tight_layout()
plt.show()

# Text output: category averages first, then the pair ranking
print("Average Order Value by Category:")
print(avg_order_value)

print("\nTop 10 Product Pairs:")
print(product_pairs)

## 4. Order Trends

In [None]:
# Analyze daily, weekly, and monthly order trends.

# order_date carries a time of day, so grouping on the raw column would yield
# one group per distinct timestamp instead of one per day. Normalizing to
# midnight first gives true per-day counts, and the resulting 'order_date'
# values line up with calendar dates when joined to daily tables later on.
order_day = orders['order_date'].dt.normalize()
daily_orders = orders.groupby(order_day).size().reset_index(name='order_count')


def _orders_per_period(freq):
    """Count orders per period of pandas frequency ``freq`` (e.g. 'W', 'M').

    Returns a DataFrame with 'order_date' (the period's start timestamp) and
    'order_count' columns.
    """
    period_counts = (
        orders
        .groupby(orders['order_date'].dt.to_period(freq))
        .size()
        .reset_index(name='order_count')
    )
    period_counts['order_date'] = period_counts['order_date'].dt.to_timestamp()
    return period_counts


weekly_orders = _orders_per_period('W')
monthly_orders = _orders_per_period('M')

# Average gap, in whole days, between consecutive orders of the same customer
customer_orders = (
    orders
    .sort_values(['customer_id', 'order_date'])
    .assign(
        # Previous order of the same customer (NaT for a customer's first order)
        prev_order_date=lambda df: df.groupby('customer_id')['order_date'].shift(1),
        days_between_orders=lambda df: (df['order_date'] - df['prev_order_date']).dt.days,
    )
)
gaps_by_customer = customer_orders.groupby('customer_id')['days_between_orders']
avg_time_between_orders = gaps_by_customer.mean()

# Seasonality: tag each order with its calendar month (1-12), then count per month
orders['month'] = orders['order_date'].dt.month
monthly_pattern = (
    orders
    .groupby('month', as_index=False)
    .size()
    .rename(columns={'size': 'order_count'})
)

# Three stacked line charts: daily, weekly and monthly order counts
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(12, 15))
trend_panels = [
    (ax1, daily_orders, 'Daily Order Trends'),
    (ax2, weekly_orders, 'Weekly Order Trends'),
    (ax3, monthly_orders, 'Monthly Order Trends'),
]
for axis, trend, panel_title in trend_panels:
    axis.plot(trend['order_date'], trend['order_count'])
    axis.set_title(panel_title)
    axis.set_xlabel('Date')
    axis.set_ylabel('Order Count')

plt.tight_layout()
plt.show()

# The ten customers with the largest average gap between orders
longest_gaps = avg_time_between_orders.nlargest(10)
print("Average time between orders (top 10 customers):")
print(longest_gaps)

# Bar chart of order counts per calendar month
pattern_fig, pattern_ax = plt.subplots(figsize=(10, 5))
sns.barplot(x='month', y='order_count', data=monthly_pattern, ax=pattern_ax)
pattern_ax.set_title('Monthly Order Pattern')
pattern_ax.set_xlabel('Month')
pattern_ax.set_ylabel('Order Count')
plt.show()

## 5. Website Traffic Analysis

In [None]:
# Join each traffic day to its order count; days with no matching row count as zero
traffic_orders = (
    website_traffic
    .merge(daily_orders, left_on='date', right_on='order_date', how='left')
    .assign(order_count=lambda df: df['order_count'].fillna(0))
)

# Pairwise correlation between the two traffic measures and order volume
metric_columns = ['page_views', 'unique_visitors', 'order_count']
correlation = traffic_orders[metric_columns].corr()

# Conversion rate: orders per unique visitor
traffic_orders['conversion_rate'] = traffic_orders['order_count'].div(traffic_orders['unique_visitors'])

# The ten days with the most page views
peak_traffic = traffic_orders.nlargest(10, 'page_views')

# Two stacked charts: raw traffic on top, conversion rate below
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

traffic_series = [('page_views', 'Page Views'), ('unique_visitors', 'Unique Visitors')]
for column, legend_label in traffic_series:
    ax1.plot(traffic_orders['date'], traffic_orders[column], label=legend_label)
ax1.set_title('Website Traffic Over Time')
ax1.set_xlabel('Date')
ax1.set_ylabel('Count')
ax1.legend()

ax2.plot(traffic_orders['date'], traffic_orders['conversion_rate'])
ax2.set_title('Conversion Rate Over Time')
ax2.set_xlabel('Date')
ax2.set_ylabel('Conversion Rate')

plt.tight_layout()
plt.show()

# Text output: correlation matrix, then the busiest days
print("Correlation between traffic and orders:")
print(correlation)

peak_columns = ['date', 'page_views', 'unique_visitors', 'order_count']
print("\nPeak traffic times:")
print(peak_traffic[peak_columns])

## 6. Cohort Analysis

In [None]:
# Prepare data for cohort analysis: tag every order with the customer's
# registration month (the cohort) and the month the order was placed.
cohort_data = orders.merge(customers[['customer_id', 'registration_date']], on='customer_id')
cohort_data['cohort'] = cohort_data['registration_date'].dt.to_period('M')
cohort_data['order_month'] = cohort_data['order_date'].dt.to_period('M')

# Whole months elapsed between registration month and order month, computed
# with vectorized year/month arithmetic rather than a per-row apply.
cohort_data['cohort_index'] = (
    (cohort_data['order_month'].dt.year - cohort_data['cohort'].dt.year) * 12
    + (cohort_data['order_month'].dt.month - cohort_data['cohort'].dt.month)
)

# Create cohort table: distinct active customers and order count per
# (cohort, months-since-registration) cell.
cohort_table = (
    cohort_data
    .groupby(['cohort', 'cohort_index'])
    .agg(customers=('customer_id', 'nunique'), orders=('order_id', 'count'))
    .reset_index()
)

# Baseline per cohort = customer count in the cohort's first row, i.e. its
# smallest cohort_index (groupby output is sorted by cohort, cohort_index).
# NOTE: the generated order dates are drawn independently of registration
# dates, so cohort_index can be negative and the baseline is then the earliest
# observed month rather than month 0.
cohort_size = cohort_table.groupby('cohort')['customers'].first()

# transform() returns the baseline aligned row-for-row with cohort_table.
# Dividing by `cohort_size` directly would align its cohort-keyed index against
# cohort_table's positional index and produce NaN in every row.
baseline_customers = cohort_table.groupby('cohort')['customers'].transform('first')
cohort_table['customer_retention'] = cohort_table['customers'] / baseline_customers

# Pivot table for visualization: one row per cohort, one column per month offset
cohort_pivot = cohort_table.pivot(index='cohort', columns='cohort_index', values='customer_retention')

# Heatmap of retention by cohort (rows) and months since registration (columns)
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(cohort_pivot, annot=True, cmap='YlGnBu', fmt='.2%', ax=heatmap_ax)
heatmap_ax.set_title('Customer Cohort Analysis - Retention Rates')
heatmap_ax.set_xlabel('Cohort Index (Months)')
heatmap_ax.set_ylabel('Cohort (Registration Month)')
plt.show()

# Text preview limited to the first 5 cohorts and the first 5 month columns
cohort_preview = cohort_pivot.iloc[:5, :5]
print("Cohort Analysis Summary:")
print(cohort_preview)

## 7. Reporting and Visualization

In [None]:
# Headline metrics collected from the earlier sections, label -> value
summary_metrics = {
    'Total Customers': len(customers),
    'Total Orders': len(orders),
    'Total Revenue': order_values['total_value'].sum(),
    'Average Order Value': order_values['total_value'].mean(),
    'Best Selling Product': best_selling_quantity.index[0],
    'Most Revenue Product': best_selling_revenue.index[0],
    'Average Time Between Orders (days)': avg_time_between_orders.mean(),
    'Average Conversion Rate': traffic_orders['conversion_rate'].mean(),
}

# Single-row frame with one column per metric; printed transposed so the
# metrics read top to bottom
summary = pd.DataFrame({label: [value] for label, value in summary_metrics.items()})

print("E-commerce Data Analysis Summary:")
print(summary.T)

# Three-panel overview figure
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))

# Panel 1: share of total revenue contributed by each category
revenue_by_category = product_sales.groupby('category')['revenue'].sum()
revenue_by_category.plot(kind='pie', autopct='%1.1f%%', ax=ax1)
ax1.set_title('Revenue by Category')

# Panel 2: order count per month
monthly_orders.plot(x='order_date', y='order_count', ax=ax2)
ax2.set_title('Monthly Order Trend')
ax2.set_xlabel('Date')
ax2.set_ylabel('Order Count')

# Panel 3: histogram of customer lifetime value with a density curve
lifetime_values = customer_ltv['lifetime_value']
sns.histplot(lifetime_values, kde=True, ax=ax3)
ax3.set_title('Customer Distribution by Lifetime Value')
ax3.set_xlabel('Lifetime Value')
ax3.set_ylabel('Count')

plt.tight_layout()
plt.show()

## Conclusion and Insights

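The data in this notebook is randomly generated, so the points below illustrate the kinds of insights this analysis is designed to surface rather than findings from real data. Applied to real e-commerce data, the analysis supports conclusions such as the following: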

1. Customer Behavior:
   - The top 10% of customers contribute significantly to overall revenue.
   - Customer registration shows seasonal patterns, with peaks during certain months.
   - There's a wide range in customer lifetime values, suggesting opportunities for targeted marketing.

2. Product Performance:
   - Certain products consistently outperform others in both quantity sold and revenue generated.
   - There are clear differences in average order value across product categories.
   - Some product pairs are frequently bought together, indicating potential for bundle offers.

3. Order Trends:
   - Order volume shows both weekly and monthly seasonality.
   - The average time between orders varies significantly among customers.
   - Certain months consistently show higher order volumes, suggesting seasonal demand.

4. Website Traffic and Conversion:
   - There's a positive correlation between website traffic and order volume.
   - Conversion rates fluctuate over time, with some periods showing notably higher rates.
   - Peak traffic times don't always correspond to peak order times, indicating potential for optimization.

5. Cohort Analysis:
   - Customer retention rates vary across cohorts and over time.
   - Some cohorts show better long-term retention than others, warranting further investigation into onboarding and engagement strategies.

Recommendations:
1. Implement targeted marketing campaigns for high-value customers and strategies to increase the lifetime value of others.
2. Optimize inventory based on product performance and consider creating bundles for frequently co-purchased items.
3. Prepare for seasonal fluctuations in demand and adjust marketing efforts accordingly.
4. Investigate factors contributing to higher conversion rates during peak periods and apply these insights to improve overall conversion.
5. Develop strategies to improve customer retention, especially focusing on cohorts with lower long-term retention rates.

This analysis provides a solid foundation for data-driven decision-making in various aspects of the e-commerce business, from marketing and sales to inventory management and customer service.