In [1]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the dataset
df = pd.read_csv('purchase_data.csv')

# Preview the dataset
print("Dataset Preview:")
print(df.head())
print("\nDataset Info:")
print(df.info())

# Handle missing values
df.dropna(inplace=True)
print("\nMissing Values After Dropping Rows with NaN:")
print(df.isnull().sum())

# Convert data types
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

# Filter out invalid transactions
df = df[df['Quantity'] > 0]
df = df[df['UnitPrice'] > 0]

# Generate a Month-Year column for monthly analysis
df['MonthYear'] = df['InvoiceDate'].dt.to_period('M')

# 1. Monthly fluctuations in total revenue and transactions
monthly_data = df.groupby('MonthYear').agg(
    TotalRevenue=('TotalPrice', 'sum'),
    TransactionCount=('InvoiceNo', 'nunique')
).reset_index()

# Plot monthly revenue and transaction count
fig, ax1 = plt.subplots(figsize=(12, 6))

ax2 = ax1.twinx()
sns.lineplot(x='MonthYear', y='TotalRevenue', data=monthly_data, ax=ax1, label='Total Revenue', color='blue')
sns.lineplot(x='MonthYear', y='TransactionCount', data=monthly_data, ax=ax2, label='Transactions', color='orange')

ax1.set_ylabel('Total Revenue')
ax2.set_ylabel('Number of Transactions')
plt.title('Monthly Revenue and Transactions Over Time')
plt.show()

# 2. Product categories with highest and growing revenues
category_data = df.groupby('Description').agg(
    TotalRevenue=('TotalPrice', 'sum'),
    MonthlyRevenue=('TotalPrice', 'mean')
).sort_values(by='TotalRevenue', ascending=False).reset_index()

# Plot top 10 categories by revenue
top_categories = category_data.head(10)
sns.barplot(x='TotalRevenue', y='Description', data=top_categories, palette='viridis')
plt.title('Top 10 Product Categories by Total Revenue')
plt.show()

# Trend analysis for top categories
for category in top_categories['Description']:
    category_trend = df[df['Description'] == category].groupby('MonthYear')['TotalPrice'].sum().reset_index()
    sns.lineplot(x='MonthYear', y='TotalPrice', data=category_trend, label=category)

plt.title('Revenue Trends for Top Product Categories')
plt.legend()
plt.show()

# 3. Seasonal sales variations
df['Month'] = df['InvoiceDate'].dt.month
seasonal_data = df.groupby(['Month', 'Description']).agg(
    MonthlyRevenue=('TotalPrice', 'sum')
).reset_index()

# Plot seasonal patterns for top categories
for category in top_categories['Description']:
    seasonal_trend = seasonal_data[seasonal_data['Description'] == category]
    sns.lineplot(x='Month', y='MonthlyRevenue', data=seasonal_trend, label=category)

plt.title('Seasonal Sales Variations by Category')
plt.xlabel('Month')
plt.ylabel('Revenue')
plt.legend()
plt.show()

# 4. Customer purchasing behavior
customer_behavior = df.groupby('CustomerID').agg(
    TotalRevenue=('TotalPrice', 'sum'),
    AverageOrderValue=('TotalPrice', 'mean'),
    NumberOfTransactions=('InvoiceNo', 'nunique')
).reset_index()

# Identify customers with multiple purchases
repeat_customers = customer_behavior[customer_behavior['NumberOfTransactions'] > 1]

# Plot distribution of average order values
sns.histplot(repeat_customers['AverageOrderValue'], bins=20, kde=True, color='purple')
plt.title('Distribution of Average Order Values (Repeat Customers)')
plt.show()

# Recency-Frequency-Monetary Analysis (RFM)
rfm = df.groupby('CustomerID').agg(
    Recency=('InvoiceDate', lambda x: (df['InvoiceDate'].max() - x.max()).days),
    Frequency=('InvoiceNo', 'nunique'),
    MonetaryValue=('TotalPrice', 'sum')
).reset_index()

sns.scatterplot(x='Frequency', y='MonetaryValue', hue='Recency', data=rfm, palette='coolwarm')
plt.title('Customer Segmentation Based on RFM')
plt.show()

# Summary and recommendations
print("\nSummary of Insights:")
print("- Monthly revenue trends show significant seasonal variations in sales.")
print("- Top product categories contribute disproportionately to revenue.")
print("- Seasonal trends indicate time-sensitive products that could benefit from targeted promotions.")
print("- RFM analysis reveals customer segments for personalized marketing strategies.")


Dataset Preview:
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

        InvoiceDate  UnitPrice  CustomerID         Country  
0  01/12/2010 08:26       2.55     17850.0  United Kingdom  
1  01/12/2010 08:26       3.39     17850.0  United Kingdom  
2  01/12/2010 08:26       2.75     17850.0  United Kingdom  
3  01/12/2010 08:26       3.39     17850.0  United Kingdom  
4  01/12/2010 08:26       3.39     17850.0  United Kingdom  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------     

ValueError: time data "13/12/2010 09:02" doesn't match format "%m/%d/%Y %H:%M", at position 927. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.