In [3]:
# Canadian Grocery Demand Forecasting - Data Generation

print("CANADIAN GROCERY DEMAND FORECASTING PROJECT")


# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, timedelta
import random
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

print("Libraries imported successfully!")
print(f"Project started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Python version: {pd.__version__} (Pandas)")

CANADIAN GROCERY DEMAND FORECASTING PROJECT
Libraries imported successfully!
Project started: 2025-09-05 13:24:29
Python version: 2.3.2 (Pandas)


In [8]:
def generate_canadian_grocery_data(n_stores=25, n_products=500, days=730):
    """
    Generate realistic Canadian grocery store dataset
    Includes Canadian-specific chains, provinces, and seasonal patterns
    """
    print("Generating Canadian grocery store dataset...")
    
    # Canadian store chains
    store_chains = ['Loblaws', 'Metro', 'Sobeys', 'FreshCo', 'No Frills', 'IGA', 'Thrifty Foods']
    provinces = ['ON', 'QC', 'BC', 'AB', 'MB', 'SK', 'NS', 'NB', 'NL', 'PE']
    
    # Canadian grocery categories with realistic products
    canadian_products = {
        'Dairy': {
            'products': ['Milk 1L', 'Milk 2L', 'Milk 4L', 'Cheese Cheddar 400g', 'Greek Yogurt 750g', 
                        'Butter 454g', 'Cream 35% 473ml', 'Cottage Cheese 500g'],
            'base_prices': [1.99, 3.49, 5.99, 6.99, 4.99, 4.49, 3.99, 3.49]
        },
        'Meat': {
            'products': ['Chicken Breast/kg', 'Ground Beef/kg', 'Pork Chops/kg', 'Salmon Fillet/kg', 
                        'Turkey Slices 175g', 'Bacon 375g', 'Ground Turkey/kg'],
            'base_prices': [13.99, 8.99, 11.99, 24.99, 4.99, 5.99, 10.99]
        },
        'Produce': {
            'products': ['Bananas/kg', 'Apples Gala/kg', 'Carrots 2lb bag', 'Potatoes 5lb bag', 
                        'Onions Yellow/kg', 'Broccoli Crown', 'Lettuce Iceberg'],
            'base_prices': [1.58, 4.99, 2.99, 3.99, 2.99, 2.49, 2.99]
        },
        'Bakery': {
            'products': ['White Bread 675g', 'Whole Wheat Bread 675g', 'Bagels 6pk', 'Croissants 4pk', 
                        'Muffins Blueberry 4pk', 'Dinner Rolls 8pk'],
            'base_prices': [2.99, 3.49, 3.99, 4.99, 4.49, 2.99]
        },
        'Frozen': {
            'products': ['Ice Cream 1L', 'Frozen Pizza 400g', 'Frozen Vegetables 750g', 'Frozen Berries 600g', 
                        'Frozen Fish Fillets 400g', 'Frozen French Fries 1kg'],
            'base_prices': [5.99, 4.99, 3.99, 6.99, 7.99, 3.49]
        },
        'Pantry': {
            'products': ['Rice Jasmine 1kg', 'Pasta 500g', 'Cereal 525g', 'Canned Tomatoes 796ml', 
                        'Olive Oil 500ml', 'Peanut Butter 1kg', 'Oats 1kg'],
            'base_prices': [3.99, 1.99, 4.99, 1.99, 7.99, 6.99, 4.49]
        }
    }
    
    # Generate stores
    print("Creating store data...")
    stores = []
    for i in range(n_stores):
        stores.append({
            'store_id': f'ST_{i+1:03d}',
            'chain': random.choice(store_chains),
            'province': random.choice(provinces),
            'city': f'City_{chr(65+i%26)}{i//26+1}',
            'store_size': random.choice(['Small', 'Medium', 'Large']),
            'population_density': random.choice(['Urban', 'Suburban', 'Rural'])
        })
    
    # Generate products
    print("Creating product catalog...")
    products = []
    product_id = 1
    
    for category, cat_data in canadian_products.items():
        for product_name, base_price in zip(cat_data['products'], cat_data['base_prices']):
            for brand in ['President\'s Choice', 'No Name', 'National Brand']:
                if product_id <= n_products:
                    products.append({
                        'product_id': f'PR_{product_id:04d}',
                        'product_name': f'{brand} {product_name}',
                        'category': category,
                        'brand': brand,
                        'base_price': base_price
                    })
                    product_id += 1
    
    # Generate sales transactions
    print("Generating sales transactions...")
    start_date = datetime(2022, 1, 1)
    transactions = []
    
    # Canadian holidays for seasonal effects
    canadian_holidays = {
        (1, 1): 2.0,   # New Year's Day
        (2, 14): 1.3,  # Valentine's Day
        (3, 17): 1.2,  # St. Patrick's Day
        (7, 1): 1.8,   # Canada Day
        (10, 31): 1.4, # Halloween
        (11, 11): 1.1, # Remembrance Day
        (12, 25): 2.5, # Christmas
        (12, 26): 2.2, # Boxing Day
    }
    
    for day in range(days):
        current_date = start_date + timedelta(days=day)
        month = current_date.month
        day_of_week = current_date.weekday()
        
        # Seasonal multipliers for Canadian climate
        seasonal_multipliers = {
            12: 1.6, 1: 1.3, 2: 1.0,  # Winter - higher demand
            3: 1.1, 4: 1.2, 5: 1.3,   # Spring - increasing
            6: 1.4, 7: 1.5, 8: 1.4,   # Summer - peak season
            9: 1.2, 10: 1.3, 11: 1.4  # Fall - holiday prep
        }
        
        weekend_multiplier = 1.4 if day_of_week >= 5 else 1.0
        seasonal_mult = seasonal_multipliers[month]
        holiday_mult = canadian_holidays.get((month, current_date.day), 1.0)
        
        # Generate transactions for random stores and products
        num_transactions = random.randint(50, 200)  # Variable daily activity
        
        for _ in range(num_transactions):
            store = random.choice(stores)
            product = random.choice(products[:min(len(products), n_products)])
            
            # Store size impact
            store_multiplier = {'Small': 0.7, 'Medium': 1.0, 'Large': 1.5}[store['store_size']]
            location_multiplier = {'Urban': 1.3, 'Suburban': 1.0, 'Rural': 0.8}[store['population_density']]
            
            # Base demand calculation
            base_demand = random.randint(1, 25)
            total_demand = int(base_demand * weekend_multiplier * seasonal_mult * 
                             holiday_mult * store_multiplier * location_multiplier)
            
            if total_demand > 0:
                # Price variations
                price_variation = random.uniform(0.9, 1.15)
                current_price = round(product['base_price'] * price_variation, 2)
                
                # Promotion logic
                is_promotion = random.random() < 0.12  # 12% chance
                if is_promotion:
                    current_price *= 0.8  # 20% discount
                    total_demand = int(total_demand * 1.25)  # 25% demand boost
                
                transactions.append({
                    'date': current_date.strftime('%Y-%m-%d'),
                    'store_id': store['store_id'],
                    'product_id': product['product_id'],
                    'product_name': product['product_name'],
                    'category': product['category'],
                    'brand': product['brand'],
                    'sales_quantity': total_demand,
                    'price': current_price,
                    'revenue': round(total_demand * current_price, 2),
                    'promotion_flag': int(is_promotion),
                    'chain': store['chain'],
                    'province': store['province'],
                    'store_size': store['store_size'],
                    'population_density': store['population_density']
                })
    
    # Convert to DataFrames
    sales_df = pd.DataFrame(transactions)
    stores_df = pd.DataFrame(stores)
    products_df = pd.DataFrame(products)
    
    print(f"Dataset generation complete!")
    print(f"Generated {len(sales_df):,} sales transactions")
    print(f"{len(stores_df)} stores across {stores_df['province'].nunique()} provinces")
    print(f"{len(products_df)} products in {products_df['category'].nunique()} categories")
    
    return sales_df, stores_df, products_df

# Generate the dataset
sales_df, stores_df, products_df = generate_canadian_grocery_data(
    n_stores=25, 
    n_products=300, 
    days=365*2  # 2 years of data
)

Generating Canadian grocery store dataset...
Creating store data...
Creating product catalog...
Generating sales transactions...
Dataset generation complete!
Generated 89,482 sales transactions
25 stores across 10 provinces
123 products in 6 categories


In [6]:
# DATA EXPLORATION AND VISUALIZATION

print("EXPLORING THE GENERATED DATASET")
print("="*50)

# Basic statistics
print(f"Sales Data Shape: {sales_df.shape}")
print(f"Date Range: {sales_df['date'].min()} to {sales_df['date'].max()}")
print(f"Total Revenue: ${sales_df['revenue'].sum():,.2f}")
print(f"Total Units Sold: {sales_df['sales_quantity'].sum():,}")

# Display sample data
print("\nSample Sales Data:")
display(sales_df.head(10))

print("\nStore Information:")
display(stores_df.head())

print("\nProduct Catalog Sample:")
display(products_df.head())

# Quick statistics
print("\nKEY STATISTICS:")
print(f"Average daily sales per store: {sales_df.groupby(['store_id', 'date'])['sales_quantity'].sum().mean():.1f} units")
print(f"Average order value: ${sales_df['revenue'].sum() / len(sales_df):.2f}")
print(f"Most popular category: {sales_df['category'].value_counts().index[0]}")
print(f"Top performing chain: {sales_df.groupby('chain')['revenue'].sum().idxmax()}")

EXPLORING THE GENERATED DATASET
Sales Data Shape: (89197, 14)
Date Range: 2022-01-01 to 2023-12-31
Total Revenue: $10,229,992.87
Total Units Sold: 1,840,703

Sample Sales Data:


Unnamed: 0,date,store_id,product_id,product_name,category,brand,sales_quantity,price,revenue,promotion_flag,chain,province,store_size,population_density
0,2022-01-01,ST_025,PR_0028,President's Choice Ground Beef/kg,Meat,President's Choice,68,7.072,480.9,1,IGA,PE,Medium,Rural
1,2022-01-01,ST_021,PR_0122,No Name Oats 1kg,Pantry,No Name,54,4.39,237.06,0,FreshCo,PE,Large,Suburban
2,2022-01-01,ST_005,PR_0099,National Brand Frozen Fish Fillets 400g,Frozen,National Brand,66,8.79,580.14,0,No Frills,MB,Small,Urban
3,2022-01-01,ST_013,PR_0102,National Brand Frozen French Fries 1kg,Frozen,National Brand,23,3.25,74.75,0,Loblaws,MB,Medium,Rural
4,2022-01-01,ST_016,PR_0066,National Brand Lettuce Iceberg,Produce,National Brand,156,2.9,452.4,0,Sobeys,BC,Large,Urban
5,2022-01-01,ST_007,PR_0105,National Brand Rice Jasmine 1kg,Pantry,National Brand,14,3.92,54.88,0,Sobeys,QC,Medium,Rural
6,2022-01-01,ST_006,PR_0061,President's Choice Broccoli Crown,Produce,President's Choice,76,2.26,171.76,0,FreshCo,AB,Large,Suburban
7,2022-01-01,ST_019,PR_0064,President's Choice Lettuce Iceberg,Produce,President's Choice,50,2.85,142.5,0,IGA,NL,Small,Suburban
8,2022-01-01,ST_008,PR_0005,No Name Milk 2L,Dairy,No Name,57,2.88,164.16,1,No Frills,PE,Small,Urban
9,2022-01-01,ST_017,PR_0070,President's Choice Whole Wheat Bread 675g,Bakery,President's Choice,76,3.35,254.6,0,FreshCo,NB,Medium,Suburban



Store Information:


Unnamed: 0,store_id,chain,province,city,store_size,population_density
0,ST_001,Loblaws,MB,City_A1,Medium,Rural
1,ST_002,FreshCo,NB,City_B1,Large,Rural
2,ST_003,Metro,ON,City_C1,Large,Suburban
3,ST_004,Metro,PE,City_D1,Small,Rural
4,ST_005,No Frills,MB,City_E1,Small,Urban



Product Catalog Sample:


Unnamed: 0,product_id,product_name,category,brand,base_price
0,PR_0001,President's Choice Milk 1L,Dairy,President's Choice,1.99
1,PR_0002,No Name Milk 1L,Dairy,No Name,1.99
2,PR_0003,National Brand Milk 1L,Dairy,National Brand,1.99
3,PR_0004,President's Choice Milk 2L,Dairy,President's Choice,3.49
4,PR_0005,No Name Milk 2L,Dairy,No Name,3.49



KEY STATISTICS:
Average daily sales per store: 103.3 units
Average order value: $114.69
Most popular category: Dairy
Top performing chain: No Frills


In [7]:
# 1. Daily sales trend
daily_sales = sales_df.groupby('date').agg({
    'sales_quantity': 'sum',
    'revenue': 'sum'
}).reset_index()
daily_sales['date'] = pd.to_datetime(daily_sales['date'])

fig1 = px.line(daily_sales, x='date', y='sales_quantity', 
               title='Daily Sales Volume Across All Stores',
               labels={'sales_quantity': 'Total Units Sold', 'date': 'Date'})
fig1.update_layout(showlegend=False, height=400)
fig1.show()

# 2. Revenue by category
category_revenue = sales_df.groupby('category')['revenue'].sum().sort_values(ascending=True)
fig2 = px.bar(x=category_revenue.values, y=category_revenue.index, 
              orientation='h',
              title='Total Revenue by Product Category',
              labels={'x': 'Revenue ($)', 'y': 'Category'})
fig2.update_layout(height=400)
fig2.show()

# 3. Provincial performance
province_stats = sales_df.groupby('province').agg({
    'revenue': 'sum',
    'sales_quantity': 'sum'
}).reset_index()

fig3 = px.scatter(province_stats, x='sales_quantity', y='revenue', 
                  text='province', size='revenue',
                  title='🗺️ Provincial Performance: Revenue vs Volume',
                  labels={'sales_quantity': 'Total Units Sold', 'revenue': 'Total Revenue ($)'})
fig3.update_traces(textposition="middle center")
fig3.update_layout(height=500)
fig3.show()

# 4. Seasonal patterns
sales_df['date'] = pd.to_datetime(sales_df['date'])
sales_df['month'] = sales_df['date'].dt.month
monthly_sales = sales_df.groupby('month')['sales_quantity'].sum()

fig4 = px.bar(x=monthly_sales.index, y=monthly_sales.values,
              title='Seasonal Sales Patterns (Monthly)',
              labels={'x': 'Month', 'y': 'Total Units Sold'})
fig4.update_layout(height=400)
fig4.show()

print("Visualizations complete! Dataset looks realistic with Canadian patterns.")

Visualizations complete! Dataset looks realistic with Canadian patterns.


In [10]:
# SAVEING GENERATED DATA

import os

# Create data directories
os.makedirs('data/raw', exist_ok=True)
os.makedirs('data/processed', exist_ok=True)

# Save the datasets
sales_df.to_csv('data/raw/sales_data.csv', index=False)
stores_df.to_csv('data/raw/stores_data.csv', index=False)
products_df.to_csv('data/raw/products_data.csv', index=False)

print("Data saved successfully!")
print("Files saved:")
print("   - data/raw/sales_data.csv")
print("   - data/raw/stores_data.csv") 
print("   - data/raw/products_data.csv")

print(f"\nDATA GENERATION COMPLETE!")


Data saved successfully!
Files saved:
   - data/raw/sales_data.csv
   - data/raw/stores_data.csv
   - data/raw/products_data.csv

DATA GENERATION COMPLETE!
