# Dataset Creation Progression

## 1. Objective
The objective is to develop a synthetic dataset that simulates demand patterns for a variety of products, incorporating key factors that influence demand, such as economic indicators, stock levels, and promotional activities.

## 2. Data Features
The dataset includes the following features:

- **Product Features**:
  - `product_id`: Unique identifier for each product.
  - `current_stock`: Current inventory levels for each product.
  - `max_stock`: Maximum stock level for each product.
  - `reorder_point`: Stock level at which new orders should be placed.
  - `lead_time`: Time taken to restock the product.

- **Demand Features**:
  - `demand`: Actual demand for the product.
  - `demand_rolling_mean`: Rolling mean of demand over a specified window.
  - `demand_rolling_std`: Rolling standard deviation of demand over a specified window.
  - `demand_lag_1`, `demand_lag_2`, `demand_lag_3`: Lagged demand values for time series analysis.

- **Temporal Features**:
  - `date`: Timestamp for each record.
  - `day_of_week`: Day of the week extracted from the date.
  - `month`: Month extracted from the date.
  - `year`: Year extracted from the date.
  - `quarter`: Quarter of the year.

- **Economic Indicators**:
  - `unemployment_rate`: Current unemployment rate.
  - `consumer_confidence`: Index of consumer confidence.
  - `inflation_rate`: Rate of inflation.

- **Promotional and Seasonal Indicators**:
  - `is_promotion`: Indicator of whether the product is on promotion.
  - `is_holiday`: Indicator of whether the date falls on a holiday.
  - `is_peak_season`: Indicator of whether the date falls within peak demand season.

## 3. Data Generation Steps
### Step 1: Define Parameters
Set the initial parameters, including the number of products, time period, and the range of values for economic indicators and demand patterns.

```python
num_products = 10
num_days = 365
```

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# Seed NumPy's global random generator with a fixed value so the random draws
# made later in this cell are the same on every run.
np.random.seed(42)

# Function to create a synthetic dataset
def create_synthetic_demand_data(start_date, end_date, num_products):
    """Generate daily synthetic demand for a set of products.

    Demand for each product is the sum of a product-specific base level, a
    yearly sinusoidal seasonal term, a linear yearly trend relative to 2019,
    a random 0-or-5 price effect and Gaussian noise.  The sum is rounded,
    floored at zero, and then boosted by 30 units on the listed holidays.

    Args:
        start_date: First day of the series (anything ``pd.date_range``
            accepts).
        end_date: Last day of the series, inclusive.
        num_products: Number of products to simulate; IDs are ``P001``,
            ``P002``, ...

    Returns:
        A DataFrame with columns ``date``, ``product_id``, ``demand``,
        ``lagged_demand`` (previous day's demand) and ``moving_average``
        (7-day rolling mean).  Rows containing NaN in the lag or rolling
        columns (the first six days of each product) are dropped.  Index
        labels are unique across products.  The frame is empty when
        ``num_products`` is less than 1.
    """
    columns = ['date', 'product_id', 'demand', 'lagged_demand', 'moving_average']
    date_range = pd.date_range(start=start_date, end=end_date)
    num_days = len(date_range)

    # Everything below depends only on the calendar, not on the product, so
    # it is computed once instead of once per product.
    holidays = pd.to_datetime([
        f'{year}-{month_day}'
        for year in range(2019, 2025)
        for month_day in ('01-01', '08-15', '12-25')
    ])
    is_holiday = date_range.isin(holidays)
    day_of_year = date_range.dayofyear.to_numpy()
    years_since_2019 = date_range.year.to_numpy() - 2019
    seasonal_effect = 20 * np.sin(2 * np.pi * (day_of_year - 1) / 365)

    data = []
    for product_id in range(1, num_products + 1):
        base_demand = 50 + 10 * product_id
        trend = 0.05 * years_since_2019 * base_demand
        price_effect = 5 * (np.random.rand(num_days) > 0.5)

        # The original code drew an "economic index" here that never entered
        # the demand formula or the output.  The draw is kept (and discarded)
        # so that seeded runs still produce the same demand values.
        np.random.normal(0, 1, num_days)

        noise = 10 * np.random.normal(0, 1, num_days)
        demand = base_demand + seasonal_effect + trend + price_effect + noise

        # Round and floor at zero first; the holiday boost is added afterwards.
        demand = np.maximum(0, np.round(demand))
        demand[is_holiday] += 30

        df = pd.DataFrame({
            'date': date_range,
            'product_id': f'P{product_id:03}',
            'demand': demand,
        })
        df['lagged_demand'] = df['demand'].shift(1)
        df['moving_average'] = df['demand'].rolling(window=7).mean()
        data.append(df)

    if not data:
        # pd.concat raises on an empty list; return an empty, typed-by-name frame.
        return pd.DataFrame(columns=columns)

    # ignore_index avoids every product repeating the labels 0..num_days-1.
    synthetic_data = pd.concat(data, ignore_index=True)

    # Drop the leading rows where the lag / rolling window is incomplete.
    synthetic_data.dropna(inplace=True)

    return synthetic_data

# Generate daily demand from 2019-01-01 through 2024-12-31 for ten products
synthetic_demand_data = create_synthetic_demand_data(
    start_date="2019-01-01",
    end_date="2024-12-31",
    num_products=10,
)

# Write the dataset to disk without the DataFrame index column
synthetic_demand_data.to_csv("synthetic_demand_data.csv", index=False)

# Preview the leading rows
print(synthetic_demand_data.head())


         date product_id  demand  lagged_demand  moving_average
6  2019-01-07       P001    78.0           52.0       70.857143
7  2019-01-08       P001    64.0           78.0       66.571429
8  2019-01-09       P001    81.0           64.0       68.285714
9  2019-01-10       P001    54.0           81.0       67.428571
10 2019-01-11       P001    69.0           54.0       68.000000


good dataset


In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# Seed NumPy's global random generator with a fixed value so the random draws
# made later in this cell are the same on every run.
np.random.seed(42)

def create_synthetic_demand_data(start_date, end_date, num_products):
    """Generate daily demand, restock and stock-level records per product.

    For every product and day a demand value is built from a base level, a
    yearly sinusoidal term, a linear trend relative to 2019, a random 0-or-5
    price effect and Gaussian noise, plus 30 units on listed holidays.  Each
    row also gets a random restock amount and a random economic index.

    Args:
        start_date: First day of the series (anything ``pd.date_range``
            accepts).
        end_date: Last day of the series, inclusive.
        num_products: Number of products to simulate; IDs are ``P001``,
            ``P002``, ...

    Returns:
        A DataFrame sorted by ``date`` then ``product_id`` with columns
        ``date``, ``product_id``, ``demand``, ``initial_stock``,
        ``restock_amount``, ``economic_index``, ``lagged_demand``,
        ``moving_average`` and ``current_stock``.  ``current_stock`` is the
        per-product running balance ``max(0, previous + restock - demand)``
        starting from ``initial_stock``.  The frame is empty (but has all
        columns) when no rows are generated.
    """
    base_columns = [
        'date', 'product_id', 'demand',
        'initial_stock', 'restock_amount', 'economic_index',
    ]
    derived_columns = ['lagged_demand', 'moving_average', 'current_stock']

    date_range = pd.date_range(start=start_date, end=end_date)

    # A set gives constant-time membership tests inside the per-day loop.
    # NOTE(review): no 2019 dates are listed although the notebook generates
    # data starting in 2019 -- confirm whether that is intended.
    holidays = set(pd.to_datetime([
        '2024-01-01', '2024-12-25', '2023-01-01', '2023-12-25',
        '2022-01-01', '2022-12-25', '2021-01-01', '2021-12-25',
        '2020-01-01', '2020-12-25'
    ]))

    all_data = []
    for product_id in range(1, num_products + 1):
        # Per-product constants, hoisted out of the per-day loop.
        base_demand = 50 + 10 * product_id
        initial_stock = 100 + product_id * 10
        product_label = f'P{product_id:03}'

        for date in date_range:
            seasonal_effect = 20 * np.sin(2 * np.pi * date.dayofyear / 365)
            trend = 0.05 * (date.year - 2019) * base_demand
            # The order of the random draws is kept as in the original so
            # that seeded runs generate the same values.
            price_effect = 5 if np.random.rand() > 0.5 else 0
            noise = 10 * np.random.normal()

            demand = base_demand + seasonal_effect + trend + price_effect + noise
            if date in holidays:
                demand += 30
            demand = max(0, demand)

            restock_amount = np.random.randint(10, 50)
            economic_index = np.random.normal()

            all_data.append({
                'date': date,
                'product_id': product_label,
                'demand': round(demand),
                'initial_stock': initial_stock,
                'restock_amount': restock_amount,
                'economic_index': economic_index
            })

    df = pd.DataFrame(all_data, columns=base_columns)
    if df.empty:
        # Nothing to sort or aggregate; return the full schema with no rows.
        return df.reindex(columns=base_columns + derived_columns)

    df = df.sort_values(['date', 'product_id']).reset_index(drop=True)

    # Lag and rolling features are computed within each product.
    df['lagged_demand'] = df.groupby('product_id')['demand'].shift(1)
    df['moving_average'] = df.groupby('product_id')['demand'].transform(
        lambda x: x.rolling(window=7, min_periods=1).mean()
    )

    # Running stock per product.  Results are written back through each
    # group's own row labels: the frame is ordered by date, so a product's
    # rows are interleaved with the other products' rows and a positional
    # assignment would put the values on the wrong rows.
    df['current_stock'] = 0
    for _, group in df.groupby('product_id'):
        stock = group['initial_stock'].iloc[0]
        levels = []
        for restock, sold in zip(group['restock_amount'], group['demand']):
            stock = max(0, stock + restock - sold)
            levels.append(stock)
        df.loc[group.index, 'current_stock'] = levels

    return df

# Build the dataset: daily records from 2019-01-01 to 2024-12-31, ten products
synthetic_demand_data = create_synthetic_demand_data(
    start_date="2019-01-01",
    end_date="2024-12-31",
    num_products=10,
)

# Write the dataset to disk without the DataFrame index column
synthetic_demand_data.to_csv("synthetic_demand_data.csv", index=False)

# Preview the leading rows, then report the dimensions and column names
print(synthetic_demand_data.head())
print("\nDataset shape:", synthetic_demand_data.shape)
print("\nColumns:", list(synthetic_demand_data.columns))

        date product_id  demand  initial_stock  restock_amount  \
0 2019-01-01       P001      49            110              28   
1 2019-01-01       P002      74            120              40   
2 2019-01-01       P003      88            130              14   
3 2019-01-01       P004      78            140              33   
4 2019-01-01       P005     107            150              36   

   economic_index  lagged_demand  moving_average current_stock  
0        0.318902            NaN            49.0            89  
1        0.837870            NaN            74.0            97  
2        0.352741            NaN            88.0            66  
3       -1.129739            NaN            78.0            36  
4        0.044834            NaN           107.0             0  

Dataset shape: (21920, 9)

Columns: ['date', 'product_id', 'demand', 'initial_stock', 'restock_amount', 'economic_index', 'lagged_demand', 'moving_average', 'current_stock']


In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# Seed NumPy's global random generator with a fixed value so the random draws
# made later in this cell are the same on every run.
np.random.seed(42)

def create_synthetic_demand_data(start_date, end_date, num_products):
    """Generate daily demand, restock and stock-level records per product.

    Demand is a base level plus a yearly sinusoidal term, a linear trend
    relative to 2019, a random 0-or-5 price effect, Gaussian noise and a
    30-unit boost on listed holidays; it is rounded and floored at zero.
    The stock level is then simulated from that rounded demand so that the
    ``demand``, ``restock_amount`` and ``current_stock`` columns agree.

    Args:
        start_date: First day of the series (anything ``pd.date_range``
            accepts).
        end_date: Last day of the series, inclusive.
        num_products: Number of products to simulate; IDs are ``P001``,
            ``P002``, ...

    Returns:
        A DataFrame with columns ``date``, ``product_id``, ``demand``,
        ``initial_stock``, ``restock_amount``, ``current_stock``,
        ``economic_index``, ``lagged_demand`` and ``moving_average``.  Rows
        with NaN (the first day of each product, which has no lag) are
        dropped.  The frame is empty when ``num_products`` is less than 1.
    """
    columns = [
        'date', 'product_id', 'demand', 'initial_stock', 'restock_amount',
        'current_stock', 'economic_index', 'lagged_demand', 'moving_average',
    ]
    date_range = pd.date_range(start=start_date, end=end_date)
    num_days = len(date_range)

    # NOTE(review): no 2019 dates are listed although the notebook generates
    # data starting in 2019 -- confirm whether that is intended.
    holidays = pd.to_datetime([
        '2024-01-01', '2024-01-26', '2024-03-25', '2024-08-15',
        '2024-10-02', '2024-11-12', '2024-12-25', '2023-01-01',
        '2023-01-26', '2023-03-22', '2023-08-15', '2023-10-02',
        '2023-11-12', '2023-12-25', '2022-01-01', '2022-01-26',
        '2022-03-01', '2022-08-15', '2022-10-02', '2022-11-12',
        '2022-12-25', '2021-01-01', '2021-01-26', '2021-03-11',
        '2021-08-15', '2021-10-02', '2021-11-12', '2021-12-25',
        '2020-01-01', '2020-01-26', '2020-03-02', '2020-08-15',
        '2020-10-02', '2020-11-12', '2020-12-25'
    ])

    # Calendar-only terms are identical for every product: compute them once.
    day_of_year = date_range.dayofyear.to_numpy()
    years_since_2019 = date_range.year.to_numpy() - 2019
    seasonal_effect = 20 * np.sin(2 * np.pi * day_of_year / 365)
    holiday_boost = np.where(date_range.isin(holidays), 30.0, 0.0)

    all_data = []
    for product_id in range(1, num_products + 1):
        base_demand = 50 + 10 * product_id

        # Random draws are made in the same order as the original code so
        # that seeded runs generate the same underlying values.
        trend = 0.05 * years_since_2019 * base_demand
        price_effect = 5 * (np.random.rand(num_days) > 0.5)
        noise = 10 * np.random.normal(0, 1, num_days)
        economic_index = np.random.normal(0, 1, num_days)

        demand = base_demand + seasonal_effect + trend + price_effect + noise
        demand += holiday_boost

        # Round before simulating stock so the stock balance is computed
        # from exactly the demand values that are reported.
        demand = np.maximum(0, np.round(demand))

        initial_stock = 100 + product_id * 10
        restock_amount = np.random.randint(10, 50, num_days)

        # Running balance.  The carried value itself is clamped at zero;
        # otherwise unmet demand accumulates as negative stock and the
        # reported level stays at zero regardless of later restocks.
        current_stock = np.zeros(num_days)
        stock = initial_stock
        for i, (restock, sold) in enumerate(zip(restock_amount, demand)):
            stock = max(0, stock + restock - sold)
            current_stock[i] = stock

        all_data.append(pd.DataFrame({
            'date': date_range,
            'product_id': f'P{product_id:03}',
            'demand': demand,
            'initial_stock': initial_stock,
            'restock_amount': restock_amount,
            'current_stock': current_stock,
            'economic_index': economic_index
        }))

    if not all_data:
        # pd.concat raises on an empty list; return the schema with no rows.
        return pd.DataFrame(columns=columns)

    synthetic_data = pd.concat(all_data, ignore_index=True)

    # Lag and rolling features are computed within each product.
    synthetic_data['lagged_demand'] = synthetic_data.groupby('product_id')['demand'].shift(1)
    synthetic_data['moving_average'] = synthetic_data.groupby('product_id')['demand'].transform(
        lambda x: x.rolling(window=7, min_periods=1).mean()
    )

    # Only the first day of each product has a NaN (no previous-day demand).
    synthetic_data.dropna(inplace=True)

    return synthetic_data

# Build the dataset: daily records from 2019-01-01 to 2024-12-31, ten products
synthetic_demand_data = create_synthetic_demand_data(
    start_date="2019-01-01",
    end_date="2024-12-31",
    num_products=10,
)

# Write the dataset to disk without the DataFrame index column
synthetic_demand_data.to_csv("synthetic_demand_data_2.csv", index=False)

# Preview the leading rows, then report the dimensions and column names
print(synthetic_demand_data.head())
print("\nDataset shape:", synthetic_demand_data.shape)
print("\nColumns:", list(synthetic_demand_data.columns))

# Summary statistics of the numeric columns
print("\nBasic statistics:")
print(synthetic_demand_data.describe())

        date product_id  demand  initial_stock  restock_amount  current_stock  \
1 2019-01-02       P001    85.0            110              10            0.0   
2 2019-01-03       P001    65.0            110              49            0.0   
3 2019-01-04       P001    57.0            110              41            0.0   
4 2019-01-05       P001    73.0            110              31            0.0   
5 2019-01-06       P001    61.0            110              38            0.0   

   economic_index  lagged_demand  moving_average  
1        0.379640           67.0       76.000000  
2       -0.556119           85.0       72.333333  
3       -0.130060           65.0       68.500000  
4        1.669070           57.0       69.400000  
5       -0.942558           73.0       68.000000  

Dataset shape: (21910, 9)

Columns: ['date', 'product_id', 'demand', 'initial_stock', 'restock_amount', 'current_stock', 'economic_index', 'lagged_demand', 'moving_average']

Basic statistics:
             

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# Seed NumPy's global random generator with a fixed value so the random draws
# made later in this cell are the same on every run.
np.random.seed(42)

def _simulate_inventory(demand, opening_stock, reorder_point, max_stock, lead_time):
    """Simulate day-by-day stock with an order-up-to-``max_stock`` policy.

    Day 0 only records ``opening_stock``.  On each later day, any order due
    that day is received, demand is served from what is available, and a new
    order is placed when stock is at or below ``reorder_point`` and no order
    is still in transit.  An order placed on day ``i`` arrives on day
    ``i + lead_time``; orders that would arrive after the last day are not
    placed.

    ``demand`` is modified in place: on a stock-out day it is reduced to the
    quantity that could actually be served.

    Returns:
        ``(current_stock, restock_amount, stockout_days)`` arrays, where
        ``restock_amount[i]`` is the quantity ordered on day ``i``.
    """
    num_days = len(demand)
    current_stock = np.zeros(num_days)
    restock_amount = np.zeros(num_days)
    stockout_days = np.zeros(num_days)
    arrivals = np.zeros(num_days)  # arrivals[d] = quantity delivered on day d
    if num_days == 0:
        return current_stock, restock_amount, stockout_days

    current_stock[0] = opening_stock
    for i in range(1, num_days):
        available = current_stock[i - 1] + arrivals[i]

        if available >= demand[i]:
            current_stock[i] = available - demand[i]
        else:
            stockout_days[i] = 1
            demand[i] = available  # only what was on hand could be served
            current_stock[i] = 0

        arrival_day = i + lead_time
        # Orders still in transit are exactly those scheduled for a future
        # day within the lead-time horizon.
        order_in_transit = arrivals[i + 1:arrival_day + 1].any()
        if (current_stock[i] <= reorder_point
                and not order_in_transit
                and arrival_day < num_days):
            order_amount = max_stock - current_stock[i]
            arrivals[arrival_day] = order_amount
            restock_amount[i] = order_amount

    return current_stock, restock_amount, stockout_days


def create_synthetic_demand_data(start_date, end_date, num_products):
    """Generate demand and inventory records with lag/rolling features.

    Demand is a base level plus a yearly sinusoidal term, a linear trend
    relative to 2019, a random 0-or-5 price effect, Gaussian noise and a
    30-unit boost on listed holidays; it is rounded and floored at zero.
    Inventory follows a reorder-point policy with a random 1-3 day lead time
    per product (see ``_simulate_inventory``).

    Args:
        start_date: First day of the series (anything ``pd.date_range``
            accepts).
        end_date: Last day of the series, inclusive.
        num_products: Number of products to simulate; IDs are ``P001``,
            ``P002``, ...

    Returns:
        A DataFrame with the per-day columns (``date``, ``product_id``,
        ``demand``, ``current_stock``, ``restock_amount``, ``stockout_days``,
        ``economic_index``, ``reorder_point``, ``lead_time``, ``max_stock``),
        calendar columns, demand/stock lags of 1, 7, 14 and 28 days, rolling
        means over 7, 14 and 28 days, ``stock_demand_ratio`` and
        ``days_until_stockout``.  Rows with NaN (the first 28 days of each
        product, which lack the longest lag) are dropped.
    """
    date_range = pd.date_range(start=start_date, end=end_date)
    num_days = len(date_range)

    # NOTE(review): no 2019 dates are listed although the notebook generates
    # data starting in 2019 -- confirm whether that is intended.
    holidays = pd.to_datetime([
        '2024-01-01', '2024-01-26', '2024-03-25', '2024-08-15',
        '2024-10-02', '2024-11-12', '2024-12-25', '2023-01-01',
        '2023-01-26', '2023-03-22', '2023-08-15', '2023-10-02',
        '2023-11-12', '2023-12-25', '2022-01-01', '2022-01-26',
        '2022-03-01', '2022-08-15', '2022-10-02', '2022-11-12',
        '2022-12-25', '2021-01-01', '2021-01-26', '2021-03-11',
        '2021-08-15', '2021-10-02', '2021-11-12', '2021-12-25',
        '2020-01-01', '2020-01-26', '2020-03-02', '2020-08-15',
        '2020-10-02', '2020-11-12', '2020-12-25'
    ])

    # Calendar-only terms are identical for every product: compute them once.
    day_of_year = date_range.dayofyear.to_numpy()
    years_since_2019 = date_range.year.to_numpy() - 2019
    seasonal_effect = 20 * np.sin(2 * np.pi * day_of_year / 365)
    holiday_boost = np.where(date_range.isin(holidays), 30.0, 0.0)

    all_data = []
    for product_id in range(1, num_products + 1):
        base_demand = 50 + 10 * product_id

        # Product-specific inventory policy.
        reorder_point = base_demand * 3  # 3 days of base demand
        max_stock = base_demand * 7      # 7 days of base demand
        lead_time = np.random.randint(1, 4)  # 1-3 days

        # Random draws are made in the same order as the original code.
        trend = 0.05 * years_since_2019 * base_demand
        price_effect = 5 * (np.random.rand(num_days) > 0.5)
        noise = 10 * np.random.normal(0, 1, num_days)
        economic_index = np.random.normal(0, 1, num_days)

        demand = base_demand + seasonal_effect + trend + price_effect + noise
        demand += holiday_boost
        demand = np.maximum(0, np.round(demand))

        opening_stock = np.random.randint(reorder_point, max_stock)
        current_stock, restock_amount, stockout_days = _simulate_inventory(
            demand, opening_stock, reorder_point, max_stock, lead_time
        )

        all_data.append(pd.DataFrame({
            'date': date_range,
            'product_id': f'P{product_id:03}',
            'demand': demand,
            'current_stock': current_stock,
            'restock_amount': restock_amount,
            'stockout_days': stockout_days,
            'economic_index': economic_index,
            'reorder_point': reorder_point,
            'lead_time': lead_time,
            'max_stock': max_stock
        }))

    synthetic_data = pd.concat(all_data, ignore_index=True)

    # Calendar features.
    synthetic_data['day_of_week'] = synthetic_data['date'].dt.dayofweek
    synthetic_data['month'] = synthetic_data['date'].dt.month
    synthetic_data['year'] = synthetic_data['date'].dt.year

    # Lagged features, computed within each product.
    for lag in [1, 7, 14, 28]:
        synthetic_data[f'demand_lag_{lag}'] = synthetic_data.groupby('product_id')['demand'].shift(lag)
        synthetic_data[f'stock_lag_{lag}'] = synthetic_data.groupby('product_id')['current_stock'].shift(lag)

    # Rolling means, computed within each product.
    for window in [7, 14, 28]:
        synthetic_data[f'demand_rolling_mean_{window}'] = synthetic_data.groupby('product_id')['demand'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean())
        synthetic_data[f'stock_rolling_mean_{window}'] = synthetic_data.groupby('product_id')['current_stock'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean())

    # Zero denominators are replaced by 1 to avoid division by zero.
    synthetic_data['stock_demand_ratio'] = synthetic_data['current_stock'] / (
        synthetic_data['demand'].replace(0, 1))
    synthetic_data['days_until_stockout'] = synthetic_data['current_stock'] / (
        synthetic_data['demand_rolling_mean_7'].replace(0, 1))

    # Drop the leading rows of each product that lack a full set of lags.
    synthetic_data.dropna(inplace=True)

    return synthetic_data

# Build the dataset: daily records from 2019-01-01 to 2024-12-31, ten products
synthetic_demand_data = create_synthetic_demand_data(
    start_date="2019-01-01",
    end_date="2024-12-31",
    num_products=10,
)

# Write the dataset to disk without the DataFrame index column
synthetic_demand_data.to_csv("synthetic_demand_data_3.csv", index=False)

# Report the dimensions and column names
print("Dataset shape:", synthetic_demand_data.shape)
print("\nColumns:", list(synthetic_demand_data.columns))

# Summary statistics restricted to the demand and inventory columns
key_columns = [
    'demand',
    'current_stock',
    'restock_amount',
    'stockout_days',
    'stock_demand_ratio',
    'days_until_stockout',
]
print("\nBasic statistics for key columns:")
print(synthetic_demand_data[key_columns].describe())

# Preview the leading rows
print("\nFirst few rows of the dataset:")
print(synthetic_demand_data.head())

Dataset shape: (21640, 29)

Columns: ['date', 'product_id', 'demand', 'current_stock', 'restock_amount', 'stockout_days', 'economic_index', 'reorder_point', 'lead_time', 'max_stock', 'day_of_week', 'month', 'year', 'demand_lag_1', 'stock_lag_1', 'demand_lag_7', 'stock_lag_7', 'demand_lag_14', 'stock_lag_14', 'demand_lag_28', 'stock_lag_28', 'demand_rolling_mean_7', 'stock_rolling_mean_7', 'demand_rolling_mean_14', 'stock_rolling_mean_14', 'demand_rolling_mean_28', 'stock_rolling_mean_28', 'stock_demand_ratio', 'days_until_stockout']

Basic statistics for key columns:
             demand  current_stock  restock_amount  stockout_days  \
count  21640.000000   21640.000000    21640.000000   21640.000000   
mean     112.522782     418.408780      112.378928       0.089741   
std       47.248358     353.260998      227.742175       0.285817   
min        0.000000       0.000000        0.000000       0.000000   
25%       85.000000     172.000000        0.000000       0.000000   
50%      116

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

class SyntheticDemandGenerator:
    """Generate a synthetic daily demand and inventory dataset.

    The generator builds, for a date range and a number of products, demand
    driven by seasonality, trend, holidays, peak seasons, promotions and
    noise; an inventory simulation with a reorder-point policy; shared
    economic indicators; and calendar, lag and rolling features.

    All randomness comes from NumPy's global random generator; this class
    does not seed it.
    """

    def __init__(self, start_date, end_date, num_products):
        """Store the configuration and pre-generate the shared calendars.

        Args:
            start_date: First day of the series (anything ``pd.to_datetime``
                accepts).
            end_date: Last day of the series, inclusive.
            num_products: Number of products to simulate.
        """
        self.start_date = pd.to_datetime(start_date)
        self.end_date = pd.to_datetime(end_date)
        self.num_products = num_products
        self.date_range = pd.date_range(start=start_date, end=end_date)

        # Calendars shared by every product.
        self.HOLIDAYS = self._generate_holidays()
        self.PEAK_SEASONS = self._generate_peak_seasons()
        self.PROMO_PERIODS = self._generate_promotional_periods()

        # Economic indicators cover the whole date range and are merged into
        # every product's rows by date.
        self.economic_data = self._generate_economic_indicators()

    def _generate_holidays(self):
        """Return a DatetimeIndex of fixed-date holidays for every year spanned."""
        month_days = [
            '01-01',  # New Year
            '12-25',  # Christmas
            '07-04',  # Independence Day
            '11-25',  # Thanksgiving (simplified to a fixed date)
            '02-14',  # Valentine's Day
            '10-31',  # Halloween
        ]
        years = range(self.start_date.year, self.end_date.year + 1)
        return pd.to_datetime([
            f"{year}-{month_day}" for year in years for month_day in month_days
        ])

    def _generate_peak_seasons(self):
        """Return ``(start, end)`` Timestamp pairs for summer and winter peaks."""
        peak_seasons = []
        for year in range(self.start_date.year, self.end_date.year + 1):
            # Summer season
            peak_seasons.append((f"{year}-06-01", f"{year}-08-31"))
            # Winter holiday season
            peak_seasons.append((f"{year}-11-15", f"{year}-12-31"))
        return [(pd.to_datetime(start), pd.to_datetime(end)) for start, end in peak_seasons]

    def _generate_promotional_periods(self):
        """Return six random ``(start, end)`` promotion windows per year."""
        promo_periods = []
        for year in range(self.start_date.year, self.end_date.year + 1):
            for _ in range(6):
                start = pd.Timestamp(f"{year}-01-01") + pd.Timedelta(days=np.random.randint(0, 330))
                duration = np.random.randint(5, 15)  # 5-14 days
                end = start + pd.Timedelta(days=duration)
                promo_periods.append((start, end))
        return promo_periods

    def _generate_economic_indicators(self):
        """Return a per-date frame of noisy, linearly drifting indicators.

        Each indicator is a linear ramp over the date range plus Gaussian
        noise, clipped to a fixed interval.
        """
        num_days = len(self.date_range)
        unemployment = np.linspace(5, 8, num_days) + np.random.normal(0, 0.5, num_days)
        consumer_confidence = np.linspace(90, 110, num_days) + np.random.normal(0, 5, num_days)
        inflation_rate = np.linspace(2, 4, num_days) + np.random.normal(0, 0.3, num_days)

        return pd.DataFrame({
            'date': self.date_range,
            'unemployment_rate': np.clip(unemployment, 3, 12),
            'consumer_confidence': np.clip(consumer_confidence, 60, 140),
            'inflation_rate': np.clip(inflation_rate, 0, 8)
        })

    def _is_date_in_periods(self, date, periods):
        """Return True when ``date`` lies inside any inclusive ``(start, end)`` pair."""
        return any(start <= date <= end for start, end in periods)

    def _handle_outliers(self, series, lower_percentile=1, upper_percentile=99):
        """Clip ``series`` to its own lower/upper percentile values."""
        lower = np.percentile(series, lower_percentile)
        upper = np.percentile(series, upper_percentile)
        return np.clip(series, lower, upper)

    def generate_data(self):
        """Build and return the full dataset for all products.

        Returns:
            A DataFrame with one row per product and day containing the
            demand and inventory columns, the economic indicators, and the
            calendar, lag and rolling features.  Lag columns and the rolling
            standard deviation contain NaN on the first days of each product
            because derived features are added after missing-value handling.
        """
        all_data = [
            self._generate_product_data(product_id)
            for product_id in range(1, self.num_products + 1)
        ]

        synthetic_data = pd.concat(all_data, ignore_index=True)
        synthetic_data = synthetic_data.merge(self.economic_data, on='date')
        synthetic_data = self._handle_missing_values(synthetic_data)
        synthetic_data = self._add_derived_features(synthetic_data)

        return synthetic_data

    def _simulate_inventory(self, demand, opening_stock, reorder_point, max_stock, lead_time):
        """Simulate day-by-day stock with an order-up-to-``max_stock`` policy.

        Day 0 only records ``opening_stock``.  On each later day, any order
        due that day is received, demand is served from what is available,
        and a new order is placed when stock is at or below
        ``reorder_point`` and no order is still in transit.  An order placed
        on day ``i`` arrives on day ``i + lead_time``; orders that would
        arrive after the last day are not placed.

        ``demand`` is modified in place: on a stock-out day it is reduced to
        the quantity that could actually be served.

        Returns:
            ``(current_stock, restock_amount, stockout_days)`` arrays, where
            ``restock_amount[i]`` is the quantity ordered on day ``i``.
        """
        num_days = len(demand)
        current_stock = np.zeros(num_days)
        restock_amount = np.zeros(num_days)
        stockout_days = np.zeros(num_days)
        arrivals = np.zeros(num_days)  # arrivals[d] = quantity delivered on day d
        if num_days == 0:
            return current_stock, restock_amount, stockout_days

        current_stock[0] = opening_stock
        for i in range(1, num_days):
            available = current_stock[i - 1] + arrivals[i]

            if available >= demand[i]:
                current_stock[i] = available - demand[i]
            else:
                stockout_days[i] = 1
                demand[i] = available  # only what was on hand could be served
                current_stock[i] = 0

            arrival_day = i + lead_time
            # Orders still in transit are exactly those scheduled for a
            # future day within the lead-time horizon.
            order_in_transit = arrivals[i + 1:arrival_day + 1].any()
            if (current_stock[i] <= reorder_point
                    and not order_in_transit
                    and arrival_day < num_days):
                order_amount = max_stock - current_stock[i]
                arrivals[arrival_day] = order_amount
                restock_amount[i] = order_amount

        return current_stock, restock_amount, stockout_days

    def _generate_product_data(self, product_id):
        """Return the demand and inventory frame for a single product.

        Args:
            product_id: 1-based product number; it scales the base demand
                and becomes the ``P###`` label.
        """
        num_days = len(self.date_range)
        base_demand = 50 + 10 * product_id

        # Product-specific inventory policy.
        reorder_point = base_demand * 3
        max_stock = base_demand * 7
        lead_time = np.random.randint(1, 4)

        # Calendar flags, computed once and reused for both the demand
        # multipliers and the output columns.
        is_holiday = self.date_range.isin(self.HOLIDAYS)
        is_peak_season = np.array(
            [self._is_date_in_periods(d, self.PEAK_SEASONS) for d in self.date_range],
            dtype=bool,
        )

        seasonal_effect = 20 * np.sin(2 * np.pi * self.date_range.dayofyear.to_numpy() / 365)
        trend = 0.05 * (self.date_range.year.to_numpy() - self.start_date.year) * base_demand

        demand = np.zeros(num_days)
        promotion_effect = np.zeros(num_days)
        for i, date in enumerate(self.date_range):
            daily_demand = base_demand + seasonal_effect[i] + trend[i]

            if is_holiday[i]:
                daily_demand *= 1.5
            if is_peak_season[i]:
                daily_demand *= 1.3
            if self._is_date_in_periods(date, self.PROMO_PERIODS):
                daily_demand *= np.random.uniform(1.2, 1.8)
                promotion_effect[i] = 1

            daily_demand += np.random.normal(0, base_demand * 0.1)
            demand[i] = max(0, daily_demand)

        # Clip extremes, then round BEFORE simulating stock so the reported
        # demand, restock and stock columns add up exactly row to row.
        demand = np.round(self._handle_outliers(demand))

        opening_stock = np.random.randint(reorder_point, max_stock)
        current_stock, restock_amount, stockout_days = self._simulate_inventory(
            demand, opening_stock, reorder_point, max_stock, lead_time
        )

        return pd.DataFrame({
            'date': self.date_range,
            'product_id': f'P{product_id:03}',
            'demand': np.round(demand),
            'current_stock': np.round(current_stock),
            'restock_amount': np.round(restock_amount),
            'stockout_days': stockout_days,
            'is_promotion': promotion_effect,
            'is_holiday': is_holiday.astype(int),
            'is_peak_season': is_peak_season.astype(int),
            'reorder_point': reorder_point,
            'lead_time': lead_time,
            'max_stock': max_stock
        })

    def _handle_missing_values(self, df):
        """Fill missing values: forward-fill stock, rolling mean for demand, else 0."""
        # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
        stock_columns = ['current_stock', 'restock_amount']
        df[stock_columns] = df[stock_columns].ffill()

        # Use a 7-day rolling mean within each product for missing demand.
        demand_columns = ['demand']
        for col in demand_columns:
            df[col] = df.groupby('product_id')[col].transform(
                lambda x: x.fillna(x.rolling(window=7, min_periods=1).mean()))

        # Anything still missing becomes 0.
        return df.fillna(0)

    def _add_derived_features(self, df):
        """Add calendar, lag (1-3 days) and 7-day rolling demand features."""
        # Calendar features
        df['day_of_week'] = df['date'].dt.dayofweek
        df['month'] = df['date'].dt.month
        df['year'] = df['date'].dt.year
        df['quarter'] = df['date'].dt.quarter

        # Lagged demand within each product
        for lag in range(1, 4):
            df[f'demand_lag_{lag}'] = df.groupby('product_id')['demand'].shift(lag)

        # Rolling statistics within each product
        df['demand_rolling_mean'] = df.groupby('product_id')['demand'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())
        df['demand_rolling_std'] = df.groupby('product_id')['demand'].transform(lambda x: x.rolling(window=7, min_periods=1).std())

        return df

# Example usage: one year (2023) of daily data for ten products
generator = SyntheticDemandGenerator('2023-01-01', '2023-12-31', 10)
synthetic_data = generator.generate_data()

# Preview the leading rows, then write the dataset to disk without the index
print(synthetic_data.head())
synthetic_data.to_csv("synthetic_demand_data_4.csv", index=False)



        date product_id  demand  current_stock  restock_amount  stockout_days  \
0 2023-01-01       P001    78.0          340.0             0.0            0.0   
1 2023-01-02       P001    60.0          280.0             0.0            0.0   
2 2023-01-03       P001    56.0          223.0             0.0            0.0   
3 2023-01-04       P001    60.0          163.0           257.0            0.0   
4 2023-01-05       P001    58.0          105.0           315.0            0.0   

   is_promotion  is_holiday  is_peak_season  reorder_point  ...  \
0           0.0           1               0            180  ...   
1           0.0           0               0            180  ...   
2           0.0           0               0            180  ...   
3           0.0           0               0            180  ...   
4           0.0           0               0            180  ...   

   inflation_rate  day_of_week  month  year  quarter  demand_lag_1  \
0        2.034356            6      1  2

  df[stock_columns] = df[stock_columns].fillna(method='ffill')
