In [5]:
# Import necessary libraries
import pandas as pd                  # For working with tables (DataFrames)
import numpy as np                  # For numerical operations, arrays, random numbers
import holidays                     # For handling public holidays
import matplotlib.pyplot as plt     # For plotting charts
import seaborn as sns               # For enhanced chart visuals

# Set seaborn's global plotting theme to "whitegrid"; this is a process-wide
# setting that applies to figures created after this call
sns.set(style="whitegrid")

# --- Parameters ---
# Inclusive bounds of the simulated period: calendar years 2023 and 2024
start_date = '2023-01-01'
end_date = '2024-12-31'

# Daily calendar covering the period, both endpoints included
dates = pd.date_range(start_date, end_date, freq='D')

# Length of that calendar (2024 is a leap year, so 365 + 366 = 731 days)
n_days = len(dates)

# --- Product Categories and Items ---
# Base catalogue: category name -> list of example products.
# Category keys are written in Ukrainian, product names in Polish.
# Every category lists 5 products except eggs, which lists 3 (33 in total).
categories = {
    # Dairy products
    'Молочні продукти': [
        'Piątnica Mleko 3.2%',
        'Mlekovita Ser żółty plasterki',
        'President Masło extra',
        'Alpro Napój sojowy',
        'Zott Jogurt naturalny',
    ],
    # Canned goods
    'Консерви': [
        'Łowicz Groszek konserwowy',
        'Krakus Kukurydza konserwowa',
        'Winogronki w occie',
        'Tarsmak Fasola czerwona',
        'Cenos Tuńczyk kawałki w sosie własnym',
    ],
    # Toys
    'Іграшки': [
        'LEGO Classic 10717',
        'Hasbro Monopoly',
        'Playmobil City Life',
        'Smoby Traktor',
        'Clementoni Puzzle 500',
    ],
    # Stationery
    'Канцелярія': [
        'Stabilo Marker tekstowy',
        'Pilot Długopis G2',
        'Leitz Segregator A4',
        'Oxford Notes 100 kartek',
        'Pentel Ołówek automatyczny',
    ],
    # Alcohol
    'Алкоголь': [
        'Żubrówka Bison Grass Vodka',
        'Tyskie Piwo jasne',
        'Cydr Lubelski',
        'Krakus Wino czerwone',
        'Warka Radler cytryna',
    ],
    # Water
    'Вода': [
        'Cisowianka Woda mineralna',
        'Nałęczowianka Woda gazowana',
        'Żywiec Zdrój Naturalna',
        'Evian Naturalna',
        'Borjomi Woda mineralna',
    ],
    # Eggs
    'Яйця': [
        'Jaja świeże klasy M',
        'Jaja ekologiczne klasa L',
        'Jaja wiejskie klasa S',
    ],
}

# --- Generate list of items ---
# Number of catalogue entries the item table should contain
TARGET_ITEM_COUNT = 650

# Flatten the category mapping into (category, product name) pairs, keeping
# the dictionary's insertion order
base_products = [
    (cat, name)
    for cat, names in categories.items()
    for name in names
]
if not base_products:
    # With nothing to cycle through, the padding loop below could never reach
    # the target size, so fail loudly instead of spinning forever
    raise ValueError("categories must contain at least one product")

# The original products keep their names and receive ids 1..len(base_products)
items = [
    {'item_id': base_id, 'item_name': name, 'category': cat}
    for base_id, (cat, name) in enumerate(base_products, start=1)
]
item_id = len(items) + 1

# Pad the catalogue with synthetic variants: walk the base products in order,
# wrapping around as often as needed, and append "Edycja N" to each name.
# N is drawn at random from 1..99, so variant names are NOT guaranteed to be
# unique; item_id is the unique key.
while len(items) < TARGET_ITEM_COUNT:
    cat, name = base_products[len(items) % len(base_products)]
    new_name = f"{name} Edycja {np.random.randint(1, 100)}"
    items.append({'item_id': item_id, 'item_name': new_name, 'category': cat})
    item_id += 1

# Convert to DataFrame, capped at the target size in case the base catalogue
# alone already holds more products than that
items_df = pd.DataFrame(items[:TARGET_ITEM_COUNT])

# --- Load Polish holidays for 2023 and 2024 ---
# Calendar of Polish public holidays, requested for the two simulated years.
# It is queried with `date in pl_holidays` membership tests when the holiday
# effect is applied to the sales series.
# NOTE(review): the pre-holiday look-ahead for the last days of 2024 tests
# dates in January 2025 -- confirm the calendar answers those lookups (the
# `holidays` package is expected to expand to new years on demand by default).
pl_holidays = holidays.Poland(years=[2023, 2024])

# --- Function to generate daily sales for a specific item ---
def sales_pattern_for_item(item_idx):
    """Simulate the daily sales quantities of one item over the whole period.

    The series is the sum of a linear trend (base -> 1.3 * base), a yearly
    sine wave, a weekly sine wave and Gaussian noise. It is then scaled by a
    per-date holiday multiplier, clipped at zero and truncated to integers.

    Reads the module-level ``items_df``, ``dates``, ``n_days`` and
    ``pl_holidays``. Noise comes from NumPy's global random generator, so
    results differ between runs unless the caller seeds it.

    Args:
        item_idx: Row label of the item in ``items_df``. It is also used as
            the phase offset (in radians) of both sine components, so the
            position of the seasonal peaks varies per item, not per category.

    Returns:
        Integer sales quantities, one per entry of ``dates``.
    """
    # Typical daily volume per category; unknown categories fall back to 10
    category_base = {
        'Молочні продукти': 30,
        'Консерви': 20,
        'Іграшки': 10,
        'Канцелярія': 15,
        'Алкоголь': 12,
        'Вода': 25,
        'Яйця': 18
    }
    item_category = items_df.loc[item_idx, 'category']
    base = category_base.get(item_category, 10)

    def holiday_multiplier(day):
        """Return the sales multiplier that holidays impose on ``day``."""
        # The holiday itself: sales are halved
        if day in pl_holidays:
            return 0.5
        # Run-up to the nearest holiday within three days:
        # x1.9 the day before, x1.6 two days before, x1.3 three days before
        for days_ahead in (1, 2, 3):
            if (day + pd.Timedelta(days=days_ahead)) in pl_holidays:
                return 1 + 0.3 * (4 - days_ahead)
        # Ordinary day: leave sales unchanged
        return 1.0

    # Steady growth of 30% from the first to the last day
    growth = np.linspace(base, base*1.3, n_days)

    # Annual cycle with an amplitude of 20% of the base level
    yearly_phase = 2 * np.pi * dates.dayofyear / 365.25 + item_idx
    yearly_cycle = 0.2 * base * np.sin(yearly_phase)

    # Day-of-week cycle with an amplitude of 15% of the base level
    weekly_phase = 2 * np.pi * dates.dayofweek / 7 + item_idx
    weekly_cycle = 0.15 * base * np.sin(weekly_phase)

    # Gaussian noise with a standard deviation of 30% of the base level
    jitter = np.random.normal(0, base*0.3, n_days)

    demand = growth + yearly_cycle + weekly_cycle + jitter

    # NOTE(review): these multipliers depend only on `dates` and
    # `pl_holidays`, not on the item, yet are rebuilt on every call;
    # computing them once outside the per-item loop would be cheaper.
    multipliers = np.array([holiday_multiplier(day) for day in dates])
    demand *= multipliers

    # Quantities cannot be negative; the integer cast then drops the fraction
    return np.clip(demand, 0, None).astype(int)

# --- Generate complete sales dataset ---
# Build one long-format frame per item: the item's attributes are repeated on
# every row and paired with that item's simulated daily series
all_data = []
for idx, item in enumerate(items_df.itertuples(index=False)):
    # Simulated daily quantities for this item (one value per date)
    sales_qty = sales_pattern_for_item(idx)

    # Scalar attributes are broadcast across all dates of the period
    item_data = pd.DataFrame(
        {
            'item_id': item.item_id,
            'item_name': item.item_name,
            'category': item.category,
            'date': dates,
            'sales_qty': sales_qty,
        }
    )
    all_data.append(item_data)

# Stack the per-item frames into a single table with a fresh 0..N-1 index
df_sales = pd.concat(all_data, ignore_index=True)

# --- Basic analytics and checks ---
print("=== DataFrame Info ===")
# DataFrame.info() writes its report (columns, dtypes, memory usage) to stdout
# itself and returns None, so it is called directly; wrapping it in print()
# would emit a stray "None" line after the report
df_sales.info()

print("\n=== Descriptive Statistics for Sales ===")
# Summary statistics for the 'sales_qty' column
print(df_sales['sales_qty'].describe())

# Number of distinct items in the dataset
print("\n=== Unique Items ===", df_sales['item_id'].nunique())

# Number of distinct dates (expected to equal the number of days in the
# generated period)
print("=== Unique Dates ===", df_sales['date'].nunique())

# --- Export dataset to CSV file ---
# Written to the current working directory, without the row index
df_sales.to_csv('sales_data.csv', index=False)


=== Інформація про датафрейм ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 475150 entries, 0 to 475149
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   item_id    475150 non-null  int64         
 1   item_name  475150 non-null  object        
 2   category   475150 non-null  object        
 3   date       475150 non-null  datetime64[ns]
 4   sales_qty  475150 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 18.1+ MB
None

=== Описова статистика по продажах ===
count    475150.000000
mean         21.605139
std          12.049867
min           0.000000
25%          13.000000
50%          19.000000
75%          28.000000
max         134.000000
Name: sales_qty, dtype: float64

=== Унікальні товари === 650
=== Унікальні дати === 731
