In [30]:
import pandas as pd
import numpy as np
from faker import Faker
import random

def generate_data(num_entries):
    fake = Faker()
    Faker.seed(0)
    np.random.seed(0)  # For reproducibility

    # Possible categories and their products
    categories = {
        'Fruits': ['Apple', 'Banana', 'Orange', 'Grapes', 'Cherry'],
        'Vegetables': ['Potato', 'Tomato', 'Carrot', 'Cucumber', 'Pepper'],
        'Beverages': ['Soda', 'Juice', 'Water', 'Beer', 'Coffee'],
        'Dairy': ['Milk', 'Cheese', 'Butter', 'Yogurt', 'Cream']
    }

    data = []
    for _ in range(num_entries):
        category = random.choice(list(categories.keys()))
        item_name = random.choice(categories[category])
        item_id = fake.unique.random_int(min=1, max=15000)
        date = fake.date_between(start_date='-1y', end_date='today').strftime('%Y-%m-%d')
        last_month_sales = np.random.randint(100, 500)
        capacity = np.random.randint(200, 1000)
        last_year_sales = np.random.randint(1000, 5000)
        temperature = np.random.randint(-5, 30)
        weather = random.choice(['Sunny', 'Rainy', 'Cloudy', 'Snowy', 'Windy'])
        city =np.random.randint(1, 4)
        promotion = random.choice([1,0])
        predicted_sales = int(last_month_sales * (1 + np.random.normal(0, 0.1)))

        data.append({
            'Item_ID': item_id,
            'Item_Name': item_name,
            'Category': category,
            'Date': date,
            'Last_Month_Sales': last_month_sales,
            'Capacity': capacity,
            'Last_Year_Sales': last_year_sales,
            #'Temperature': temperature,
            'Weather': weather,
            'Predicted_Sales': predicted_sales,
            'City_Tier' : city,
            'Outlet' : np.random.randint(1, 4),
            'On_promotion' : promotion
        })

    # Create DataFrame
    df = pd.DataFrame(data)
    df.to_csv('supermarket_data.csv', index=False)
    return df

# Generate the dataset
generate_data(14500)

Unnamed: 0,Item_ID,Item_Name,Category,Date,Last_Month_Sales,Capacity,Last_Year_Sales,Weather,Predicted_Sales,City_Tier,Outlet,On_promotion
0,13836,Cream,Dairy,2023-09-26,272,759,2653,Rainy,233,2,3,1
1,12419,Carrot,Vegetables,2024-04-09,188,596,3362,Cloudy,216,3,3,0
2,6891,Beer,Beverages,2023-05-15,188,877,1537,Cloudy,197,1,1,1
3,4243,Juice,Beverages,2023-11-14,438,299,3008,Sunny,463,3,1,0
4,7962,Grapes,Fruits,2023-10-04,365,897,1639,Snowy,307,3,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...
14495,7432,Cheese,Dairy,2023-08-30,149,285,3446,Rainy,131,1,3,1
14496,2765,Butter,Dairy,2023-06-13,458,275,1555,Snowy,432,3,3,0
14497,13319,Water,Beverages,2023-12-17,268,799,2353,Sunny,259,3,2,1
14498,5078,Pepper,Vegetables,2023-09-24,118,463,2955,Snowy,94,3,1,1


In [14]:
ecommerce_store_products = {
    "Electronics": {
        "Smartphones": ["iPhone 13", "Samsung Galaxy S21", "Google Pixel 6"],
        "Laptops": ["MacBook Pro", "Dell XPS 13", "HP Spectre x360"],
        "Headphones": ["Sony WH-1000XM4", "Bose QuietComfort 45", "AirPods Pro"],
        "Cameras": ["Canon EOS R5", "Nikon D850", "Sony Alpha A7R IV"]
    },
    "Fashion": {
        "Clothing": ["Levi's Jeans", "Zara Top", "H&M Dress"],
        "Shoes": ["Nike Air Max", "Adidas Ultraboost", "Converse Chuck Taylor"],
        "Accessories": ["Ray-Ban Sunglasses", "Rolex Watch", "Gucci Bag"]
    },
    "Home and Garden": {
        "Furniture": ["IKEA Sofa", "West Elm Coffee Table", "Wayfair Bookshelf"],
        "Decor": ["Anthropologie Vase", "Pottery Barn Curtains", "Zara Home Candlesticks"],
        "Gardening Tools": ["Fiskars Lopper", "Spear & Jackson Spade", "Husqvarna Chainsaw"],
        "Kitchenware": ["Le Creuset Cookware Set", "Vitamix Blender", "KitchenAid Mixer"]
    },
    "Beauty and Health": {
        "Cosmetics": ["MAC Lipstick", "NARS Foundation", "Urban Decay Eyeshadow Palette"],
        "Skincare Products": ["The Ordinary Serum", "CeraVe Moisturizing Cream", "La Mer Moisturizer"],
        "Supplements": ["Vitamin D3", "Omega-3 Fish Oil", "Whey Protein"],
        "Personal Care": ["Philips Electric Shaver", "Oral-B Electric Toothbrush", "Dyson Hair Dryer"]
    },
    "Sports and Outdoors": {
        "Sporting Goods": ["Wilson Tennis Racket", "Spalding Basketball", "TaylorMade Golf Clubs"],
        "Outdoor Gear": ["The North Face Tent", "Osprey Backpack", "Yeti Cooler"],
        "Fitness Equipment": ["Bowflex Home Gym", "Peloton Bike", "Manduka Yoga Mat"]
    },
    "Toys and Games": {
        "Video Games": ["PlayStation 5", "Xbox Series X", "Nintendo Switch"],
        "Board Games": ["Catan", "Risk", "Monopoly"],
        "Toys": ["LEGO Star Wars Set", "Barbie Dreamhouse", "Hot Wheels Track Set"]
    }
}


In [21]:
import pandas as pd
import numpy as np
from faker import Faker
import random

def generate_convenience_store_data_for_regression(num_entries):
    
    fake = Faker()
    Faker.seed(0)
    np.random.seed(42)
    

    items_by_category = {
        'Snacks': ['Chips', 'Candy Bar', 'Pretzels', 'Nuts', 'Popcorn'],
        'Beverages': ['Soda', 'Water', 'Juice', 'Energy Drink', 'Coffee'],
        'Personal Care': ['Shampoo', 'Toothpaste', 'Soap', 'Deodorant', 'Lotion'],
        'Household Items': ['Paper Towels', 'Laundry Detergent', 'Garbage Bags', 'Dish Soap', 'Toilet Paper'],
        'Fresh Food': ['Sandwich', 'Salad', 'Fruit Cup', 'Yogurt', 'Sushi'],
        'Dairy' : ['Milk', 'Cheese', 'Butter', 'Yogurt', 'Cream','Ice Cream', 'Buttermilk', 'Sour Cream']
    }
    
    data = []
    for _ in range(num_entries):
        #category = random.choice(['Snacks', 'Beverages', 'Personal Care', 'Household Items', 'Fresh Food', 'Dairy'])
        #item_name = random.choice(items_by_category[category])
        #item_id = fake.unique.random_int(min=1, max=20000)
        #date = fake.date_between(start_date='-1y', end_date='today').strftime('%Y-%m-%d')
        #last_month_sales = np.random.randint(100, 500)
        #capacity = np.random.randint(200, 1000)
        #last_year_sales = np.random.randint(1000, 5000)
        #weather = random.choice(['Sunny', 'Rainy', 'Cloudy', 'Snowy', 'Windy'])
        city =np.random.randint(1, 4)
        promotion = random.choice([1,0])
        #predicted_sales = int(last_month_sales * (1 + np.random.normal(0, 0.1)))

        data.append({
            #'Item_ID': item_id,
            #'Item_Name': item_name,
            #'Category': category,
            #'Date': date,
            #'Last_Month_Sales': last_month_sales,
            #'Capacity': capacity,
            #'Last_Year_Sales': last_year_sales,
            #'Weather': weather,
            #'Predicted_Sales': predicted_sales,
            'City_Tier' : city,
            'On_promotion' : promotion,
            'Outlet' : np.random.randint(1, 4)
        })

    return pd.DataFrame(data)

# Generate synthetic data
convenience_store_regression_data = generate_convenience_store_data_for_regression(13000)
convenience_store_regression_data

Unnamed: 0,City_Tier,On_promotion,Outlet
0,3,1,1
1,3,1,3
2,1,1,1
3,3,0,2
4,3,1,3
...,...,...,...
12995,3,0,2
12996,3,0,1
12997,2,0,1
12998,3,1,3


In [27]:
convenience_store_regression_data.to_csv('outlet3.csv')

In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('Grocery_dairy_prod.csv')

In [4]:
data

Unnamed: 0,Item_ID,Item_Name,Category,Date,Last_Month_Sales,Capacity,Last_Year_Sales,Temperature,Weather,Predicted_Sales,City_Tier,On_promotion,Outlet
0,13,Ghee,Dairy,12-21-2019,82,109,1077,33,Autumn,81,3,0,1
1,15,Cheese spread,Dairy,9-29-2023,71,99,1068,32,Winter,68,3,1,3
2,11,Whipped Cream,Dairy,11-04-2022,71,99,1068,38,Sunny,70,1,1,1
3,27,Milk,Dairy,02-07-2021,82,109,1077,48,Autumn,78,3,1,2
4,40,Cream,Dairy,09-11-2021,82,109,1077,36,Autumn,78,3,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2994,32,Butter spread,Dairy,11-02-2022,82,99,1068,46,Autumn,69,1,1,2
2995,43,Buttermilk,Dairy,12-06-2019,90,99,1068,40,Sunny,70,1,1,1
2996,40,Cream,Dairy,7-26-2019,71,99,1068,33,Rainy,70,3,1,3
2997,45,Whey,Dairy,01-01-2023,23,99,1068,50,Autumn,68,1,0,1


In [5]:
# Create a function to map unique values from one column to a new column with unique numbers
def map_unique_values(df, source_col, new_col):
    # Extract unique values and create a mapping to unique numbers
    unique_values = df[source_col].unique()
    unique_mapping = {value: idx for idx, value in enumerate(unique_values, 1)}  # Start indexing from 1

    # Apply the mapping to create a new column
    df[new_col] = df[source_col].map(unique_mapping)
    return df

# Apply the function to the new_item_data dataframe
mapped_data = map_unique_values(data.copy(), 'Item_Name', 'Item_ID')
mapped_data.head(), mapped_data[['Item_Name', 'Item_ID']].drop_duplicates().head()


(   Item_ID      Item_Name Category        Date  Last_Month_Sales  Capacity  \
 0        1           Ghee    Dairy  12-21-2019                82       109   
 1        2  Cheese spread    Dairy   9-29-2023                71        99   
 2        3  Whipped Cream    Dairy  11-04-2022                71        99   
 3        4           Milk    Dairy  02-07-2021                82       109   
 4        5          Cream    Dairy  09-11-2021                82       109   
 
    Last_Year_Sales  Temperature Weather  Predicted_Sales  City_Tier  \
 0             1077           33  Autumn               81          3   
 1             1068           32  Winter               68          3   
 2             1068           38   Sunny               70          1   
 3             1077           48  Autumn               78          3   
 4             1077           36  Autumn               78          3   
 
    On_promotion  Outlet  
 0             0       1  
 1             1       3  
 2       

In [6]:
mapped_data

Unnamed: 0,Item_ID,Item_Name,Category,Date,Last_Month_Sales,Capacity,Last_Year_Sales,Temperature,Weather,Predicted_Sales,City_Tier,On_promotion,Outlet
0,1,Ghee,Dairy,12-21-2019,82,109,1077,33,Autumn,81,3,0,1
1,2,Cheese spread,Dairy,9-29-2023,71,99,1068,32,Winter,68,3,1,3
2,3,Whipped Cream,Dairy,11-04-2022,71,99,1068,38,Sunny,70,1,1,1
3,4,Milk,Dairy,02-07-2021,82,109,1077,48,Autumn,78,3,1,2
4,5,Cream,Dairy,09-11-2021,82,109,1077,36,Autumn,78,3,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2994,27,Butter spread,Dairy,11-02-2022,82,99,1068,46,Autumn,69,1,1,2
2995,19,Buttermilk,Dairy,12-06-2019,90,99,1068,40,Sunny,70,1,1,1
2996,5,Cream,Dairy,7-26-2019,71,99,1068,33,Rainy,70,3,1,3
2997,18,Whey,Dairy,01-01-2023,23,99,1068,50,Autumn,68,1,0,1


In [42]:
mapped_data.to_csv('grocery_data.csv')

In [5]:
df

Unnamed: 0,Item_ID,Item_Name,Category,Date,Last_Month_Sales,Capacity,Last_Year_Sales,Weather,Predicted_Sales,City_Tier,On_promotion
0,18,Nuts,Snacks,25-04-2023,338,707,5345,Cloudy,370,3,1
1,13,Ice Cream,Dairy,25-04-2023,242,398,5988,Snowy,282,2,0
2,8,Deodorant,Personal Care,25-04-2023,383,992,3119,Rainy,406,2,1
3,23,Sandwich,Fresh Food,25-04-2023,137,652,2470,Windy,128,2,0
4,15,Laundry Detergent,Household Items,25-04-2023,299,298,4586,Snowy,259,2,1
...,...,...,...,...,...,...,...,...,...,...,...
19495,10,Energy Drink,Beverages,24-04-2024,153,892,3591,Snowy,125,2,0
19496,4,Cheese,Dairy,24-04-2024,378,941,7587,Snowy,367,2,1
19497,8,Deodorant,Personal Care,24-04-2024,220,759,5417,Rainy,218,1,0
19498,18,Nuts,Snacks,24-04-2024,173,424,4934,Cloudy,170,1,1


In [8]:
df['Item_Name'].value_counts()

Item_Name
Butter           526
Cream            508
Yogurt           507
Milk             499
Buttermilk       497
Cheese           463
Headphones       217
Smartphone       213
Mixer            211
T-shirt          211
Dress            209
Cutting Board    209
Smartwatch       205
Laptop           203
Cap              198
Dish Rack        198
Socks            192
Camera           189
Knife Set        186
Jeans            180
Hand Blender     179
Name: count, dtype: int64

In [35]:
data = pd.read_csv('Grocery_dairy_prod.csv')

In [36]:
from datetime import datetime
import numpy as np
import random

# Function to generate random dates within the provided range
def generate_random_dates(start_date, end_date, n):
    start_timestamp = pd.to_datetime(start_date).timestamp()
    end_timestamp = pd.to_datetime(end_date).timestamp()
    random_timestamps = np.random.uniform(start_timestamp, end_timestamp, n)
    random_dates = pd.to_datetime(random_timestamps, unit='s').date
    return random_dates


new_items = {
    'Dairy': ['Milk', 'Cheese', 'Butter', 'Yogurt', 'Cream','Ice Cream', 'Buttermilk', 'Sour Cream'],
    'Fruits': ['Dragon Fruit', 'Kiwi', 'Pomegranate'],
    'Pulses': ['Lentils', 'Chickpeas', 'Black Beans'],
    'Misc': ['Cereal', 'Pasta', 'Olive Oil']
}
# Flatten the list and ensure each category is equally represented
item_names = [item for category in new_items.values() for item in category]
category_labels = [category for category, items in new_items.items() for item in items]


# Determine the range of dates in the dataset
date_range_start = pd.to_datetime(data['Date']).min()
date_range_end = pd.to_datetime(data['Date']).max()

promotion = random.choice([1,0])

# Generate new rows
np.random.seed(42)  # For reproducibility
new_rows = pd.DataFrame({
    'Item_ID': np.random.randint(1, 51, size=10000),  # Assume Item_IDs range from 1 to 50
    'Item_Name': np.random.choice(item_names, size=10000),
    'Category': np.random.choice(category_labels, size=10000),
    'Category': ['Dairy'] * 10000,
    'Date': generate_random_dates(date_range_start, date_range_end, 10000),
    'Last_Month_Sales': np.random.normal(data['Last_Month_Sales'].mean(), data['Last_Month_Sales'].std(), 10000).astype(int),
    'Capacity': np.random.choice(data['Capacity'], size=10000),
    'Last_Year_Sales': np.random.normal(data['Last_Year_Sales'].mean(), data['Last_Year_Sales'].std(), 10000).astype(int),
    #'Temperature': np.random.normal(data['Temperature'].mean(), data['Temperature'].std(), 10000).astype(int),
    'Weather': np.random.choice(data['Weather'], size=10000),
    'Predicted_Sales': np.random.normal(data['Predicted_Sales'].mean(), data['Predicted_Sales'].std(), 10000).astype(int),
    'City_Tier' :  np.random.randint(1, 4, size=10000),
    'On_promotion' : promotion,
    'Outlet':  np.random.randint(1, 4, size=10000)
})

# Combine the original data with the new rows
extended_data = pd.concat([data, new_rows], ignore_index=True)
#extended_data.describe(include='all')


In [37]:
new_rows

Unnamed: 0,Item_ID,Item_Name,Category,Date,Last_Month_Sales,Capacity,Last_Year_Sales,Weather,Predicted_Sales,City_Tier,On_promotion,Outlet
0,39,Lentils,Dairy,2022-08-10,62,109,1066,Winter,57,3,0,2
1,29,Ice Cream,Dairy,2020-05-30,66,109,1063,Rainy,81,1,0,3
2,15,Cream,Dairy,2019-02-11,86,99,1070,Rainy,57,2,0,1
3,43,Dragon Fruit,Dairy,2019-10-23,87,99,1072,Rainy,79,1,0,3
4,8,Pasta,Dairy,2020-09-29,88,99,1067,Sunny,68,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,26,Cereal,Dairy,2021-09-06,84,109,1081,Autumn,75,3,0,2
9996,22,Pasta,Dairy,2022-12-16,81,99,1069,Rainy,55,3,0,1
9997,31,Yogurt,Dairy,2019-08-23,80,99,1076,Winter,79,2,0,3
9998,43,Lentils,Dairy,2023-02-03,84,99,1058,Rainy,70,1,0,1


In [38]:
new_rows.to_csv('grocery_data_test.csv')