In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
customers_df = pd.read_csv('../Cleaned_Datasets/customers_sg.csv')
products = pd.read_csv('../Cleaned_Datasets/products.csv')

# **Step 1 : Data Cleaning for products dataset**

We want to clean the products dataset to make use of the products sold for synthetic data generation of our sales dataset.
*  Removed products with total_sold == 0: Ensure that products in sales dataset are sold before
*  Removed products with phrases 'free gift', 'not for sale', 'do not purchase', 'free gift with purchase' in their titles. This is to ensure that products used in our sales dataset are actual products purchased by customers. Based on observations,  this approach could also eliminate products with extreme pricing, as sellers often set the prices of these free gifts either excessively high or excessively low.



In [3]:
# Define the list of phrases to filter out in the title column
filter_phrases = ['do not purchase', 'free gift with purchase', 'not for sale', 'free gift', 'not for sell', 'gwp', 'gift with purchase']
pattern = '|'.join(filter_phrases)

# Filter the DataFrame to exclude rows with titles containing specified phrases and total_sold == 0
# Filter the DataFrame to exclude rows with specified phrases in 'title', 'total_sold' equal to 0, and 'price_actual' outside the 1 to 10000 range
products = products[
    ~products['title'].str.contains(pattern, case=False, na=False) &
    (products['total_sold'] != 0) &
    (products['price_actual'] >= 1) &
    (products['price_actual'] <= 10000)
]

# **Step 2 : Data preparation for customer dataset**

We create a new column in the customer dataframe to determine if the customer is a new customer or a returning customer so that we can use in our synthetic data generation for the sales dataset.

In [4]:
# Remove duplicated customer_id entries, keeping the first occurrence
customers_df = customers_df.drop_duplicates(subset='customer_id', keep='first').copy()

# Generate a column to determine if it is new or returning customers in the customer_df
customers_df.loc[:, 'target_audience'] = customers_df['last_checkout_day'].apply(
    lambda x: 'new customer' if x == 'Never checkout' else 'returning customer'
)

customers_df.to_csv('customer_data.csv', index=False)

# **Step 3 : Preparing synthetic dataset for marketing channels**

We will generate a dataset with the marketing channels information that simulates the marketing channels of Shopee. We limit our focus to the common Shopee marketing channels which are - Email, Social Media, In-App, Website, SMS and KOL.

In [5]:
np.random.seed(42)
num_rows_marketing = 100000
marketing_channels = ['Email', 'Social Media', 'In-App', 'Website', 'SMS', 'KOL']

# We want to have more rows during sales period, as higher marketing spend is likely observed

# Define Shopee sales dates
mega_sales_dates = pd.to_datetime(['2019-01-01', '2019-02-02', '2019-03-03', '2019-04-04', '2019-05-05',
                                    '2019-06-06', '2019-07-07', '2019-08-08', '2019-09-09', '2019-10-10',
                                    '2019-11-11', '2019-12-12'])

# Christmas, new year, cyber monday, black friday sales dates etc
seasonal_sales_dates = pd.to_datetime(['2019-01-01', '2019-02-05', '2019-04-19', '2019-05-01', '2019-06-05',
                                       '2019-06-07', '2019-08-09', '2019-08-11', '2019-10-06', '2019-10-27', '2019-11-29', '2019-12-02',
                                       '2019-12-25', '2019-12-31'])

high_sales = pd.to_datetime(['2019-11-11', '2019-12-12', '2019-11-29'])
medium_sales = pd.to_datetime(['2019-06-06', '2019-09-09'])

# Build up marketing starting 7 days before sales, with increasing priority as the sale day approaches
days_before_sales = []
for sale_date in high_sales:
    days_before_sales.extend([sale_date - pd.DateOffset(days=i) for i in range(1, 8)])
# days_before_sales = pd.DatetimeIndex(days_before_sales)

mid_month_days = pd.to_datetime(['2019-' + str(month).zfill(2) + '-15' for month in range(1, 13)] +
                                ['2019-' + str(month).zfill(2) + '-25' for month in range(1, 13)])


dates = pd.date_range(start='2019-01-01', end='2019-12-31', freq='D').to_list()

# Base probabilities (low for regular dates)
base_prob = 1 / len(dates)
probabilities = np.full(len(dates), base_prob)

# Adjust probabilities for sale days and periods
for i, date in enumerate(dates):
    if date in high_sales:
        # Higher priority for sales dates but slightly lower than pre-sale days
        probabilities[i] = 0.04
    elif date in days_before_sales:
        # Pre-sale marketing build-up for high-priority sales (1-7 days before)
        days_diff = (high_sales - date).days.min()
        if days_diff == 1:
            probabilities[i] = 0.06
        elif days_diff in [2, 3]:
            probabilities[i] = 0.04
        elif days_diff in [4, 5]:
            probabilities[i] = 0.03
        elif days_diff in [6, 7]:
            probabilities[i] = 0.015
    elif date in mid_month_days:
        probabilities[i] = 0.015
    elif date in seasonal_sales_dates:
        probabilities[i] = 0.05
    else:
        probabilities[i] = base_prob

# Normalize probabilities
probabilities /= probabilities.sum()
marketing_dates = np.random.choice(dates, size=num_rows_marketing, p=probabilities)

# Given Shopee spent an estimated US$500 million on marketing spend in SEA
total_shopee_spend = 500000000
sg_share = 0.04
sg_shopee_spend = total_shopee_spend * sg_share

# Simulate Shopee's marketing channel proportions
channel_proportions = {
    'Email': 0.12,
    'Social Media': 0.22,
    'In-App': 0.24,
    'KOL': 0.20,
    'Website': 0.17,
    'SMS': 0.05,
}

# We calculate total expenditure per channel based on proportions
total_expenditures = {channel: sg_shopee_spend * proportion for channel, proportion in channel_proportions.items()}

# Simulate for pre-sale days - higher for social media and KOL as these channels are likely to have more marketing pre-sale
pre_sale_channel_weights = {
    'Email': 0.08,
    'Social Media': 0.35,
    'In-App': 0.15,
    'Website': 0.12,
    'SMS': 0.05,
    'KOL': 0.25
}

# Simulate for sale days - higher for in-app
sale_day_channel_weights = {
    'Email': 0.07,
    'Social Media': 0.20,
    'In-App': 0.35,
    'Website': 0.15,
    'SMS': 0.05,
    'KOL': 0.18
}

channels = []
for date in marketing_dates:
    if date in high_sales:
        channel_weights = list(sale_day_channel_weights.values())
        chosen_channel = np.random.choice(marketing_channels, p=channel_weights)
    elif date in days_before_sales:
        channel_weights = list(pre_sale_channel_weights.values())
        chosen_channel = np.random.choice(marketing_channels, p=channel_weights)
    else:
        channel_weights = list(channel_proportions.values())
        chosen_channel = np.random.choice(marketing_channels, p=channel_weights)
    channels.append(chosen_channel)

channels

marketing_df = pd.DataFrame({'channel_name': channels})

# We want to calculate the total expenditure per channel
channel_counts = marketing_df['channel_name'].value_counts().to_dict()

# We want to spread out the expenditure per channel and add in some random variation for realism
def spread_expenditure(channel_name):
    if channel_name in channel_counts:
        current_expenditure = total_expenditures[channel_name] / channel_counts[channel_name]
        random_variation = np.random.uniform(-0.2, 0.2)  # Variation between -20% and +20%
        return round(current_expenditure * (1 + random_variation), 2)
    return 0

marketing_df['channel_expenditure'] = marketing_df['channel_name'].apply(spread_expenditure)

visitor_multipliers = {
    'Email': np.random.uniform(2, 4),
    'SMS': np.random.uniform(1.5, 3),
    'KOL': np.random.uniform(2, 6),
    'In-App': np.random.uniform(2, 5.5),
    'Social Media': np.random.uniform(1.2, 7),
    'Website': np.random.uniform(2, 4.2)
}

visitors = [
    np.round(marketing_df['channel_expenditure'][i] * visitor_multipliers[channels[i]], 0)
    for i in range(num_rows_marketing)
]

conversion_rates = {
    'Email': np.random.uniform(0.025, 0.04),
    'SMS': np.random.uniform(0.03, 0.035),
    'KOL': np.random.uniform(0.05, 0.08),
    'In-App': np.random.uniform(0.03, 0.06),
    'Social Media': np.random.uniform(0.02, 0.05),
    'Website': np.random.uniform(0.02, 0.04)
}

conversion_rate_list = np.array([
    conversion_rates[channels[i]] for i in range(num_rows_marketing)
])

# Calculate sales generated based on visitors and conversion rates
sales_generated = np.round(visitors * conversion_rate_list, 0)
sales_generated = np.where(sales_generated == 0, 1, sales_generated)
conversion_rate_calc = (sales_generated / visitors) * 100

# Average Order Value - overall for Shopee is around $15-$25
aov_list = [
    np.random.uniform(8, 25) if channel == 'Email' else
    np.random.uniform(6, 10) if channel == 'SMS' else
    np.random.uniform(15, 30) if channel == 'KOL' else
    np.random.uniform(10, 22) if channel == 'In-App' else
    np.random.uniform(10, 16) if channel == 'Social Media' else
    np.random.uniform(8, 20)
    for channel in channels
]

# Calculate Total Revenue based on sales and AOV, but ensure that revenue is > expenditure for realism
total_revenue = []

for i in range(num_rows_marketing):
    expenditure = marketing_df['channel_expenditure'].iloc[i]
    revenue = sales_generated[i] * aov_list[i]
    if revenue <= expenditure:
        revenue = expenditure * np.random.uniform(1.1, 1.3)

    total_revenue.append(round(revenue, 2))

# We want to simulate the CTR for each marketing channel based on average E-commerce CTR scenarios
click_through_rate = [
    np.random.uniform(0.02, 0.03) if channel == 'Email' else  # 2% to 3%
    np.random.uniform(0.01, 0.02) if channel == 'SMS' else    # 1% to 2%
    np.random.uniform(0.02, 0.04) if channel == 'KOL' else    # 2% to 4%
    np.random.uniform(0.005, 0.015) if channel == 'In-App' else  # 0.5% to 1.5%
    np.random.uniform(0.01, 0.04) if channel == 'Social Media' else  # 1% to 4%
    np.random.uniform(0.005, 0.01)  # 0.5% to 1% for Website Ads
    for channel in channels
]

# We want to simulate the Bounce rates for each marketing channel based on average E-commerce bounce rates scenarios
bounce_rate = [
    np.random.uniform(30, 40) if channel == 'Email' else       # 30-40%
    np.random.uniform(40, 50) if channel == 'SMS' else         # 40-50%
    np.random.uniform(35, 45) if channel == 'KOL' else         # 35-45%
    np.random.uniform(50, 60) if channel == 'In-App' else      # 50-60%
    np.random.uniform(50, 60) if channel == 'Social Media' else # 50-60%
    np.random.uniform(55, 65)  # 55-65% for Website ads
    for channel in channels
]

In [6]:
# Now we join the above columns to create our synthetic dataframe
marketing_df = pd.DataFrame({
    'date': marketing_dates,
    'channel_name': channels,
    'channel_expenditure': marketing_df['channel_expenditure'],
    'channel_visitors': visitors,
    'channel_sales': sales_generated,
    'conversion_rate': conversion_rate_calc,
    'average_order_value': aov_list,
    'total_revenue': total_revenue,
    'click_through_rate': click_through_rate,
    'bounce_rate': bounce_rate,
    #'customer_retention_rate': customer_retention_rate
})

print(marketing_df.head())
marketing_df['day'] = marketing_df['date'].dt.day_name()
marketing_df.to_csv('synthetic_marketing_data.csv', index=False)

        date  channel_name  channel_expenditure  channel_visitors  \
0 2019-06-07       Website               165.48             444.0   
1 2019-12-15        In-App               201.15             736.0   
2 2019-11-06  Social Media               157.74             999.0   
3 2019-09-18        In-App               200.18             732.0   
4 2019-03-13        In-App               180.24             659.0   

   channel_sales  conversion_rate  average_order_value  total_revenue  \
0           14.0         3.153153            16.701049         233.81   
1           30.0         4.076087            19.295899         578.88   
2           40.0         4.004004            11.439214         457.57   
3           29.0         3.961749            16.086238         466.50   
4           26.0         3.945372            16.697335         434.13   

   click_through_rate  bounce_rate  
0            0.009033    62.637806  
1            0.008315    55.581256  
2            0.017502    55.664460 

# **Step 4 : Preparing synthetic dataset for promotional campaigns**

In [7]:
np.random.seed(42)
num_rows = 100000

# Campaign Type Dataset
campaign_ids = [np.random.randint(1000000, 9999999) for _ in range(num_rows)]
campaign_types = ['Flash Sale', 'Seasonal Sales', 'Bundle promotions', 'Mega Sales', 'Livestream Exclusive', 'Next Day Delivery']
campaign_costs = np.random.randint(1000, 10000, size=num_rows)

# Unique session ids
unique_session_ids = np.random.choice(range(1, 999999), size=num_rows, replace=False)
session_ids = np.random.choice(unique_session_ids, size=num_rows, replace=False)

# Generate customer ids based on customer_id in the customer_df
customer_ids = np.random.choice(customers_df['customer_id'], size=num_rows, replace=True)

# Generate gender & age columns for the customer
gender = np.random.choice(['Male', 'Female'], size=num_rows)
years = ['0-17 years', '18-24 years', '25-34 years', '35-44 years', '45-54 years', '55-64 years', '65 years and older']
age_group_probabilities = [0.04, 0.20, 0.33, 0.25, 0.12, 0.06, 0.00]  #increase probability for '25-34 years'
age_group = np.random.choice(years, size=num_rows, p=age_group_probabilities)

# CTR
marketing_channel_probabilities = {
    'In-App': [0.2, 0.80],
    'Social Media': [0.1, 0.9],
    'KOL': [0.08, 0.92],
    'Email': [0.05, 0.95],
    'Website': [0.05, 0.95],
    'SMS': [0.02, 0.98]
}

# Simulate campaign data with marketing channels
channels = np.random.choice(list(marketing_channel_probabilities.keys()), size=num_rows)

# Generate is_click based on the probabilities for each channel
is_click = []
for channel in channels:
    click_prob = marketing_channel_probabilities[channel]
    is_click.append(np.random.choice([1, 0], p=click_prob))

# Convert to a NumPy array
is_click = np.array(is_click)

# Define key dates for Mega Sales and Seasonal Sales, increasing priority for 2019-11-11
mega_sales_dates = pd.to_datetime(['2019-01-01', '2019-02-02', '2019-03-03', '2019-04-04', '2019-05-05',
                                   '2019-06-06', '2019-07-07', '2019-08-08', '2019-09-09', '2019-10-10',
                                   '2019-11-11', '2019-12-12'])

seasonal_sales_dates = pd.to_datetime(['2019-01-01', '2019-02-05', '2019-04-19', '2019-05-01', '2019-06-05',
                                       '2019-06-07', '2019-08-09', '2019-08-11', '2019-10-06', '2019-10-27', '2019-11-29', '2019-12-02',
                                       '2019-12-25', '2019-12-31'])

# Initialize lists for start and end dates
start_dates = []
end_dates = []
campaign_type_choices = []

# Define available dates excluding mega_sales_dates and seasonal_sales_dates
excluded_dates = list(mega_sales_dates) + list(seasonal_sales_dates)
available_dates = pd.date_range('2019-01-01', '2019-12-31', freq='D').difference(excluded_dates)

# Adjust the loop to select from available_dates for non-special campaigns
for _ in range(num_rows):
    campaign_type = np.random.choice(campaign_types, p=[0.1, 0.1, 0.1, 0.4, 0.15, 0.15])  # Higher probability for Mega Sales
    campaign_type_choices.append(campaign_type)

    if campaign_type == 'Mega Sales':
        if np.random.rand() < 0.5:
            date = pd.Timestamp('2019-11-11')
        else:
            date = np.random.choice(mega_sales_dates)
        start_dates.append(date - pd.Timedelta(days=1))
        end_dates.append(date + pd.Timedelta(days=1))
    elif campaign_type == 'Seasonal Sales':
        index = np.random.randint(len(seasonal_sales_dates))
        start_dates.append(seasonal_sales_dates[index] - pd.Timedelta(days=1))
        end_dates.append(seasonal_sales_dates[index] + pd.Timedelta(days=1))
    else:
        # Select a random date from available_dates for other campaign types
        random_start = pd.Timestamp(np.random.choice(available_dates))
        start_dates.append(random_start)
        end_dates.append(random_start + pd.Timedelta(days=3)) # 3 days campaign

# Marketing channel, target audience, promotional type, and discount columns
marketing_channels = ['In-App', 'Social Media', 'KOL', 'Email', 'Website', 'SMS']
marketing_channel_prob = [0.4, 0.3, 0.2, 0.03, 0.05, 0.02]  # Higher probabilities for In-App, Social Media, and KOL

promotional_discount_types = np.random.choice(['Percentage', 'Fixed Amount'], size=num_rows)
promotional_discount_values = np.zeros(num_rows)

promotional_discount_values[promotional_discount_types == 'Percentage'] = np.random.randint(20, 61, size=(promotional_discount_types == 'Percentage').sum())
promotional_discount_values[promotional_discount_types == 'Fixed Amount'] = np.random.randint(2, 21, size=(promotional_discount_types == 'Fixed Amount').sum())

campaign_df = pd.DataFrame({
    'campaign_id': campaign_ids,
    'campaign_type': campaign_type_choices,
    'campaign_cost': campaign_costs,
    'session_id': session_ids,
    'start_date': start_dates,
    'end_date': end_dates,
    'marketing_channel': np.random.choice(marketing_channels, size=num_rows, p=marketing_channel_prob),
    'promotional_discount_type': promotional_discount_types,
    'promotional_discount_value': promotional_discount_values,
    'gender': gender,
    'age_group': age_group,
    'is_click': is_click,
    'customer_id': customer_ids
})

# Merge to get target_audience column
campaign_df = campaign_df.merge(customers_df[['customer_id', 'target_audience']], on='customer_id', how='left')
campaign_df['target_audience'].fillna('new customer', inplace=True)


print(campaign_df.head)
# Save the campaign DataFrame to CSV
campaign_df.to_csv('synthetic_campaign_data.csv', index=False)
len(campaign_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  campaign_df['target_audience'].fillna('new customer', inplace=True)


<bound method NDFrame.head of        campaign_id         campaign_type  campaign_cost  session_id  \
0          7423388     Next Day Delivery           2035       66967   
1          7550634            Flash Sale           9823      855986   
2          5304572            Mega Sales           9023      305478   
3          3234489     Bundle promotions           1448      428910   
4          8204212            Mega Sales           1520      993376   
...            ...                   ...            ...         ...   
99995      3965488            Mega Sales           6286      345093   
99996      5569161            Mega Sales           9400      771967   
99997      3412534     Next Day Delivery           5345      883913   
99998      4864695            Mega Sales           1427      196137   
99999      2692728  Livestream Exclusive           5375      969209   

      start_date   end_date marketing_channel promotional_discount_type  \
0     2019-01-12 2019-01-15               

100000

# **Step 5 : Preparing synthetic order & sales dataset**

In [8]:
np.random.seed(42)
num_rows = 100000

# Shopee Sales Dataset
target_categories = ['Home & Living', 'Health & Beauty', 'Mobile & Accessories']
category_sales_prob = products['main_category'].value_counts(normalize=True)
category_sales_prob.update(category_sales_prob[target_categories] * 2)
category_sales_prob = category_sales_prob / category_sales_prob.sum()
product_category_prob = products['main_category'].map(category_sales_prob)
product_category_prob = product_category_prob / product_category_prob.sum()

order_id = [np.random.randint(100000, 10000000) for _ in range(num_rows)]
unique_session_ids = campaign_df['session_id'].unique()
np.random.shuffle(unique_session_ids)
extended_session_ids = unique_session_ids[:100000]


duplicated_product_ids = np.random.choice(products['product_id'], size=num_rows, replace=True, p=product_category_prob)
duplicated_customer_ids = np.random.choice(campaign_df['customer_id'], size=num_rows, replace=True)
all_campaign_types = campaign_df['campaign_type'].unique().tolist() + ['Regular day']

# Define probabilities for each campaign type, including 'Regular day'
probabilities = [0.075, 0.075, 0.075, 0.4, 0.15, 0.15, 0.075]  # Ensure this matches the number of campaign types

duplicated_campaign_types = np.random.choice(all_campaign_types, size=num_rows, p=probabilities)# Set is_campaign based on the campaign type
is_campaign = np.where(duplicated_campaign_types == 'Regular day', 0, 1)

campaign_data = pd.DataFrame({
    'campaign_type': duplicated_campaign_types,
    'is_campaign': is_campaign
})

product_df = products[['product_id', 'main_category', 'price_actual', 'total_sold', 'title', 'Stock']].rename(columns={'price_actual': 'price'})

orders = pd.DataFrame({
    'order_id': order_id,
    'product_id': duplicated_product_ids,
    'session_id': extended_session_ids,
    'is_campaign': is_campaign,
    'campaign': duplicated_campaign_types
})

# Merge with product_df (product information)
orders = orders.merge(product_df, on='product_id', how='left')

# Merge with campaign_df (campaign information)
orders = orders.merge(campaign_df[['session_id','customer_id', 'campaign_type', 'promotional_discount_type', 'gender']], on='session_id', how='left')
orders['campaign'] = orders['campaign_type']
orders.drop(columns=['campaign_type'], inplace=True)

# Defining Price Bins and Labels
price_bins = [0, 20, 50, 100, 500, float('inf')]
price_labels = ['Low', 'Mid-Low', 'Mid', 'Mid-High', 'High']
orders['price_range'] = pd.cut(orders['price'], bins=price_bins, labels=price_labels)


high_priority_mega_sales_dates = ['2019-11-11', '2019-09-09', '2019-10-10', '2019-12-12']
additional_mega_sales_dates = [
    '2019-01-01', '2019-02-02', '2019-03-03', '2019-04-04', '2019-05-05',
    '2019-06-06', '2019-07-07', '2019-08-08'
]
other_seasonal_sales_dates = ['2019-01-01', '2019-02-05', '2019-04-19', '2019-05-01', '2019-06-05',
                                       '2019-06-07', '2019-08-09', '2019-08-11', '2019-10-06', '2019-10-27', '2019-12-02',
                                       '2019-12-25', '2019-12-31']

# Higher Sales on Black Friday
high_priority_seasonal_sales_dates = ['2019-11-29']
early_december_dates = pd.date_range('2019-12-01', '2019-12-24').to_list()

mega_sales_dates = pd.to_datetime(high_priority_mega_sales_dates + additional_mega_sales_dates)
seasonal_sales_dates = pd.to_datetime(high_priority_seasonal_sales_dates + early_december_dates)

exclude = list(mega_sales_dates) + list(seasonal_sales_dates)
regular_dates = pd.date_range('2019-01-01', '2019-12-31', freq='D').difference(exclude)

mega_priority_probs = [0.5 if date == '2019-11-11' else 0.2 if date in ['2019-09-09', '2019-10-10', '2019-12-12'] else 0.01 for date in mega_sales_dates]
seasonal_priority_probs = [0.3 if date == '2019-11-29' else 0.02 if date in other_seasonal_sales_dates else 0.4 / len(early_december_dates) for date in seasonal_sales_dates]
total_mega_prob = sum(mega_priority_probs)
mega_priority_probs = [p / total_mega_prob * 0.8 for p in mega_priority_probs]

total_seasonal_prob = sum(seasonal_priority_probs)
seasonal_priority_probs = [p / total_seasonal_prob * 0.2 for p in seasonal_priority_probs]

all_dates = list(mega_sales_dates) + list(seasonal_sales_dates) + list(regular_dates)
combined_probs = mega_priority_probs + seasonal_priority_probs + [0.01 / len(regular_dates)] * len(regular_dates)  # Small probability for regular dates

# Ensure probabilities sum to 1
combined_probs = np.array(combined_probs) / sum(combined_probs)

order_times = np.random.choice(all_dates, size=num_rows, p=combined_probs)
# Assign generated order times to the 'order_time' column in the 'orders' DataFrame
orders['order_time'] = order_times

orders['discount'] = 0

# Apply discounts for 'Percentage' type based on campaign type and order time
percentage_discount_conditions = (
    (orders['promotional_discount_type'] == 'Percentage') &
    (orders['is_campaign'] == 1)
)

for i in range(len(orders)):
    # Check if the row meets the percentage discount conditions
    if percentage_discount_conditions[i]:

        # Apply discount based on specific campaign conditions
        if orders.loc[i, 'order_time'] == '2019-11-11' and orders.loc[i, 'campaign'] == 'Mega Sales':
            orders.loc[i, 'discount'] = np.random.randint(20, 51)

        elif orders.loc[i, 'campaign'] == 'Flash Sale':
            orders.loc[i, 'discount'] = np.random.randint(15, 31)

        elif orders.loc[i, 'campaign'] == 'Seasonal Sales':
            orders.loc[i, 'discount'] = np.random.randint(20, 41)

        elif orders.loc[i, 'campaign'] == 'Bundle promotions':
            orders.loc[i, 'discount'] = np.random.randint(5, 16)

        elif orders.loc[i, 'campaign'] == 'Livestream Exclusive':
            orders.loc[i, 'discount'] = np.random.randint(25, 41)

        elif orders.loc[i, 'campaign'] == 'Next Day Delivery':
            orders.loc[i, 'discount'] = np.random.randint(0, 11)
        else:
            orders.loc[i, 'discount'] = 0
    else:
        # Set discount to 0 if no percentage discount condition is met
        orders.loc[i, 'discount'] = 0



# Apply discounts for 'Fixed Amount' type, ensuring discount is less than the product price
fixed_amount_discount_conditions = (
    (orders['promotional_discount_type'] == 'Fixed Amount') &
    (orders['is_campaign'] == 1)
)

# Generate discount values for Fixed Amount and ensure they do not exceed the product price
for idx in orders[fixed_amount_discount_conditions].index:
    max_discount = max(1, orders.loc[idx, 'price'] - 1)  # Ensure a minimum of 1 and max less than price
    orders.loc[idx, 'discount'] = np.random.randint(1, max_discount + 1)

print(orders.columns)

# Save the orders DataFrame to CSV
orders.to_csv('shopee_sales_df.csv', index=False)

Index(['order_id', 'product_id', 'session_id', 'is_campaign', 'campaign',
       'main_category', 'price', 'total_sold', 'title', 'Stock', 'customer_id',
       'promotional_discount_type', 'gender', 'price_range', 'order_time',
       'discount'],
      dtype='object')
