In [1]:
import json
import logging
import os
import pickle
import random
import sys

from generator import generate_synthetic_data

### Setup Logging

In [2]:
# Reset the root logger and send records to stdout as "LEVEL: message".
# basicConfig installs the handler at DEBUG, but the root level is lowered to
# WARNING right after, so WARNING is the threshold that actually applies.
root_logger = logging.getLogger()
root_logger.handlers.clear()
logging.basicConfig(
    stream=sys.stdout,
    format='%(levelname)s: %(message)s',
    level=logging.DEBUG,
    force=True,
)
root_logger.setLevel(logging.WARNING)

### Load JSON and Additional Required Data

In [18]:
# JSON input handed to the generator as `json_data`
data_path = "../../synthetic_data/json/all_full.json"
with open(data_path, "r", encoding="utf-8") as json_file:
    json_data = json.load(json_file)

# Pickled object handed to the generator as `toc_noise`
# (pickle executes code on load: only read files produced by this project)
toc_noise_path = "../../synthetic_data/noise_toc.pkl"
with open(toc_noise_path, "rb") as noise_file:
    toc_noise = pickle.load(noise_file)

### Generate

In [19]:
# Noise levels to sweep over: the generator is called for every combination
# of the four lists below (nested loops = their Cartesian product).
subchapters_random_noise_param = [.1, .2, .3]
chapters_random_noise_param = [.1, .3]
# NOTE(review): .1 appears twice, so every combination with
# chapter_systemic_noise=.1 is generated twice -- confirm whether
# [.1, .2, .3] was intended before changing it (it alters the dataset).
chapters_systemic_noise_param = [.1, .1, .3]
add_subchapters_param = [.2, .3]

# Number of generator calls per noise configuration
n_iterations = 3
synthetic_data = []

# toc_noise is identical for every configuration, so it is logged once, and
# with lazy %-formatting so it is not rendered while INFO is disabled.
logging.info("Using TOC noise: %s", toc_noise)

for chapters_random_noise in chapters_random_noise_param:
    for subchapters_random_noise in subchapters_random_noise_param:
        for chapters_systemic_noise in chapters_systemic_noise_param:
            for add_subchapters in add_subchapters_param:
                # Keyword arguments forwarded to generate_synthetic_data
                randomness_parameters = {
                    "subchapter_random_noise": subchapters_random_noise,
                    "chapter_random_noise": chapters_random_noise,
                    "chapter_systemic_noise": chapters_systemic_noise,
                    "add_subchapter": add_subchapters
                }
                # Repeat the generation for the current noise configuration
                for iteration in range(n_iterations):
                    logging.info("Iteration %d of %d", iteration + 1, n_iterations)
                    # Each call returns an iterable of samples appended to the pool
                    synthetic_data.extend(generate_synthetic_data(
                        json_data=json_data,
                        toc_noise=toc_noise,
                        **randomness_parameters
                    ))
                    

In [7]:
# Print one generated sample (index 220), one element per line, for a visual check
sample_entry = synthetic_data[220]
for toc_line in sample_entry:
    print(toc_line)

1 Digital Marketing for E-commerce   1
1.1 Influencer Partnership Programs  1
1.2 Cross-Channel Marketing Integration 2
1.3 Retargeting and Remarketing 3
1.4 Product Photography and Videography   4
1.5 Content Marketing for E-commerce 5
 1.6 Social Media Marketing Integration  6
1.7 Conversion Rate Optimization  8
 1.8 Affiliate Marketing Networks 10
reengineering Process optimization strategies
 1.9 Search Engine Optimization for Products 11
1.10 User-Generated Content Campaigns 12
1.11 Email Marketing Automation 13
5635
1.12 Marketing Analytics and Attribution 14
1.13 Pay-Per-Click Advertising Strategies 15
2 Marketplace and Multi-Vendor Platforms 18
2.1 Dispute Resolution Mechanisms 18
2.2 Seller Performance Monitoring  19
2.3 Category Management Strategies 20
2.4 Unified Customer Experience  21
2.5 Multi-Vendor Inventory Management   22
2.6 Competitive Marketplace Strategies 26
2.7 International Marketplace Expansion 29
2.8 Commission and Fee Structure   31
 2.9 Product Quality Con

### Final Processing and Storage

In [20]:
# Total number of samples accumulated in synthetic_data by the generation loop
len(synthetic_data)

4536

In [21]:
# The generator was run independently for the full data and for the chapters
# data. After each run, keep the result under its own name by enabling the
# matching assignment below (only one of the two is active per run).
# The assignment binds the same list object; no copy is made.
data_full= synthetic_data   # result of the run on the full data
#data_chapters = synthetic_data

In [22]:
# Target size of the final dataset and the seed that makes the draw repeatable
n_data_to_keep = 15000
sample_seed = 42

# Both pools are bound by hand after separate generator runs; fail with an
# actionable message instead of a bare NameError when one of them is missing.
missing_pools = [name for name in ("data_full", "data_chapters") if name not in globals()]
if missing_pools:
    raise NameError(
        f"{', '.join(missing_pools)} not defined: run the generation cells once per "
        "input JSON and assign synthetic_data to the matching variable before sampling."
    )

all_data = data_full + data_chapters

# random.sample raises ValueError when asked for more items than the
# population holds, so never request more than what is available.
n_selected = min(n_data_to_keep, len(all_data))
if n_selected < n_data_to_keep:
    logging.warning(
        "Only %d samples available; keeping all of them instead of %d",
        len(all_data), n_data_to_keep,
    )

# A dedicated seeded generator gives the same subset on every run without
# touching the global random state.
final_data = random.Random(sample_seed).sample(all_data, n_selected)

In [23]:
# Persist the sampled dataset as a pickle file
output_path = "../../synthetic_data/training_dataset/synthetic_toc.pkl"
# Create the target folder first so the dump cannot fail on a missing directory
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "wb") as f:
    pickle.dump(final_data, f)

# Emitted only when the root logger allows INFO records
logging.info("Saved synthetic data to %s", output_path)
