In [1]:
"""
Generate Training Dataset
=========================

Goal: Create 50 synthetic messy files for ML training
(the generation loop below currently writes 15: 5 clean datasets x 3 versions each)

Process:
1. Load clean datasets
2. Run generator on each
3. Save messy files + labels
4. Create train/validation/test splits
"""

# Standard library
import json
import sys
from pathlib import Path

# Third-party
import pandas as pd

# Make the local ml_pipeline package importable. The entry is relative to the
# kernel's working directory. The membership check keeps sys.path from gaining
# one more duplicate entry every time this cell is re-run.
ML_PIPELINE_DIR = '../ml_pipeline'
if ML_PIPELINE_DIR not in sys.path:
    sys.path.append(ML_PIPELINE_DIR)

# Local (resolved through the sys.path entry added above)
from data.synthetic_data_generator import SyntheticDataGenerator

# Status glyph restored from its mis-encoded form (UTF-8 bytes read as cp1252)
print("✅ Generator imported")

✅ Generator imported


In [2]:
"""
We'll use seaborn's built-in datasets as "clean" starting points
Then make them messy!
"""

import seaborn as sns

# The diamonds subsample is seeded so that Restart & Run All selects the same
# 200 rows every time; an unseeded .sample() draws different rows per run,
# which would make every file generated from it irreproducible.
DIAMONDS_SAMPLE_SIZE = 200
DIAMONDS_SAMPLE_SEED = 42

# Load multiple clean datasets
datasets = {
    'titanic': sns.load_dataset('titanic'),
    'tips': sns.load_dataset('tips'),
    'iris': sns.load_dataset('iris'),
    'diamonds': sns.load_dataset('diamonds').sample(
        n=DIAMONDS_SAMPLE_SIZE, random_state=DIAMONDS_SAMPLE_SEED
    ),  # Sample to keep small
    'planets': sns.load_dataset('planets'),
}

# Report the shape of each frame as a quick sanity check on the load
print("Clean datasets loaded:")
for name, df in datasets.items():
    print(f"  - {name}: {df.shape}")

Clean datasets loaded:
  - titanic: (891, 15)
  - tips: (244, 7)
  - iris: (150, 5)
  - diamonds: (200, 10)
  - planets: (1035, 6)


In [3]:
"""
For each clean dataset, generate multiple messy versions
with different problem combinations
"""

# Tunable generation settings, gathered in one place
MESSY_DIR = Path('../data/synthetic/messy')
LABELS_DIR = Path('../data/synthetic/labels')
VERSIONS_PER_DATASET = 3
BASE_SEED = 42

# Create output directories
MESSY_DIR.mkdir(parents=True, exist_ok=True)
LABELS_DIR.mkdir(parents=True, exist_ok=True)

# Generate messy datasets.
# all_labels collects one label dict per generated file (consumed when the
# master label file is written); file_counter numbers files across datasets.
all_labels = []
file_counter = 0

for dataset_name, clean_df in datasets.items():
    print(f"\n{'='*60}")
    print(f"Processing: {dataset_name}")
    print(f"{'='*60}")

    # Generate several messy versions of each dataset, one per seed.
    # The seed depends only on the version index, so every dataset is
    # processed with the same sequence of seeds.
    for version in range(VERSIONS_PER_DATASET):
        file_counter += 1

        # Initialize generator with a per-version seed
        generator = SyntheticDataGenerator(clean_df, seed=BASE_SEED + version)

        # Generate messy data together with the labels describing it
        messy_df, labels = generator.generate()

        # Save messy CSV
        filename = f"messy_{file_counter:03d}_{dataset_name}_v{version}.csv"
        filepath = MESSY_DIR / filename
        messy_df.to_csv(filepath, index=False)

        # Record provenance alongside whatever the generator returned
        labels['filename'] = filename
        labels['source_dataset'] = dataset_name
        labels['version'] = version

        # Save labels JSON. with_suffix swaps only the trailing extension,
        # whereas str.replace would rewrite every ".csv" occurring in the name.
        label_filepath = LABELS_DIR / Path(filename).with_suffix('.json').name
        with open(label_filepath, 'w', encoding='utf-8') as f:
            json.dump(labels, f, indent=2)

        all_labels.append(labels)

        print(f"  ✅ Generated: {filename}")

print(f"\n{'='*60}")
print(f"✅ Total files generated: {file_counter}")
print(f"{'='*60}")


Processing: titanic

🔧 Generating synthetic messy data...

✅ Injected 89 duplicate rows (10.0%)
✅ Created 1858 missing values (12.6% of cells)
✅ Created 49 fuzzy duplicates in ['sex', 'embarked']
✅ Added 3 outliers to column 'survived'
✅ Added 3 outliers to column 'pclass'
✅ Converted column 'survived' to string (type issue)

✅ Generation complete!
   Original: (891, 15)
   Messy: (980, 15)
   Problems: 6 types injected
  ✅ Generated: messy_001_titanic_v0.csv

🔧 Generating synthetic messy data...

✅ Injected 89 duplicate rows (10.0%)
✅ Created 1876 missing values (12.8% of cells)
✅ Created 49 fuzzy duplicates in ['sex', 'embarked']
✅ Added 3 outliers to column 'survived'
✅ Added 3 outliers to column 'pclass'
✅ Converted column 'survived' to string (type issue)

✅ Generation complete!
   Original: (891, 15)
   Messy: (980, 15)
   Problems: 6 types injected
  ✅ Generated: messy_002_titanic_v1.csv

🔧 Generating synthetic messy data...

✅ Injecte

In [4]:
"""
Combine all labels into one master file for easy loading
"""

# NOTE(review): the master file is written into the same directory as the
# per-file label JSONs, so any code that globs that directory for *.json
# has to skip it.
# Kept as a plain string so the path printed below is unchanged.
master_labels_path = '../data/synthetic/labels/master_labels.json'

# Explicit encoding so the written file does not depend on the platform default
with open(master_labels_path, 'w', encoding='utf-8') as f:
    json.dump(all_labels, f, indent=2)

# Status glyph restored from its mis-encoded form (UTF-8 bytes read as cp1252)
print(f"✅ Master labels saved: {master_labels_path}")
print(f"   Total labeled files: {len(all_labels)}")

✅ Master labels saved: ../data/synthetic/labels/master_labels.json
   Total labeled files: 15


In [8]:
"""
Quick check to make sure everything worked
"""

# Path, pd and json are already in scope from the notebook's first cell,
# so nothing is re-imported here.

# Load one messy file
sample_file = Path('../data/synthetic/messy/messy_001_titanic_v0.csv')
sample_df = pd.read_csv(sample_file)

# Fail loudly instead of reporting success on an empty file
assert not sample_df.empty, f"{sample_file} contains no rows"

print("📄 Sample messy file:")
print(f"   File: {sample_file}")
print(f"   Shape: {sample_df.shape}")
print("\nFirst few rows:")
print(sample_df.head())

# Build the label path from the CSV name; with_suffix swaps only the trailing
# extension, whereas str.replace would rewrite every ".csv" in the name.
label_file = Path('../data/synthetic/labels') / sample_file.with_suffix('.json').name
print(f"\nLooking for labels at: {label_file}")

with open(label_file, 'r', encoding='utf-8') as f:
    sample_labels = json.load(f)

# The generation loop stores the CSV's name under 'filename'; confirm the
# label file really describes the CSV loaded above.
assert sample_labels['filename'] == sample_file.name, (
    f"label file {label_file} describes {sample_labels['filename']!r}, "
    f"expected {sample_file.name!r}"
)

print("\n📊 Labels for this file:")
print(json.dumps(sample_labels['problems_injected'], indent=2))

print("\n✅ Success! Generated messy data with labels")

📄 Sample messy file:
   File: ..\data\synthetic\messy\messy_001_titanic_v0.csv
   Shape: (980, 15)

First few rows:
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0       0.0     3.0    male  22.0    1.0    0.0   7.2500        S  Third   
1       1.0     1.0  female  38.0    1.0    0.0  71.2833        C  First   
2       1.0     3.0  FEMALE  26.0    NaN    0.0      NaN        s  Third   
3       1.0     1.0  female  35.0    1.0    0.0  53.1000        S  First   
4       0.0     3.0    male  35.0    0.0    0.0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C          NaN   yes  False  
4    man        True  NaN          NaN    no   True  

Looking for labels at: ..\data\synthetic\labels\messy_001_titanic_v0.json

📊 Labels for this fil