In [1]:
# save as: create_test_data.py
import pandas as pd
import numpy as np

# Test Dataset 1: Clean data (for baseline)
# The random columns come from a dedicated generator with a fixed seed, so
# re-running this script reproduces exactly the same baseline file instead
# of a different one on every run.
BASELINE_ROWS = 100
baseline_rng = np.random.default_rng(42)
baseline_ids = range(1, BASELINE_ROWS + 1)

df_baseline = pd.DataFrame({
    'id': baseline_ids,
    'name': [f'User_{i}' for i in baseline_ids],
    # integers() excludes the upper bound, matching np.random.randint.
    'age': baseline_rng.integers(20, 70, BASELINE_ROWS),
    'salary': baseline_rng.integers(30000, 120000, BASELINE_ROWS),
    'department': baseline_rng.choice(
        ['Sales', 'Engineering', 'HR', 'Marketing'], BASELINE_ROWS
    ),
    'score': baseline_rng.uniform(0, 100, BASELINE_ROWS).round(2),
})

# index=False keeps the pandas row index out of the CSV.
df_baseline.to_csv('baseline_data.csv', index=False)
print("✓ Created baseline_data.csv")

# Test Dataset 2: Data with issues (for comparison)
# The random columns come from a dedicated generator with a fixed seed, so
# re-running this script reproduces exactly the same file instead of a
# different one on every run.
CURRENT_ROWS = 120  # More rows
current_rng = np.random.default_rng(43)
current_ids = range(1, CURRENT_ROWS + 1)

df_current = pd.DataFrame({
    'id': current_ids,
    # Missing values: every 10th id gets no name.
    'name': [f'User_{i}' if i % 10 != 0 else None for i in current_ids],
    # integers() excludes the upper bound, matching np.random.randint.
    'age': current_rng.integers(20, 70, CURRENT_ROWS),
    'salary': current_rng.integers(30000, 120000, CURRENT_ROWS),
    # Extra category: 'IT'.
    'department': current_rng.choice(
        ['Sales', 'Engineering', 'HR', 'Marketing', 'IT'], CURRENT_ROWS
    ),
    'score': current_rng.uniform(0, 100, CURRENT_ROWS).round(2),
    # Extra column.
    'new_column': current_rng.choice(['A', 'B', 'C'], CURRENT_ROWS),
})

# index=False keeps the pandas row index out of the CSV.
df_current.to_csv('current_data.csv', index=False)
print("✓ Created current_data.csv")

# Test Dataset 3: Small file (should fail validation)
# Only 3 rows, which is below the MIN_ROWS=10 threshold cited for validation.
small_numbers = list(range(1, 4))
small_letters = list('abc')
df_small = pd.DataFrame({'col1': small_numbers, 'col2': small_letters})

# index=False keeps the pandas row index out of the CSV.
df_small.to_csv('small_data.csv', index=False)
print("✓ Created small_data.csv")

print("\n✅ All test datasets created!")

✓ Created baseline_data.csv
✓ Created current_data.csv
✓ Created small_data.csv

✅ All test datasets created!
