# 150M-count Synthetic Iris Dataset — Python

## Libraries

In [10]:
import os

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

## Load the Iris dataset

In [11]:
# Fetch the bundled Iris data as pandas objects and keep the combined
# frame (feature columns plus the 'target' column) for the cells below.
iris = load_iris(as_frame=True)
iris_df = iris["frame"]

## Separate data by species

In [12]:
# One feature-only frame per class label (0, 1, 2); the 'target' column is
# dropped here and re-attached after the synthetic rows are generated.
species_code = iris_df['target']
setosa = iris_df.loc[species_code == 0].drop(columns='target')
versicolor = iris_df.loc[species_code == 1].drop(columns='target')
virginica = iris_df.loc[species_code == 2].drop(columns='target')

def generate_synthetic_data(original_df, num_samples, rng=None):
    """Draw synthetic rows by sampling every column from its own normal distribution.

    For each column of ``original_df`` the sample mean and sample standard
    deviation (pandas defaults) are computed, and ``num_samples`` values are
    drawn from ``Normal(mean, std)``.

    Columns are sampled independently of each other, so only the per-column
    mean and spread of the input are reproduced: correlations between columns
    are not preserved, and drawn values are unbounded (they can fall outside
    the observed range, including below zero).

    Parameters
    ----------
    original_df : pandas.DataFrame
        Frame whose columns support ``.mean()`` and ``.std()``.
    num_samples : int
        Number of synthetic rows to generate.
    rng : None, int, or numpy.random.Generator, optional
        Source of randomness. ``None`` (the default) keeps the original
        behaviour of drawing from NumPy's global random state, so a prior
        ``np.random.seed(...)`` call still controls the output. An int seed or
        a ``Generator`` makes the call reproducible without touching global
        state.

    Returns
    -------
    pandas.DataFrame
        ``num_samples`` rows with the same column names, in the same order,
        as ``original_df``.
    """
    # np.random (module) and Generator both expose normal(loc, scale, size),
    # so a single code path serves the legacy and the explicit-RNG case.
    sampler = np.random if rng is None else np.random.default_rng(rng)

    synthetic_data = {
        col: sampler.normal(
            loc=original_df[col].mean(),
            scale=original_df[col].std(),
            size=num_samples,
        )
        for col in original_df.columns
    }
    return pd.DataFrame(synthetic_data)

## Generate synthetic data for each species

In [13]:
# Seed NumPy's global random state so that Restart & Run All regenerates the
# exact same synthetic dataset (generate_synthetic_data draws from it).
SEED = 42
np.random.seed(SEED)

n_samples_per_species = 50_000_000  # Distribute the 150 million samples evenly

# NOTE: each call materialises n_samples_per_species rows per feature column
# in memory; lower n_samples_per_species for a quick trial run.
synthetic_setosa = generate_synthetic_data(setosa, n_samples_per_species)
synthetic_versicolor = generate_synthetic_data(versicolor, n_samples_per_species)
synthetic_virginica = generate_synthetic_data(virginica, n_samples_per_species)

## Add the target variable back

In [14]:
# Re-attach the class label to each synthetic frame, using the same codes
# the frames were split on: 0 = setosa, 1 = versicolor, 2 = virginica.
labelled_frames = (
    (0, synthetic_setosa),
    (1, synthetic_versicolor),
    (2, synthetic_virginica),
)
for species_label, species_frame in labelled_frames:
    species_frame['target'] = species_label

## Combine the synthetic datasets

In [15]:
# Stack the three per-species frames into one frame with a fresh 0..N-1 index.
species_frames = [synthetic_setosa, synthetic_versicolor, synthetic_virginica]
synthetic_iris = pd.concat(species_frames, ignore_index=True)

# The statistical properties of synthetic_iris should then be verified
# against the original iris_df.
print(f"Shape of the synthetic Iris dataset: {synthetic_iris.shape}")
synthetic_iris.head()

Shape of the synthetic Iris dataset: (150000000, 5)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.546403,3.42098,1.405668,0.095664,0
1,4.435805,3.161204,1.437168,0.261106,0
2,5.402873,3.18089,1.346026,0.17463,0
3,4.370459,3.780608,1.570134,0.154849,0
4,5.19734,2.911535,1.263559,0.257178,0


## Describe Both

### Original Iris Dataset

In [16]:
# Summary statistics of the original 150-row frame; the 'target' label
# column is included in the summary alongside the four measurements.
iris_df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


### Synthetic Iris Dataset

In [17]:
# Same summary for the synthetic frame, for side-by-side comparison with the
# original statistics.
synthetic_iris.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150000000.0,150000000.0,150000000.0,150000000.0,150000000.0
mean,5.843443,3.057337,3.758005,1.199331,1.0
std,0.8285518,0.4370409,1.760438,0.7602552,0.8164966
min,2.398222,1.013755,0.4833981,-0.3154583,0.0
25%,5.161807,2.747923,1.579206,0.3170724,0.0
50%,5.80749,3.025529,4.249221,1.323341,1.0
75%,6.446017,3.34241,5.215454,1.844526,2.0
max,10.09893,5.517331,8.595169,3.638684,2.0


In [19]:
iris_small_file = "iris.csv"
iris_big_file = "iris-150m.csv"

# Each CSV is written only when it does not exist yet. The data goes to a
# temporary sibling path first and is renamed into place afterwards, so an
# interrupted write (likely for the 150M-row file) never leaves a truncated
# file at the final path that a later run would mistake for a finished export.
csv_exports = (
    (iris_df, iris_small_file, "Small Iris data"),
    (synthetic_iris, iris_big_file, "Big Iris dataset"),
)
for export_frame, export_path, export_label in csv_exports:
    if os.path.exists(export_path):
        continue
    partial_path = export_path + ".tmp"
    export_frame.to_csv(partial_path, index=False)
    os.replace(partial_path, export_path)  # atomic rename on the same filesystem
    print(f"{export_label} saved to '{export_path}'.")

Small Iris data saved to 'iris.csv'.
Big Iris dataset saved to 'iris-150m.csv'.


In [None]:
# Shell escape: long-list every file in the working directory whose name
# starts with "iris" (needs a shell that provides `ls`).
!ls -l iris*