# Synthetic Data Generation 

## A. On the Penguins Dataset

In [12]:
import pipeline_datasynthesizer
import pipeline_smote

# Configuration for the penguins experiments in section A.
# Dataset folder name; interpolated into the data/<name>/... paths below.
DATASET1 = "penguins"
# Column passed as `target_col` to both pipelines.
TARGET1 = "sex"
# Columns passed as `categorical_cols` to both pipelines.
CATEGORICAL_COLS1 = ['species', 'island']

### i. DataSynthesizer Independent Mode

In [13]:
# Minority-class proportions to generate; the pipeline is invoked once per value.
proportions = [0.02, 0.05, 0.1, 0.2, 0.25, 0.3]

# Keyword arguments shared by every independent-mode run on the penguins
# data; only `target_proportion` changes between iterations.
penguins_independent_kwargs = dict(
    input_file=f'data/{DATASET1}/original/train.csv',
    target_col=TARGET1,
    output_dir=f'data/{DATASET1}/independent',
    categorical_cols=CATEGORICAL_COLS1,
    mode='independent',
)

for proportion in proportions:
    pipeline_datasynthesizer.process_minority_proportion(
        target_proportion=proportion,
        **penguins_independent_kwargs,
    )

Created data/penguins/independent/minority_2pct.csv (minority proportion: 2.0%)
Created data/penguins/independent/minority_5pct.csv (minority proportion: 5.0%)
Created data/penguins/independent/minority_10pct.csv (minority proportion: 10.0%)
Created data/penguins/independent/minority_20pct.csv (minority proportion: 20.0%)
Created data/penguins/independent/minority_25pct.csv (minority proportion: 25.0%)
Created data/penguins/independent/minority_30pct.csv (minority proportion: 30.0%)


### ii. DataSynthesizer Correlated Mode

In [14]:
# Minority-class proportions to generate; the pipeline is invoked once per value.
proportions = [0.02, 0.05, 0.1, 0.2, 0.25, 0.3]


def _run_penguins_correlated(share):
    """Run the DataSynthesizer pipeline in correlated mode for one proportion."""
    source_csv = f'data/{DATASET1}/original/train.csv'
    destination = f'data/{DATASET1}/correlated'
    pipeline_datasynthesizer.process_minority_proportion(
        input_file=source_csv,
        target_col=TARGET1,
        target_proportion=share,
        output_dir=destination,
        categorical_cols=CATEGORICAL_COLS1,
        mode='correlated',
    )


for proportion in proportions:
    _run_penguins_correlated(proportion)

Adding ROOT culmen_depth_mm


Adding attribute culmen_length_mm
Adding attribute sex
Created data/penguins/correlated/minority_2pct.csv (minority proportion: 2.0%)
Adding ROOT culmen_depth_mm
Adding attribute culmen_length_mm
Adding attribute flipper_length_mm
Adding attribute species
Adding attribute island
Adding attribute sex
Created data/penguins/correlated/minority_5pct.csv (minority proportion: 5.0%)
Adding ROOT sex
Adding attribute species
Adding attribute culmen_depth_mm
Adding attribute culmen_length_mm
Adding attribute body_mass_g
Adding attribute flipper_length_mm
Adding attribute island
Created data/penguins/correlated/minority_10pct.csv (minority proportion: 10.0%)
Adding ROOT sex
Adding attribute species
Adding attribute body_mass_g
Adding attribute culmen_depth_mm
Adding attribute flipper_length_mm
Adding attribute culmen_length_mm
Adding attribute island
Created data/penguins/correlated/minority_20pct.csv (minority proportion: 20.0%)
Adding ROOT sex
Adding attribute species
Adding attribute flipper_length_mm
[... remaining output truncated ...]

### iii. SMOTE

In [15]:
# Defined here (same values as the DataSynthesizer cells) so this cell runs
# on its own instead of relying on `proportions` left behind by an earlier cell.
proportions = [0.02, 0.05, 0.1, 0.2, 0.25, 0.3]

# Run the SMOTE pipeline once per initial minority proportion on the
# penguins training data. Per the recorded output, each run writes one
# CSV under data/penguins/smote/.
for proportion in proportions:
    pipeline_smote.process_minority_proportion(
        input_file=f'data/{DATASET1}/original/train.csv',
        target_col=TARGET1,
        target_proportion=proportion,
        output_dir=f'data/{DATASET1}/smote',
        categorical_cols=CATEGORICAL_COLS1,
    )

Created data/penguins/smote/minority_2pct_smote.csv (initial minority proportion: 2.0%, balanced to 50-50 with SMOTE)
Created data/penguins/smote/minority_5pct_smote.csv (initial minority proportion: 5.0%, balanced to 50-50 with SMOTE)
Created data/penguins/smote/minority_10pct_smote.csv (initial minority proportion: 10.0%, balanced to 50-50 with SMOTE)
Created data/penguins/smote/minority_20pct_smote.csv (initial minority proportion: 20.0%, balanced to 50-50 with SMOTE)
Created data/penguins/smote/minority_25pct_smote.csv (initial minority proportion: 25.0%, balanced to 50-50 with SMOTE)
Created data/penguins/smote/minority_30pct_smote.csv (initial minority proportion: 30.0%, balanced to 50-50 with SMOTE)


## B. On the Fitness Dataset

In [16]:

import pipeline_datasynthesizer
import pipeline_smote

# Configuration for the fitness experiments in section B.
# Dataset folder name; interpolated into the data/<name>/... paths below.
DATASET2 = "fitness"
# Column passed as `target_col` to both pipelines.
TARGET2 = "is_fit"
# Columns passed as `categorical_cols` to both pipelines.
CATEGORICAL_COLS2 = ['smokes', 'gender']

### i. DataSynthesizer Independent Mode

In [17]:
# Minority-class proportions to generate; the pipeline is invoked once per value.
proportions = [0.02, 0.05, 0.1, 0.2, 0.25, 0.3]

# Keyword arguments shared by every independent-mode run on the fitness
# data; only `target_proportion` changes between iterations.
fitness_independent_kwargs = dict(
    input_file=f'data/{DATASET2}/original/train.csv',
    target_col=TARGET2,
    output_dir=f'data/{DATASET2}/independent',
    categorical_cols=CATEGORICAL_COLS2,
    mode='independent',
)

for proportion in proportions:
    pipeline_datasynthesizer.process_minority_proportion(
        target_proportion=proportion,
        **fitness_independent_kwargs,
    )

Created data/fitness/independent/minority_2pct.csv (minority proportion: 2.0%)
Created data/fitness/independent/minority_5pct.csv (minority proportion: 5.0%)
Created data/fitness/independent/minority_10pct.csv (minority proportion: 10.0%)
Created data/fitness/independent/minority_20pct.csv (minority proportion: 20.0%)
Created data/fitness/independent/minority_25pct.csv (minority proportion: 25.0%)
Created data/fitness/independent/minority_30pct.csv (minority proportion: 30.0%)


### ii. DataSynthesizer Correlated Mode

In [18]:
# Minority-class proportions to generate; the pipeline is invoked once per value.
proportions = [0.02, 0.05, 0.1, 0.2, 0.25, 0.3]


def _run_fitness_correlated(share):
    """Run the DataSynthesizer pipeline in correlated mode for one proportion."""
    source_csv = f'data/{DATASET2}/original/train.csv'
    destination = f'data/{DATASET2}/correlated'
    pipeline_datasynthesizer.process_minority_proportion(
        input_file=source_csv,
        target_col=TARGET2,
        target_proportion=share,
        output_dir=destination,
        categorical_cols=CATEGORICAL_COLS2,
        mode='correlated',
    )


for proportion in proportions:
    _run_fitness_correlated(proportion)

Adding ROOT activity_index
Adding attribute heart_rate
Adding attribute nutrition_quality
Adding attribute blood_pressure
Adding attribute height_cm
Adding attribute age
Adding attribute sleep_hours
Adding attribute smokes
Adding attribute gender
Adding attribute is_fit
Created data/fitness/correlated/minority_2pct.csv (minority proportion: 2.0%)
Adding ROOT nutrition_quality
Adding attribute height_cm
Adding attribute sleep_hours
Adding attribute heart_rate
Adding attribute age
Adding attribute blood_pressure
Adding attribute activity_index
Adding attribute weight_kg
Adding attribute smokes
Adding attribute gender
Adding attribute is_fit
Created data/fitness/correlated/minority_5pct.csv (minority proportion: 5.0%)
Adding ROOT nutrition_quality
Adding attribute sleep_hours
Adding attribute age
Adding attribute height_cm
Adding attribute blood_pressure
Adding attribute activity_index
Adding attribute heart_rate
Adding attribute weight_kg
Adding attribute smokes
Adding attribute gender
Adding attribute is_fit
[... remaining output truncated ...]

### iii. SMOTE

In [19]:
# Defined here (same values as the DataSynthesizer cells) so this cell runs
# on its own instead of relying on `proportions` left behind by an earlier cell.
proportions = [0.02, 0.05, 0.1, 0.2, 0.25, 0.3]

# Run the SMOTE pipeline once per initial minority proportion on the
# fitness training data. Per the recorded output, each run writes one
# CSV under data/fitness/smote/.
for proportion in proportions:
    pipeline_smote.process_minority_proportion(
        input_file=f'data/{DATASET2}/original/train.csv',
        target_col=TARGET2,
        target_proportion=proportion,
        output_dir=f'data/{DATASET2}/smote',
        categorical_cols=CATEGORICAL_COLS2,
    )

Created data/fitness/smote/minority_2pct_smote.csv (initial minority proportion: 2.0%, balanced to 50-50 with SMOTE)
Created data/fitness/smote/minority_5pct_smote.csv (initial minority proportion: 5.0%, balanced to 50-50 with SMOTE)
Created data/fitness/smote/minority_10pct_smote.csv (initial minority proportion: 10.0%, balanced to 50-50 with SMOTE)
Created data/fitness/smote/minority_20pct_smote.csv (initial minority proportion: 20.0%, balanced to 50-50 with SMOTE)
Created data/fitness/smote/minority_25pct_smote.csv (initial minority proportion: 25.0%, balanced to 50-50 with SMOTE)
Created data/fitness/smote/minority_30pct_smote.csv (initial minority proportion: 30.0%, balanced to 50-50 with SMOTE)
