In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
import os

In [2]:
# Configuration: every tunable of the synthetic dataset lives here.
N_SAMPLES = 128_000       # rows generated for the balanced dataset
N_FEATURES = 40           # total feature columns (numerical + categorical)
N_INFORMATIVE = 15        # forwarded to make_classification as n_informative
N_REDUDANT = 5            # forwarded as n_redundant; (sic) the generation cell uses this spelling
N_CATEGORICAL = 10        # how many of the N_FEATURES columns are categorical
CLASS_SEP = 0.8           # forwarded to make_classification as class_sep
FLIP_Y = 0.05             # forwarded to make_classification as flip_y
IMBALANCE_RATIO = 0.08    # minority-class share targeted by the imbalanced dataset

# Seed shared by make_classification and the pandas sampling steps.
RANDOM_STATE = 42

In [3]:
# Folder that receives both generated CSVs; the path is relative to the
# working directory the notebook runs from.
RAW_DATA_DIR = '../data/01_raw/'

# Create the folder up front so the later to_csv calls cannot fail on a
# missing directory (exist_ok makes re-running this cell harmless).
os.makedirs(RAW_DATA_DIR, exist_ok=True)

# One output file per dataset variant.
BALANCED_DATA_PATH, IMBALANCED_DATA_PATH = (
    os.path.join(RAW_DATA_DIR, csv_name)
    for csv_name in ('original_balanced.csv', 'original_imbalanced.csv')
)

# Generate the Core Numerical Dataset

In [4]:
print("Generating core numerical features...")

# The numerical part is whatever remains of the feature budget once the
# categorical columns (generated separately) have been reserved.
n_numerical = N_FEATURES - N_CATEGORICAL

# Binary problem with equal class weights; RANDOM_STATE makes the draw repeatable.
X, y = make_classification(
    n_samples=N_SAMPLES,
    n_features=n_numerical,
    n_informative=N_INFORMATIVE,
    n_redundant=N_REDUDANT,
    n_classes=2,
    class_sep=CLASS_SEP,
    flip_y=FLIP_Y,
    weights=[0.5, 0.5],
    random_state=RANDOM_STATE,
)

# Wrap the raw arrays in labelled frames: num_0 ... num_{k-1} plus 'target'.
numerical_columns = [f'num_{col_idx}' for col_idx in range(X.shape[1])]
df_numerical = pd.DataFrame(X, columns=numerical_columns)
df_target = pd.Series(y, name='target').to_frame()

Generating core numerical features...


# Generate Categorical Features

In [5]:
print("Generating categorical features...")

# Use a dedicated, seeded generator instead of the global np.random state:
# without a seed the categorical columns (and therefore both saved CSVs)
# differ on every Restart & Run All, even though everything else in the
# notebook is pinned to RANDOM_STATE.
rng = np.random.default_rng(RANDOM_STATE)

categorical_columns = {}
for i in range(N_CATEGORICAL):
    # Upper bound is exclusive, so each column gets between 3 and 14 levels.
    num_categories = int(rng.integers(3, 15))
    categories = [f'cat_{i}_val_{j}' for j in range(num_categories)]

    # Draw one level per row, uniformly at random and independent of the target.
    cat_data = rng.choice(categories, size=N_SAMPLES)
    categorical_columns[f'cat_{i}'] = pd.Series(cat_data, dtype='category')

# Build the frame in one go rather than growing an empty DataFrame column by
# column; dict insertion order keeps the columns as cat_0, cat_1, ...
df_categorical = pd.DataFrame(categorical_columns)

Generating categorical features...


# Combine into the final balanced dataset

In [6]:
# Lay the three pieces side by side: numerical features, categorical
# features, then the label as the last column.
balanced_parts = [df_numerical, df_categorical, df_target]
df_balanced = pd.concat(balanced_parts, axis=1)

# Report the shape and the relative class frequencies.
balanced_class_shares = df_balanced['target'].value_counts(normalize=True)
print(f"Generated balanced dataset with shape: {df_balanced.shape}")
print("Balanced class distribution:")
print(balanced_class_shares)

# Write the CSV without the index column.
df_balanced.to_csv(BALANCED_DATA_PATH, index=False)
print(f"Saved balanced dataset to {BALANCED_DATA_PATH}")

Generated balanced dataset with shape: (128000, 41)
Balanced class distribution:
target
1    0.500078
0    0.499922
Name: proportion, dtype: float64
Saved balanced dataset to ../data/01_raw/original_balanced.csv


# Create the Imbalanced version

In [7]:
print("\nCreating imbalanced dataset...")

# Fail fast on an unusable ratio: 1 would divide by zero below and values
# outside (0, 1) would produce a negative or meaningless sample size.
if not 0 < IMBALANCE_RATIO < 1:
    raise ValueError(
        f"IMBALANCE_RATIO must be strictly between 0 and 1, got {IMBALANCE_RATIO}"
    )

df_majority = df_balanced[df_balanced['target'] == 0]
df_minority = df_balanced[df_balanced['target'] == 1]

# Undersample the minority class.
# All majority rows are kept, so solving minority / (majority + minority) = r
# for the minority count gives majority * r / (1 - r).
n_minority_new = int(len(df_majority) * IMBALANCE_RATIO / (1 - IMBALANCE_RATIO))

# Sampling is without replacement, so the request cannot exceed what exists;
# raise a readable error instead of the generic one from DataFrame.sample.
if n_minority_new > len(df_minority):
    raise ValueError(
        f"IMBALANCE_RATIO={IMBALANCE_RATIO} needs {n_minority_new} minority rows "
        f"but only {len(df_minority)} are available"
    )

df_minority_sampled = df_minority.sample(n=n_minority_new, random_state=RANDOM_STATE)

# Concatenate, then shuffle all rows so the classes are interleaved.
df_imbalanced = pd.concat([df_majority, df_minority_sampled]).sample(frac=1, random_state=RANDOM_STATE)

print(f"Generated imbalanced dataset with shape: {df_imbalanced.shape}")
print("Imbalanced class distribution:")
print(df_imbalanced['target'].value_counts(normalize=True))

# Save the imbalanced dataset (index omitted, matching the balanced CSV).
df_imbalanced.to_csv(IMBALANCED_DATA_PATH, index=False)
print(f"Saved imbalanced dataset to {IMBALANCED_DATA_PATH}")


Creating imbalanced dataset...
Generated imbalanced dataset with shape: (69554, 41)
Imbalanced class distribution:
target
0    0.920005
1    0.079995
Name: proportion, dtype: float64
Saved imbalanced dataset to ../data/01_raw/original_imbalanced.csv
