In [None]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
import os
import time
from typing import Tuple, Optional


# Seed handed to the train split and to synthetic-sample generation.
RANDOM_STATE = 42
# Neighbours requested per minority sample when the kNN graph is built.
N_NEIGHBORS = 5
# Total number of synthetic minority rows to generate.
N_SYNTHETIC_SAMPLES_TO_GENERATE = 250_000
# Number of synthetic rows produced per generation batch.
BATCH_SIZE = 10_000


In [14]:
# Raw, class-imbalanced input CSV.
IMBALANCED_DATA_PATH = '../data/01_raw/original_imbalanced.csv'
# Directory that receives this notebook's output file.
PROCESSED_DATA_DIR = '../data/02_processed/'
# Destination CSV for the majority + synthetic-minority dataset.
GRAPH_DRIVEN_SYNTHETIC_PATH = os.path.join(
    PROCESSED_DATA_DIR,
    'graph_driven_synthetic.csv',
)

In [None]:
# Prepare Data
def prepare_data(df_imbalanced: pd.DataFrame, random_state: int = 42) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, np.ndarray, list]:
    """Sample a training subset and extract the standardised minority class.

    A seeded 70% sample of ``df_imbalanced`` is taken, then split on the
    ``target`` column: rows with ``target == 1`` form the minority frame and
    rows with ``target == 0`` form the majority frame.

    Args:
        df_imbalanced: Input frame; must contain a ``target`` column.
        random_state: Seed for the 70% row sample.

    Returns:
        A 5-tuple ``(df_minority, df_majority, df_train, df_minority_scaled,
        numerical_cols)`` where

        * ``df_minority`` holds the ``target == 1`` rows with ``target`` dropped,
        * ``df_majority`` holds the ``target == 0`` rows with ``target`` kept,
        * ``df_train`` is the full 70% sample,
        * ``df_minority_scaled`` is the ``StandardScaler`` output for the
          minority rows' numeric columns (the scaler is fit on the minority
          rows only and is not returned),
        * ``numerical_cols`` lists the numeric column names of ``df_minority``.
    """
    print("Preparing data...")

    # Seeded 70% sample; the rows left out are not returned by this function.
    df_train = df_imbalanced.sample(frac=0.7, random_state=random_state)

    # Separate the classes; only the minority frame loses its label column.
    target = df_train['target']
    df_majority = df_train.loc[target == 0]
    df_minority = df_train.loc[target == 1].drop(columns='target')

    # Numeric columns are the ones that get scaled here.
    numerical_cols = df_minority.select_dtypes(include=np.number).columns.tolist()

    # Fit and apply the scaler in one step; the result is a plain ndarray.
    df_minority_scaled = StandardScaler().fit_transform(df_minority[numerical_cols])

    print(f"Minority samples: {len(df_minority)}")
    print(f"Majority samples: {len(df_majority)}")
    print(f"Numerical features: {len(numerical_cols)}")

    return df_minority, df_majority, df_train, df_minority_scaled, numerical_cols

# Load data
# Create the output directory up front so the final save cannot fail on a missing folder.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
df_imbalanced = pd.read_csv(IMBALANCED_DATA_PATH)
print(f"Loaded {len(df_imbalanced)} samples with {len(df_imbalanced.columns)} features")

# Split, separate the classes and scale the minority features in one call.
(
    df_minority,
    df_majority,
    df_train,
    df_minority_scaled,
    numerical_cols,
) = prepare_data(df_imbalanced, random_state=RANDOM_STATE)


Loaded 69554 samples with 41 features
Preparing data...
Minority samples: 3900
Majority samples: 44788
Numerical features: 30


In [None]:
# Build kNN Graph
def build_knn_graph(df_minority_scaled: np.ndarray, n_neighbors: int) -> Tuple[np.ndarray, np.ndarray]:
    """Query the k nearest neighbours of every minority sample.

    The model is fit on ``df_minority_scaled`` and then queried with the same
    array, asking for ``n_neighbors + 1`` neighbours per row.

    Args:
        df_minority_scaled: Feature matrix of the minority samples.
        n_neighbors: Number of neighbours wanted per sample.

    Returns:
        ``(distances, indices)`` exactly as returned by
        ``NearestNeighbors.kneighbors``; each has ``n_neighbors + 1`` columns.
    """
    print(f"Building kNN graph with {n_neighbors} neighbors...")

    # One extra neighbour is requested because the query set equals the fitted
    # set, so each point is expected to appear in its own neighbour list.
    k_including_self = n_neighbors + 1
    knn_model = NearestNeighbors(
        n_neighbors=k_including_self,
        algorithm='ball_tree',
        n_jobs=-1,
    ).fit(df_minority_scaled)
    neighbor_distances, neighbor_indices = knn_model.kneighbors(df_minority_scaled)

    print("kNN graph built successfully")
    return neighbor_distances, neighbor_indices

# Neighbour distances and positional indices for every minority sample.
distances, indices = build_knn_graph(
    df_minority_scaled,
    n_neighbors=N_NEIGHBORS,
)


Building kNN graph with 5 neighbors...
kNN graph built successfully


In [None]:
# Generate Synthetic Samples Batch
def generate_synthetic_batch(
    df_minority: pd.DataFrame,
    indices: np.ndarray,
    numerical_cols: list,
    batch_size: int,
    random_state: Optional[int] = None
) -> pd.DataFrame:
    """Generate one batch of synthetic minority rows by interpolating along kNN edges.

    For every synthetic row a minority sample (the "node") is drawn uniformly
    at random, one neighbour is drawn from ``indices[node][1:]``, and the
    numeric features become a random convex combination of node and
    neighbour. All remaining columns are copied from the node unchanged.

    Args:
        df_minority: Minority-class rows. Every column not listed in
            ``numerical_cols`` is treated as categorical and copied verbatim.
        indices: Neighbour table with one row per row of ``df_minority``,
            holding positional indices into ``df_minority``. Column 0 is
            skipped; presumably it is the sample itself (as in a kNN
            self-query) - verify if the data contains duplicate points.
        numerical_cols: Names of the columns to interpolate.
        batch_size: Number of synthetic rows to produce.
        random_state: Seed for a private random generator. With ``None`` the
            draws come from NumPy's global RNG in its current state.

    Returns:
        A DataFrame with ``batch_size`` rows: the numeric columns first, then
        the remaining ``df_minority`` columns, then ``target`` set to 1.

    Raises:
        ValueError: If ``indices`` does not have one row per minority sample,
            or if rows are requested from an empty minority set.
    """
    # A private RandomState replays exactly the stream that
    # ``np.random.seed(random_state)`` would produce, but leaves the global
    # NumPy RNG untouched so the caller's own randomness is not reset.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    minority_numerical = df_minority[numerical_cols].values
    minority_categorical = df_minority.drop(columns=numerical_cols)
    n_minority_samples = len(df_minority)

    # A neighbour table from a different (e.g. stale) minority frame would
    # silently pair unrelated rows, so reject it up front.
    if len(indices) != n_minority_samples:
        raise ValueError(
            f"indices has {len(indices)} rows but df_minority has "
            f"{n_minority_samples}; the kNN graph must be built from the same samples"
        )
    if batch_size > 0 and n_minority_samples == 0:
        raise ValueError("Cannot generate synthetic samples from an empty minority set")

    random_node_indices = rng.randint(0, n_minority_samples, size=batch_size)
    synthetic_numerical = np.zeros((batch_size, len(numerical_cols)))

    # The draw order (neighbour, then ratio, per row) is kept as-is so a given
    # seed keeps producing the same rows.
    for i, node_idx in enumerate(random_node_indices):
        neighbor_idx = rng.choice(indices[node_idx][1:])
        interpolation_ratio = rng.rand()
        synthetic_numerical[i] = (
            minority_numerical[node_idx] * interpolation_ratio +
            minority_numerical[neighbor_idx] * (1 - interpolation_ratio)
        )

    synthetic_df = pd.DataFrame(synthetic_numerical, columns=numerical_cols)

    if len(minority_categorical.columns) > 0:
        # Categorical values come from the node; the index is reset so the
        # column-wise concat aligns row by row.
        categorical_data = minority_categorical.iloc[random_node_indices].reset_index(drop=True)
        synthetic_df = pd.concat([synthetic_df, categorical_data], axis=1)

    synthetic_df['target'] = 1

    return synthetic_df


In [None]:
# Generate Synthetic Samples Optimized (Batch Processing)
def generate_synthetic_samples_optimized(
    df_minority: pd.DataFrame,
    df_majority: pd.DataFrame,
    df_train: pd.DataFrame,
    indices: np.ndarray,
    numerical_cols: list,
    n_samples: int,
    batch_size: int = 10000,
    random_state: int = 42
) -> pd.DataFrame:
    """Generate synthetic minority rows in batches and combine them with the majority rows.

    Batches are produced by ``generate_synthetic_batch``; batch ``i`` is
    seeded with ``random_state + i``. The synthetic rows are reordered to the
    column order of ``df_train``, appended to ``df_majority`` and the result
    is shuffled. The rows of ``df_minority`` themselves are not part of the
    returned frame - only rows synthesised from them are.

    Args:
        df_minority: Minority-class rows to interpolate between.
        df_majority: Majority-class rows copied into the output.
        df_train: Frame whose column order the synthetic rows must follow.
        indices: kNN neighbour table passed through to the batch generator.
        numerical_cols: Names of the columns to interpolate.
        n_samples: Total number of synthetic rows to generate (may be 0).
        batch_size: Maximum number of rows per batch; must be positive.
        random_state: Base seed for the batches and seed for the final shuffle.

    Returns:
        A shuffled DataFrame containing ``df_majority`` plus ``n_samples``
        synthetic rows, with a fresh 0..N-1 index.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``n_samples`` is negative.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if n_samples < 0:
        raise ValueError(f"n_samples must be non-negative, got {n_samples}")

    print(f"Generating {n_samples} synthetic samples in batches of {batch_size}...")

    synthetic_batches = []
    n_batches = (n_samples + batch_size - 1) // batch_size  # ceiling division

    # No global np.random.seed here: every batch receives its own explicit
    # seed below, so reseeding the process-wide RNG would only be a side effect.
    for batch_idx in range(n_batches):
        start_time = time.perf_counter()
        # The last batch may be smaller than batch_size.
        current_batch_size = min(batch_size, n_samples - batch_idx * batch_size)
        batch_random_state = random_state + batch_idx
        synthetic_batch = generate_synthetic_batch(df_minority, indices, numerical_cols, current_batch_size, batch_random_state)
        synthetic_batches.append(synthetic_batch)
        print(f"Batch {batch_idx + 1}/{n_batches} completed in {time.perf_counter() - start_time:.2f}s ({current_batch_size} samples)")

    frames = [df_majority]
    if synthetic_batches:
        # pd.concat rejects an empty list, so only combine when batches exist.
        df_synthetic = pd.concat(synthetic_batches, ignore_index=True)
        # Align the synthetic rows with the training frame's column order.
        df_synthetic = df_synthetic[df_train.columns]
        frames.append(df_synthetic)

    df_final = pd.concat(frames, ignore_index=True)
    df_final = df_final.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return df_final

# Majority rows plus the synthetic minority rows, shuffled.
df_graph_driven = generate_synthetic_samples_optimized(
    df_minority=df_minority,
    df_majority=df_majority,
    df_train=df_train,
    indices=indices,
    numerical_cols=numerical_cols,
    n_samples=N_SYNTHETIC_SAMPLES_TO_GENERATE,
    batch_size=BATCH_SIZE,
    random_state=RANDOM_STATE,
)


Generating 250000 synthetic samples in batches of 10000...
Batch 1/25 completed in 0.11s (10000 samples)
Batch 2/25 completed in 0.10s (10000 samples)
Batch 3/25 completed in 0.09s (10000 samples)
Batch 4/25 completed in 0.09s (10000 samples)
Batch 5/25 completed in 0.11s (10000 samples)
Batch 6/25 completed in 0.09s (10000 samples)
Batch 7/25 completed in 0.09s (10000 samples)
Batch 8/25 completed in 0.12s (10000 samples)
Batch 9/25 completed in 0.09s (10000 samples)
Batch 10/25 completed in 0.09s (10000 samples)
Batch 11/25 completed in 0.11s (10000 samples)
Batch 12/25 completed in 0.09s (10000 samples)
Batch 13/25 completed in 0.09s (10000 samples)
Batch 14/25 completed in 0.11s (10000 samples)
Batch 15/25 completed in 0.08s (10000 samples)
Batch 16/25 completed in 0.07s (10000 samples)
Batch 17/25 completed in 0.08s (10000 samples)
Batch 18/25 completed in 0.08s (10000 samples)
Batch 19/25 completed in 0.07s (10000 samples)
Batch 20/25 completed in 0.08s (10000 samples)
Batch 21/2

In [None]:
# Save & Inspect Results
# Write the combined dataset without the row index.
df_graph_driven.to_csv(GRAPH_DRIVEN_SYNTHETIC_PATH, index=False)
print(f"Synthetic dataset saved to {GRAPH_DRIVEN_SYNTHETIC_PATH}")

# Class counts before generation and in the final dataset.
target_values = df_graph_driven['target']
original_minority = len(df_minority)
final_minority = int((target_values == 1).sum())
final_majority = int((target_values == 0).sum())

# Summary report.
print("\n" + "="*60)
print("GRAPH-DRIVEN SYNTHETIC DATA GENERATION COMPLETE")
print("="*60)
print(f"Original minority samples: {original_minority:,}")
print(f"Generated synthetic samples: {N_SYNTHETIC_SAMPLES_TO_GENERATE:,}")
print("Final dataset composition:")
print(f"  - Minority class (target=1): {final_minority:,} samples")
print(f"  - Majority class (target=0): {final_majority:,} samples")
print(f"  - Total samples: {len(df_graph_driven):,}")
print(f"  - Class ratio: {final_minority/final_majority:.3f}")


Synthetic dataset saved to ../data/02_processed/graph_driven_synthetic.csv

GRAPH-DRIVEN SYNTHETIC DATA GENERATION COMPLETE
Original minority samples: 3,900
Generated synthetic samples: 250,000
Final dataset composition:
  - Minority class (target=1): 250,000 samples
  - Majority class (target=0): 44,788 samples
  - Total samples: 294,788
  - Class ratio: 5.582
