In [8]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
import os

# --- Tunable parameters ------------------------------------------------------
# Seed passed as `random_state` to the pandas sampling calls in this notebook.
RANDOM_STATE = 42

# Number of neighbours each minority sample is linked to in the kNN graph.
N_NEIGHBORS = 5

# Total number of synthetic minority rows to create.
N_SYNTHETIC_SAMPLES_TO_GENERATE = 250_000

In [9]:
# --- File locations (relative to the notebook's working directory) ----------
# Directory that receives the generated dataset.
PROCESSED_DATA_DIR = '../data/02_processed/'

# Source CSV containing the original, imbalanced data.
IMBALANCED_DATA_PATH = '../data/01_raw/original_imbalanced.csv'

# Destination CSV for the majority rows plus the synthetic minority rows.
GRAPH_DRIVEN_SYNTHETIC_PATH = os.path.join(
    PROCESSED_DATA_DIR,
    'graph_driven_synthetic.csv',
)

In [10]:
# Read the raw data and draw a seeded 70% sample to work with.
df_imbalanced = pd.read_csv(IMBALANCED_DATA_PATH)
df_train = df_imbalanced.sample(frac=0.7, random_state=RANDOM_STATE)

# Split by class label. The minority frame (target == 1) drops the label
# column so only features remain; the majority frame (target == 0) keeps it.
df_minority = df_train.loc[df_train['target'] == 1].drop(columns='target')
df_majority = df_train.loc[df_train['target'] == 0]

# Standardise the numeric minority features; the scaled array is what the
# neighbour search operates on.
numerical_cols = df_minority.select_dtypes(include='number').columns
scaler = StandardScaler()
df_minority_scaled = scaler.fit_transform(df_minority[numerical_cols])

In [11]:
# Unscaled numeric minority features; interpolation is done on these values
# while neighbours are looked up in the standardised space.
minority_numpy = df_minority[numerical_cols].to_numpy()
synthetic_samples = []

print("Building kNN graph from minority samples...")
# One extra neighbour is requested because the fitted points are also the
# query points; the sampling step later skips column 0 of `indices`.
nn = NearestNeighbors(algorithm='ball_tree', n_neighbors=N_NEIGHBORS + 1)
nn.fit(df_minority_scaled)
distances, indices = nn.kneighbors(df_minority_scaled)

print(f"Generating {N_SYNTHETIC_SAMPLES_TO_GENERATE} synthetic samples...")

Building kNN graph from minority samples...
Generating 250000 synthetic samples...


In [12]:
# Generate synthetic minority rows by interpolating along edges of the kNN
# graph, then combine them with the majority rows and write the result to CSV.
#
# All random draws come from a dedicated generator seeded with RANDOM_STATE,
# so the saved dataset is reproducible and re-running this cell gives the same
# rows. The draws are vectorised: one NumPy call per quantity instead of one
# Python iteration (and one pandas Series/concat) per synthetic row.
rng = np.random.default_rng(RANDOM_STATE)

# Base minority row for every synthetic sample.
base_idx = rng.integers(0, len(df_minority_scaled), size=N_SYNTHETIC_SAMPLES_TO_GENERATE)

# For each base row pick one neighbour column uniformly at random. Column 0 is
# skipped, matching the `[1:]` slice of the per-row version (the first hit is
# expected to be the query point itself).
neighbor_slot = rng.integers(1, indices.shape[1], size=N_SYNTHETIC_SAMPLES_TO_GENERATE)
neighbor_idx = indices[base_idx, neighbor_slot]

# One interpolation ratio per synthetic row, shaped (n, 1) so it broadcasts
# across all numeric features of that row.
interpolation_ratio = rng.random((N_SYNTHETIC_SAMPLES_TO_GENERATE, 1))
synthetic_numerical = (
    minority_numpy[base_idx] * interpolation_ratio
    + minority_numpy[neighbor_idx] * (1 - interpolation_ratio)
)

# Numeric features are interpolated; every non-numeric feature is copied
# unchanged from the base row.
df_synthetic_numerical = pd.DataFrame(synthetic_numerical, columns=numerical_cols)
df_synthetic_categorical = (
    df_minority.drop(columns=numerical_cols)
    .iloc[base_idx]
    .reset_index(drop=True)
)
df_synthetic = pd.concat([df_synthetic_numerical, df_synthetic_categorical], axis=1)
df_synthetic['target'] = 1
df_synthetic = df_synthetic[df_train.columns]  # enforce column order

# Majority rows + synthetic minority rows, shuffled with a fixed seed.
df_graph_driven = pd.concat([df_majority, df_synthetic]).sample(frac=1, random_state=RANDOM_STATE)

# Make sure the output directory exists so the write does not fail on a fresh
# checkout.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
df_graph_driven.to_csv(GRAPH_DRIVEN_SYNTHETIC_PATH, index=False)
print(f"Saved graph driven sunthetic dataset to {GRAPH_DRIVEN_SYNTHETIC_PATH}")

Saved graph driven sunthetic dataset to ../data/02_processed/graph_driven_synthetic.csv
