In [1]:

# Preparing Data
import pandas as pd
import numpy as np
import statistics
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error,mean_squared_log_error, roc_auc_score, accuracy_score, f1_score, precision_recall_curve, log_loss
# Load the rock mines dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data"
df = pd.read_csv(url)
column_names = ["sensor_" + str(i) for i in range(1, 61)] + ["target"]
df.columns = column_names
df = df.reset_index()
df = df.rename(columns={"index": "id"})
df['target'] = df['target'].map({'M': 1, 'R': 0})
x_data = df.iloc[:, :62]
x_data = x_data.drop('id', axis=1)
x_data = x_data.drop('target', axis=1)
y_data = df['target']

In [2]:
from sdv.metadata import SingleTableMetadata
import xgboost as xgb
from sdv.single_table import GaussianCopulaSynthesizer
from sklearn.ensemble import RandomForestRegressor
kf = StratifiedKFold(n_splits=10, random_state=None)
model_rf = xgb.XGBClassifier()
model_synth = xgb.XGBClassifier()
results_baseline = []
results_synth = []
for train_idx, val_idx in kf.split(x_data, y_data):
    X_train = x_data.iloc[train_idx]
    y_train = y_data.iloc[train_idx]
    X_val = x_data.iloc[val_idx]
    y_val = y_data.iloc[val_idx]
    #Baseline
    model_rf.fit(X_train,  y_train)
    y_pred = model_rf.predict(X_val)
    score = accuracy_score( y_val, y_pred)
    results_baseline.append(score)
    #Sythetic Data
    x_data_train_idx = pd.merge(X_train, y_train, left_index=True, right_index=True)
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(x_data_train_idx)
    synthesizer = GaussianCopulaSynthesizer(metadata)
    synthesizer.fit(x_data_train_idx)
    x_data_synth = synthesizer.sample(num_rows=200)
    y_train_synth = x_data_synth.iloc[:,-1]
    x_train_synth = x_data_synth.iloc[:, :60]
    model_synth.fit(x_train_synth, y_train_synth)
    y_pred = model_synth.predict(X_val)
    score = accuracy_score( y_val, y_pred)
    results_synth.append(score)
print(f'Score Baseline: {statistics.mean(results_baseline):.2f} (+/- {statistics.stdev(results_baseline) * 2:.2f})')
print(f'Score Synthetic Data: {statistics.mean(results_synth):.2f} (+/- {statistics.stdev(results_synth) * 2:.2f})')


Score Baseline: 0.76 (+/- 0.22)
Score Synthetic Data: 0.64 (+/- 0.16)
