In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from pydvl.value import compute_data_oob
from pydvl.utils import Dataset, Utility, Scorer
from pydvl.reporting.scores import compute_removal_score
from pydvl.reporting.plots import shaded_mean_std, plot_best_worst, plot_best_worst_class_imbalance, compute_best_worst_scores, plot_methods_linreg
from pydvl.value.result import ValuationResult

from sklearn.datasets import load_iris, fetch_kddcup99, fetch_openml
import pandas as pd

In [5]:
from sdv.datasets.demo import download_demo

real_data, metadata = download_demo(
    modality='single_table',
    dataset_name='fake_hotel_guests'
)

In [6]:
metadata

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "guest_email": {
            "sdtype": "email",
            "pii": true
        },
        "has_rewards": {
            "sdtype": "boolean"
        },
        "room_type": {
            "sdtype": "categorical"
        },
        "amenities_fee": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "checkin_date": {
            "sdtype": "datetime",
            "datetime_format": "%d %b %Y"
        },
        "checkout_date": {
            "sdtype": "datetime",
            "datetime_format": "%d %b %Y"
        },
        "room_rate": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "billing_address": {
            "sdtype": "address",
            "pii": true
        },
        "credit_card_number": {
            "sdtype": "credit_card_number",
            "pii": true
        }
    },
    "primary_key": "guest_e

In [44]:
import pandas as pd
from sklearn.datasets import load_iris

data = Dataset.from_sklearn(load_iris())

In [48]:
from sklearn.mixture import GaussianMixture

In [64]:
indexes = np.arange(100)
indexes

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [65]:
import random

In [95]:
def sample_bag(indexes, max_samples):
    n_samp = int(max_samples*len(indexes))
    return(random.choices(indexes, k=n_samp))

In [None]:
sample_bag(indexes, 0.3)

In [98]:
n_estimators = 4
max_samples = 0.6

bags_indexes = [sample_bag(indexes, 0.05) for _ in range(n_estimators)]
bags_indexes

[[41, 40, 84, 74, 16],
 [47, 98, 62, 88, 76],
 [27, 76, 84, 0, 50],
 [47, 8, 78, 86, 37]]

In [105]:
estimators = [GaussianMixture().fit(data.x_train[indx]) for indx in bags_indexes]
estimators
    

[GaussianMixture(), GaussianMixture(), GaussianMixture(), GaussianMixture()]

In [118]:
estimators_AD = [IsolationForest().fit(data.x_train[indx]) for indx in bags_indexes]
estimators_AD
    

[IsolationForest(), IsolationForest(), IsolationForest(), IsolationForest()]

In [110]:

from sklearn.ensemble import IsolationForest

In [119]:
for est, bag_idx in zip(estimators_AD, bags_indexes):
    oob_idxs = np.setxor1d(indexes, bag_idx)
    oob_points = data.x_train[oob_idxs]
    print(est.predict(oob_points))
    break

[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
 -1  1  1  1  1  1 -1 -1  1  1  1  1 -1 -1  1  1  1  1  1  1 -1  1  1  1
  1  1  1  1 -1  1  1  1  1  1  1  1  1  1 -1  1  1  1  1  1  1 -1  1  1
 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1  1 -1 -1  1  1]


In [None]:
for est, bag_idx in zip(estimators, bags_indexes):
    oob_idxs = np.setxor1d(indexes, bag_idx)
    oob_points = data.x_train[oob_idxs]
    ad = IsolationForest().fit(oob_points)
    test_pt = est.sample(1)
    ad_pred = ad.predict(test_pt[0])
    print(ad_pred)
    print(test_pt)
#    print(est.predict([oob_p]))
    break
#    est.predict()

[-1]
(array([[5.55101003, 3.54343267, 3.49945813, 1.36134952]]), array([0]))


In [90]:
%%timeit
sample_bag(indexes, 40)

8.67 µs ± 663 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [None]:

def compute_data_oob_generative(
    u: Utility,
    n_est: int = 10,
    max_samples: float = 0.8,
    n_jobs: int = None,
    loss: Callable = None,
    *,
    progress: bool = False,
) -> ValuationResult:
    r"""
    """

    result: ValuationResult[np.int_, np.float_] = ValuationResult.empty(
        algorithm="data_oob", indices=u.data.indices, data_names=u.data.data_names
    )

    bag = BaggingClassifier(
        u.model, n_estimators=n_est, max_samples=max_samples, n_jobs=n_jobs
    )
    loss = point_wise_accuracy

    bag.fit(u.data.x_train, u.data.y_train)

    for est, samples in maybe_progress(
        zip(bag.estimators_, bag.estimators_samples_), progress, total=n_est
    ):  # The bottleneck is the bag fitting not this part so TQDM is not very useful here
        oob_idx = np.setxor1d(u.data.indices, np.unique(samples))
        array_loss = loss(
            preds=est.predict(u.data.x_train[oob_idx]), y=u.data.y_train[oob_idx]
        )
        result += ValuationResult(
            algorithm="data_oob",
            indices=oob_idx,
            values=array_loss,
            counts=np.ones_like(array_loss, dtype=u.data.indices.dtype),
        )
    return result

In [60]:
m = GaussianMixture()
m.fit(data.x_train)
a = m.sample(100)
a[1]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [62]:
1=2

SyntaxError: cannot assign to literal (3293032332.py, line 1)

In [23]:
metadata = {"METADATA_SPEC_VERSION":"SINGLE_TABLE_V1",
            "columns":{col:{"sdtype":"numerical","computer_representation":"Float"} for col in data.feature_names}}

In [28]:
metadata

{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'sepal length (cm)': {'sdtype': 'numerical',
   'computer_representation': 'Float'},
  'sepal width (cm)': {'sdtype': 'numerical',
   'computer_representation': 'Float'},
  'petal length (cm)': {'sdtype': 'numerical',
   'computer_representation': 'Float'},
  'petal width (cm)': {'sdtype': 'numerical',
   'computer_representation': 'Float'}}}

In [4]:
from sdv.single_table import GaussianCopulaSynthesizer

In [27]:
synthesizer = GaussianCopulaSynthesizer(metadata)

TypeError: __init__() missing 1 required positional argument: 'metadata'

In [25]:

synthesizer.fit(real_data)

AttributeError: 'dict' object has no attribute 'validate'

In [29]:
! python3 -m pip install ctgan



In [38]:
real_data.shape

(32561, 15)

In [42]:
from sklearn.mixture import GaussianMixture

In [43]:
m = GaussianMixture()
m.fit(real_data)

ValueError: could not convert string to float: ' Private'

In [39]:
from ctgan import CTGAN, TVAE
from ctgan import load_demo

real_data = load_demo().sample(1000)

# Names of the columns that are discrete
discrete_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
    'income'
]

#ctgan = CTGAN(epochs=10)
ctgan = TVAE()

ctgan.fit(real_data, discrete_columns)

# Create synthetic data
synthetic_data = ctgan.sample(1000)



In [40]:
import copulas


In [41]:
from copulas.multivariate import gaussian

In [34]:
import ctgan

In [36]:
ctgan.TVAE

ctgan.synthesizers.tvae.TVAE