# How to generate a combined utility/privacy report?

### Create a single report that combines the utility and privacy metrics. **Warning:** the combined report is only available for the summary.

This notebook assumes that the synthetic data has already been generated. \
It is based on the Wisconsin Breast Cancer Dataset (WBCD).

In [1]:
# Standard library
import sys
import tempfile

# Put the parent directory on the import path. This has to run before the
# "Local packages" imports below, which presumably live there — TODO confirm.
sys.path.append("..")

# 3rd party packages
import pandas as pd

# Local packages
import config
import utils.draw
from metrics.report import Report

## Load the real and synthetic Wisconsin Breast Cancer Datasets

In [2]:
# Real data, keyed by split name.
df_real = {
    "train": pd.read_csv("../data/WBCD_train.csv"),
    "test": pd.read_csv("../data/WBCD_test.csv"),
}
# Display the (rows, columns) shape of the train split as a quick sanity check.
df_real["train"].shape

(455, 10)

### Choose the synthetic dataset

In [3]:
# Synthetic data, keyed by split name (plus the "2nd_gen" dataset).
df_synth = {
    "train": pd.read_csv("../results/data/2024-02-15_Synthpop_455samples.csv"),
    "test": pd.read_csv("../results/data/2024-02-15_Synthpop_228samples.csv"),
    "2nd_gen": pd.read_csv(
        "../results/data/2024-02-15_Synthpop_455samples_2nd_gen.csv"
    ),
}
# Display the (rows, columns) shape of the test split as a quick sanity check.
df_synth["test"].shape

(228, 10)

## Configure the metadata dictionary

### The continuous and categorical variables need to be specified, as well as the variable to predict

In [4]:
# Describe the columns of the dataset, one metadata entry at a time.
metadata = {}

# Columns treated as continuous variables.
metadata["continuous"] = [
    "Clump_Thickness",
    "Uniformity_of_Cell_Size",
    "Uniformity_of_Cell_Shape",
    "Marginal_Adhesion",
    "Single_Epithelial_Cell_Size",
    "Bland_Chromatin",
    "Normal_Nucleoli",
    "Mitoses",
    "Bare_Nuclei",
]

# Columns treated as categorical variables.
metadata["categorical"] = ["Class"]

# The variable to predict.
metadata["variable_to_predict"] = "Class"

## Generate the report

In [5]:
# Parameters shared by the utility and privacy reports;
# see the notebooks utility_report and privacy_report for more details.
parameters = {}
parameters["cross_learning"] = False
parameters["num_repeat"] = 1
parameters["num_kfolds"] = 3
parameters["num_optuna_trials"] = 15
parameters["use_gpu"] = True
parameters["sampling_frac"] = 0.5

In [6]:
# Build the combined report from the real/synthetic datasets, the metadata and
# the parameters defined in the previous cells.
report = Report(
    dataset_name="Wisconsin Breast Cancer Dataset",
    df_real=df_real,
    df_synthetic=df_synth,
    metadata=metadata,
    # Will be automatically adjusted for larger or longer figures.
    figsize=(8, 6),
    # For reproducibility purposes.
    random_state=42,
    # Folder from which computed utility and/or privacy reports are loaded,
    # if available.
    report_folderpath=None,
    # Name of the computed report (without extension nor utility/privacy
    # suffix), if available.
    report_filename=None,
    # List of the metrics to compute; can be utility or privacy metrics.
    # If not specified, all the metrics are computed.
    metrics=None,
    # Dictionary containing the parameters for both utility and privacy reports.
    params=parameters,
)

In [7]:
# Compute the metrics of the report (all of them here, since `metrics=None`
# was passed when the report was created).
report.compute()

LOGAN test set shape: (228, 10)
TableGan test set shape: (228, 10)
Detector test set shape: (228, 10)


## Get the summary report as a pandas dataframe

In [8]:
# Show an overview of the dataset: number of instances per split and
# number of continuous/categorical variables.
report.specification()

----- Wisconsin Breast Cancer Dataset -----
Contains:
    - 455 instances in the train set,
    - 228 instances in the test set,
    - 10 variables, 9 continuous and 1 categorical.


In [9]:
# Get the summary report as a pandas dataframe.
df_summary = report.summary()

In [10]:
# Display the summary grouped by metric: the grouping keys become the outer
# index levels and the innermost level numbers the rows within each group.
by = ["name", "objective", "min", "max"]

# Select the non-grouping columns explicitly rather than dropping the keys
# inside the lambda. Passing the grouping columns to `apply` is deprecated
# since pandas 2.2, and later versions no longer pass them at all, which
# would make `x.drop(by, axis=1)` raise a KeyError. Explicit selection gives
# the same result on old and new pandas alike.
value_columns = [col for col in df_summary.columns if col not in by]
df_summary.groupby(by)[value_columns].apply(lambda x: x.reset_index(drop=True))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,alias,submetric,value
name,objective,min,max,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Categorical Consistency,max,0,1.0,0,cat_consis,within_ratio,1.0
Categorical Statistics,max,0,1.0,0,cat_stats,support_coverage,1.0
Categorical Statistics,max,0,1.0,1,cat_stats,frequency_coverage,0.975824
Classification,min,0,1.0,0,classif,diff_real_synth,0.001478
Collision,max,0,1.0,0,collision,precision,0.460317
Collision,max,0,1.0,1,collision,recall,0.966667
Collision,max,0,1.0,2,collision,f1_score,0.623656
Collision,max,0,1.0,3,collision,recovery_rate,0.186495
Collision,max,0,inf,0,collision,avg_num_appearance_realtrain,1.463023
Collision,max,0,inf,1,collision,avg_num_appearance_realcontrol,1.349112


## Save and load the report

In [None]:
# Round-trip the report through a temporary directory, which is removed
# automatically when the `with` block exits.
with tempfile.TemporaryDirectory() as temp_dir:
    report.save(savepath=temp_dir, filename="report")  # save
    # Reload from the same folder and base filename that were used for saving.
    new_report = Report(report_folderpath=temp_dir, report_filename="report")  # load
    # NOTE(review): the saved files no longer exist after this block; confirm
    # that Report reads them eagerly if `new_report` is used afterwards.