In [1]:
# DP-STOA

# Importing the necessary modules
from DataLoader import DataLoader
from Anonymisation import Anonymisation
from Consistenter import Consistenter
from PostProcessor import RecordPostprocessor
from GUM import GraduallyUpdateMethod

# ---------------------------------------------------------------------------
# 1. Load the private dataset together with its configuration files
# ---------------------------------------------------------------------------
dl = DataLoader(
    '../datasets/UCIMLAdult/uciml_adult.csv',
    '../datasets/UCIMLAdult/data.yaml',
    '../datasets/UCIMLAdult/column_info.json',
    '../datasets/UCIMLAdult/loading_data.json',
)
dl.data_loader()
dl.all_indifs(dl.private_data)

# ---------------------------------------------------------------------------
# 2. Anonymise under the (epsilon, delta) privacy budget
# ---------------------------------------------------------------------------
epsilon = 10
delta = 3e-11
anon = Anonymisation(epsilon=epsilon, delta=delta)
anon.anonymiser(dl)

# ---------------------------------------------------------------------------
# 3. Make the anonymised marginals consistent with each other
# ---------------------------------------------------------------------------
cons = Consistenter(anon, dl.all_attrs)
cons.make_consistent(iterations=5)

# ---------------------------------------------------------------------------
# 4. Synthesise records with the Gradually Update Method
# ---------------------------------------------------------------------------
gum = GraduallyUpdateMethod(dl, cons)
gum.initialiser(view_iterations=100)
# The record count is read from the consistenter only after initialisation,
# i.e. at the same point where the synthesize call evaluates it.
num_synthetic_records = int(cons.num_synthesize_records)
syn_data = gum.synthesize(iterations=100, num_records=num_synthetic_records)

# ---------------------------------------------------------------------------
# 5. Post-process the private and the synthetic records
#    Both post-processors are built before either one is run.
# ---------------------------------------------------------------------------
postprocessor_settings = (dl.configpath, dl.datainfopath, dl.decode_mapping)
processor_private = RecordPostprocessor(dl.private_data, *postprocessor_settings)
processor_public = RecordPostprocessor(syn_data, *postprocessor_settings)
original_data = processor_private.post_process()
synthesised_dp_stoa_data = processor_public.post_process()


  self.private_data.fillna(self.fillna, inplace=True, downcast = self.datatypes)


Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,3,1,0,2,1,4,0,1,1,0,4,1,1
1,4,2,0,0,2,3,0,1,0,0,2,1,1
2,3,3,1,1,3,4,0,1,0,0,4,1,1
3,4,3,2,0,3,3,4,1,0,0,4,1,1
4,2,3,0,0,4,1,4,0,0,0,4,2,1


epsilon: 10, delta: 3e-11, rho: 0.8616407135665874
one-way sigma: 8.685474957955632
indif sigma: 2.3399999999999998e-09
multi-way sigma: 5.838830013637393
Iteration 1 of 5 to make the marginals consistent and normalise them
Iteration 2 of 5 to make the marginals consistent and normalise them
Iteration 3 of 5 to make the marginals consistent and normalise them
Iteration 4 of 5 to make the marginals consistent and normalise them
Iteration 5 of 5 to make the marginals consistent and normalise them
Iteration 1 of 100 completed to consist the marginal views
Iteration 2 of 100 completed to consist the marginal views
Iteration 3 of 100 completed to consist the marginal views
Iteration 4 of 100 completed to consist the marginal views
Iteration 5 of 100 completed to consist the marginal views
Iteration 6 of 100 completed to consist the marginal views
Iteration 7 of 100 completed to consist the marginal views
Iteration 8 of 100 completed to consist the marginal views
Iteration 9 of 100 completed

In [10]:
# Evaluation Metric Functions
import pandas as pd
from sklearn.metrics import f1_score, r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import  LabelEncoder, MinMaxScaler
from sklearn.svm import SVC
import numpy as np
from sklearn.ensemble import RandomForestRegressor

def _positional_split(features, target, train_fraction):
    """Cut features/target at the same row position: leading rows train, trailing rows test."""
    split_idx = int(train_fraction * len(features))
    return (
        features.iloc[:split_idx],
        features.iloc[split_idx:],
        target[:split_idx],
        target[split_idx:],
    )


def evaluate_mle(data, categorical_cols=None, random_state=None, train_fraction=0.8):
    """
    General MLE score calculation for any dataset.

    Every column of ``data`` is used once as a prediction target, with all
    remaining columns as features:

    * each column in ``categorical_cols`` is predicted with a
      ``RandomForestClassifier`` and scored with macro-averaged F1;
    * every other column is treated as numerical, predicted with a
      ``RandomForestRegressor`` and scored with R².

    Numerical columns are min-max scaled (the scaler is fit on all rows) and
    categorical feature columns are one-hot encoded. Rows are split by
    position, without shuffling: the first ``train_fraction`` of the rows is
    used for training and the remainder for testing.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to evaluate. It is copied, not modified.
    categorical_cols : list of str, optional
        Names of the categorical columns. Defaults to no categorical columns.
    random_state : int or None, optional
        Seed forwarded to both random forests. The default ``None`` leaves
        the forests unseeded, so scores can differ between runs; pass an
        integer for repeatable scores.
    train_fraction : float, optional
        Fraction of leading rows used for training (default 0.8).

    Returns
    -------
    float
        Mean of the average F1 and the average R², taken only over the task
        families that actually ran. A dataset with only categorical (or only
        numerical) columns is therefore no longer averaged with a placeholder
        0. Returns 0.0 when ``data`` has no columns.

    Raises
    ------
    KeyError
        If a name in ``categorical_cols`` is not a column of ``data``.
    ValueError
        If ``train_fraction`` is not strictly between 0 and 1, or ``data``
        has too few rows to leave both a train and a test part non-empty.
    """
    if not 0 < train_fraction < 1:
        raise ValueError(
            f"train_fraction must be strictly between 0 and 1, got {train_fraction}"
        )

    categorical_cols = [] if categorical_cols is None else list(categorical_cols)

    # Fail early with a readable message instead of deep inside pandas.
    unknown_cols = [col for col in categorical_cols if col not in data.columns]
    if unknown_cols:
        raise KeyError(f"categorical_cols not present in data: {unknown_cols}")

    numerical_cols = [col for col in data.columns if col not in categorical_cols]

    # With at least one task to run, both sides of the split must be non-empty.
    n_train = int(train_fraction * len(data))
    if (categorical_cols or numerical_cols) and not 0 < n_train < len(data):
        raise ValueError(
            f"data has {len(data)} rows, which is too few for a "
            f"{train_fraction} train/test split"
        )

    # Prepare features
    X = data.copy()

    # Scale numerical features
    if numerical_cols:
        X[numerical_cols] = MinMaxScaler().fit_transform(X[numerical_cols])

    f1_scores = []
    r2_scores = []

    # Classification tasks for categorical columns
    for target_col in categorical_cols:
        features = X.drop(columns=[target_col])

        # One-hot encode remaining categorical features
        other_cat_cols = [col for col in categorical_cols if col != target_col]
        if other_cat_cols:
            features = pd.get_dummies(features, columns=other_cat_cols)

        # Encode target
        y = LabelEncoder().fit_transform(X[target_col])

        X_train, X_test, y_train, y_test = _positional_split(features, y, train_fraction)

        # Train classifier
        model = RandomForestClassifier(
            n_estimators=100,
            max_depth=None,
            min_samples_split=2,
            n_jobs=-1,
            random_state=random_state,
        )
        model.fit(X_train, y_train)
        f1_scores.append(f1_score(y_test, model.predict(X_test), average='macro'))

    # Regression tasks for numerical columns
    for target_col in numerical_cols:
        features = X.drop(columns=[target_col])

        # One-hot encode categorical features
        if categorical_cols:
            features = pd.get_dummies(features, columns=categorical_cols)

        y = X[target_col].values

        X_train, X_test, y_train, y_test = _positional_split(features, y, train_fraction)

        # Train regressor
        model = RandomForestRegressor(
            n_estimators=100,
            max_depth=None,
            min_samples_split=2,
            n_jobs=-1,
            random_state=random_state,
        )
        model.fit(X_train, y_train)
        r2_scores.append(r2_score(y_test, model.predict(X_test)))

    # Calculate final scores
    avg_f1 = np.mean(f1_scores) if f1_scores else 0
    avg_r2 = np.mean(r2_scores) if r2_scores else 0

    # Only average over task families that ran; an absent family must not
    # pull the composite score towards zero.
    family_averages = [
        avg for avg, scores in ((avg_f1, f1_scores), (avg_r2, r2_scores)) if scores
    ]
    mle_score = sum(family_averages) / len(family_averages) if family_averages else 0.0

    print(f"Average F1 Score ({len(f1_scores)} tasks): {avg_f1:.4f}")
    print(f"Average R² Score ({len(r2_scores)} tasks): {avg_r2:.4f}")
    print(f"MLE Score: {mle_score:.4f}")

    return mle_score

# Categorical attributes of the Adult dataset, shared by both evaluations
# below; evaluate_mle treats every column not listed here as numerical.
adult_categorical_cols = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country', 'income',
]

# Score the post-processed original records
print("Original Adult Dataset MLE Evaluation:")
mle_score = evaluate_mle(
    data=original_data,
    categorical_cols=list(adult_categorical_cols),
)

# Score the post-processed synthetic records with the same column typing
print("\nSynthesised DP STOA Adult Dataset MLE Evaluation:")
mle_score_synthetic = evaluate_mle(
    data=synthesised_dp_stoa_data,
    categorical_cols=list(adult_categorical_cols),
)

Original Adult Dataset MLE Evaluation:
Average F1 Score (9 tasks): 0.4230
Average R² Score (4 tasks): 0.0913
MLE Score: 0.2572

Synthesised DP STOA Adult Dataset MLE Evaluation:
Average F1 Score (9 tasks): 0.8288
Average R² Score (4 tasks): 0.7366
MLE Score: 0.7827
