In [17]:
# DP-STOA
import importlib
# Importing the necessary modules
from DataLoader import DataLoader
from Anonymisation import Anonymisation
from Consistenter import Consistenter
import PostProcessor
importlib.reload(PostProcessor)
from PostProcessor import RecordPostprocessor
from GUM import GraduallyUpdateMethod
from sklearn.preprocessing import MinMaxScaler

# normalisation function
def normalize_numerical_data(self):
    numerical_cols = [col for col in self.private_data.columns 
                     if self.private_data[col].dtype in ['int64', 'float64']]
    
    if numerical_cols:
        scaler = MinMaxScaler()
        self.private_data[numerical_cols] = scaler.fit_transform(self.private_data[numerical_cols])
        self.scaler = scaler  # Save for later denormalization

# Loading the data
dl = DataLoader('../datasets/UCIMLAdult/uciml_adult.csv', '../datasets/UCIMLAdult/data.yaml', '../datasets/UCIMLAdult/column_info.json', '../datasets/UCIMLAdult/loading_data.json')
dl.data_loader()
dl.all_indifs(dl.private_data)

# Anonymising the data
epsilon = 10
delta = 3e-11
anon = Anonymisation(epsilon=epsilon,delta=delta)
anon.anonymiser(dl)

# Consistenting the data
cons = Consistenter(anon, dl.all_attrs)
cons.make_consistent(iterations = 5)

# Synthesising the data
gum = GraduallyUpdateMethod(dl, cons)  
gum.initialiser(view_iterations = 100)
syn_data = gum.synthesize(iterations = 100, num_records = int(cons.num_synthesize_records))

# Post-processing the data
processor_private = RecordPostprocessor(dl.private_data, dl.configpath, dl.datainfopath, dl.decode_mapping)
processor_public = RecordPostprocessor(syn_data, dl.configpath, dl.datainfopath, dl.decode_mapping)
original_adult_data = processor_private.post_process()
synthesised_dp_stoa_adult_data = processor_public.post_process()


  self.private_data.fillna(self.fillna, inplace=True, downcast = self.datatypes)


Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,3,1,0,2,1,4,0,1,1,0,4,1,1
1,4,2,0,0,2,3,0,1,0,0,2,1,1
2,3,3,1,1,3,4,0,1,0,0,4,1,1
3,4,3,2,0,3,3,4,1,0,0,4,1,1
4,2,3,0,0,4,1,4,0,0,0,4,2,1


epsilon: 10, delta: 3e-11, rho: 0.8616407135665874
one-way sigma: 8.685474957955632
indif sigma: 2.3399999999999998e-09
multi-way sigma: 5.838830013637393
Iteration 1 of 5 to make the marginals consistent and normalise them
Iteration 2 of 5 to make the marginals consistent and normalise them
Iteration 3 of 5 to make the marginals consistent and normalise them
Iteration 4 of 5 to make the marginals consistent and normalise them
Iteration 5 of 5 to make the marginals consistent and normalise them
Iteration 1 of 100 completed to consist the marginal views
Iteration 2 of 100 completed to consist the marginal views
Iteration 3 of 100 completed to consist the marginal views
Iteration 4 of 100 completed to consist the marginal views
Iteration 5 of 100 completed to consist the marginal views
Iteration 6 of 100 completed to consist the marginal views
Iteration 7 of 100 completed to consist the marginal views
Iteration 8 of 100 completed to consist the marginal views
Iteration 9 of 100 completed

In [15]:
#load the diabetes dataset
dl_diabetes = DataLoader('../datasets/Diabetes/diabetes.csv', '../datasets/Diabetes/data.yaml', '../datasets/Diabetes/column_info.json', '../datasets/Diabetes/loading_data.json')
dl_diabetes.data_loader()
dl_diabetes.all_indifs(dl_diabetes.private_data)

#anonymise the diabetes data
anon_diabetes = Anonymisation(epsilon=epsilon,delta=delta)
anon_diabetes.anonymiser(dl_diabetes)

#consistent the diabetes data
cons_diabetes = Consistenter(anon_diabetes, dl_diabetes.all_attrs)
cons_diabetes.make_consistent(iterations = 5)

#synthesise the diabetes data
gum_diabetes = GraduallyUpdateMethod(dl_diabetes, cons_diabetes)
gum_diabetes.initialiser(view_iterations = 100)
syn_diabetes_data = gum_diabetes.synthesize(iterations = 100, num_records = int(cons_diabetes.num_synthesize_records))

#post-process the diabetes data
processor_private_diabetes = RecordPostprocessor(dl_diabetes.private_data, dl_diabetes.configpath, dl_diabetes.datainfopath, dl_diabetes.decode_mapping)
processor_public_diabetes = RecordPostprocessor(syn_diabetes_data, dl_diabetes.configpath, dl_diabetes.datainfopath, dl_diabetes.decode_mapping)
original_diabetes_data = processor_private_diabetes.post_process()
synthesised_dp_stoa_diabetes_data = processor_public_diabetes.post_process()




  self.private_data.fillna(self.fillna, inplace=True, downcast = self.datatypes)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,4,2,2,0,1,2,3,1
1,1,3,2,1,0,1,1,1,0
2,3,5,2,0,0,1,2,1,1
3,1,3,2,1,1,1,1,0,0
4,0,4,1,2,2,1,6,1,1


epsilon: 10, delta: 3e-11, rho: 0.8616407135665874
one-way sigma: 7.226751995230954
indif sigma: 1.08e-09
multi-way sigma: 3.8088327316826844
Iteration 1 of 5 to make the marginals consistent and normalise them
Iteration 2 of 5 to make the marginals consistent and normalise them
Iteration 3 of 5 to make the marginals consistent and normalise them
Iteration 4 of 5 to make the marginals consistent and normalise them
Iteration 5 of 5 to make the marginals consistent and normalise them
Iteration 1 of 100 completed to consist the marginal views
Iteration 2 of 100 completed to consist the marginal views
Iteration 3 of 100 completed to consist the marginal views
Iteration 4 of 100 completed to consist the marginal views
Iteration 5 of 100 completed to consist the marginal views
Iteration 6 of 100 completed to consist the marginal views
Iteration 7 of 100 completed to consist the marginal views
Iteration 8 of 100 completed to consist the marginal views
Iteration 9 of 100 completed to consist t

In [22]:
# Evaluation Metric Functions
import pandas as pd
from sklearn.metrics import f1_score, r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import  LabelEncoder, MinMaxScaler
from sklearn.svm import SVC
import numpy as np
from sklearn.ensemble import RandomForestRegressor

def evaluate_mle(data, categorical_cols=None):
    """
    General MLE score calculation for any dataset
    """
    if categorical_cols is None:
        categorical_cols = []
    
    # Prepare features
    X = data.copy(deep=True)
    numerical_cols = [col for col in X.columns if col not in categorical_cols]
    
    # Shuffle the data
    X = X.sample(frac=1, random_state=42).reset_index(drop=True)
    
    f1_scores = []
    r2_scores = []
    
   # Classification tasks for categorical columns
    for target_col in categorical_cols:
        # Prepare features
        features = X.drop(columns=[target_col]).copy()
        
        # One-hot encode remaining categorical features
        cat_cols = [col for col in categorical_cols if col != target_col]
        if cat_cols:
            features = pd.get_dummies(features, columns=cat_cols)
        
        # Encode target
        le = LabelEncoder()
        y = le.fit_transform(X[target_col])
        
        # Split data
        split_idx = int(0.8 * len(features))
        X_train = features.iloc[:split_idx].copy()
        X_test = features.iloc[split_idx:].copy()
        y_train, y_test = y[:split_idx], y[split_idx:]
        
        # Scale numerical features after split
        if numerical_cols:
            scaler = MinMaxScaler()
            num_cols = [col for col in numerical_cols if col in X_train.columns]
            if num_cols:
                X_train.loc[:, num_cols] = scaler.fit_transform(X_train[num_cols])
                X_test.loc[:, num_cols] = scaler.transform(X_test[num_cols])
        
        # Train classifier
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        f1_scores.append(f1_score(y_test, y_pred, average='macro'))
    
    # Regression tasks for numerical columns
    for target_col in numerical_cols:
        # Prepare features
        features = X.drop(columns=[target_col]).copy()
        
        # One-hot encode categorical features
        if categorical_cols:
            features = pd.get_dummies(features, columns=categorical_cols)
        
        y = X[target_col].values
        
        # Split data
        split_idx = int(0.8 * len(features))
        X_train = features.iloc[:split_idx].copy()
        X_test = features.iloc[split_idx:].copy()
        y_train, y_test = y[:split_idx], y[split_idx:]
        
        # Scale numerical features after split
        remaining_num_cols = [col for col in numerical_cols if col != target_col and col in X_train.columns]
        if remaining_num_cols:
            scaler = MinMaxScaler()
            X_train.loc[:, remaining_num_cols] = scaler.fit_transform(X_train[remaining_num_cols])
            X_test.loc[:, remaining_num_cols] = scaler.transform(X_test[remaining_num_cols])
        
        # Scale target
        y_scaler = MinMaxScaler()
        y_train = y_scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
        y_test = y_scaler.transform(y_test.reshape(-1, 1)).ravel()
        
        # Train regressor
        model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        r2_scores.append(r2_score(y_test, y_pred))
    
    # Calculate final scores
    avg_f1 = np.mean(f1_scores) if f1_scores else 0
    avg_r2 = np.mean(r2_scores) if r2_scores else 0
    mle_score = (avg_f1 + avg_r2) / 2
    
    print(f"Average F1 Score ({len(f1_scores)} tasks): {avg_f1:.4f}")
    print(f"Average R² Score ({len(r2_scores)} tasks): {avg_r2:.4f}")
    print(f"MLE Score: {mle_score:.4f}")
    
    return mle_score

# MLE Score for original data
print("Original Adult Dataset MLE Evaluation:")
mle_score = evaluate_mle(
    data=original_adult_data,
    categorical_cols=['workclass', 'education', 'marital-status', 'occupation', 
                     'relationship', 'race', 'sex', 'native-country', 'income']
)

# MLE Score for synthetic data
print("\nSynthesised DP STOA Adult Dataset MLE Evaluation:")
mle_score_synthetic = evaluate_mle(
    data=synthesised_dp_stoa_adult_data,
    categorical_cols=['workclass', 'education', 'marital-status', 'occupation', 
                     'relationship', 'race', 'sex', 'native-country', 'income']
)

# MLE Score for original diabetes data
print("\nOriginal Diabetes Dataset MLE Evaluation:")
mle_score_original_diabetes = evaluate_mle(
    data=original_diabetes_data,
    categorical_cols=[]
)

# MLE Score for synthesised dp stoa diabetes data
print("\nSynthesised DP STOA Diabetes Dataset MLE Evaluation:")
mle_score_synthetic_diabetes = evaluate_mle(
    data=synthesised_dp_stoa_diabetes_data,
    categorical_cols=[]
)


Original Adult Dataset MLE Evaluation:
Average F1 Score (9 tasks): 0.4376
Average R² Score (4 tasks): 0.1028
MLE Score: 0.2702

Synthesised DP STOA Adult Dataset MLE Evaluation:
Average F1 Score (9 tasks): 0.8483
Average R² Score (4 tasks): 0.7255
MLE Score: 0.7869

Original Diabetes Dataset MLE Evaluation:
Average F1 Score (0 tasks): 0.0000
Average R² Score (9 tasks): 0.0509
MLE Score: 0.0254

Synthesised DP STOA Diabetes Dataset MLE Evaluation:
Average F1 Score (0 tasks): 0.0000
Average R² Score (9 tasks): 0.4924
MLE Score: 0.2462
