In [1]:
import pandas as pd

def check_k_anonymity(df, quasi_identifiers, k):
    group_sizes = df.groupby(quasi_identifiers).size()
       
    is_k_anonymous = (group_sizes >= k).all()

    if is_k_anonymous:
        print(f"Dataset k-anonymized with k = {k} ")
    else:
        print(f"Dataset does not respet k-anonymity with k = {k}")
        print("\nThese partitions does not respect k-anonymity:")
        print(group_sizes[(group_sizes < k)])
    
    return is_k_anonymous

anonymized_df = pd.read_csv('anonymized.csv')
original_df = pd.read_csv('generation/database.csv')     

quasi_identifiers = ['age', 'gender', 'city', 'education', 'profession']
k = 3  
check_k_anonymity(anonymized_df, quasi_identifiers, k)


Dataset k-anonymized with k = 3 


True

In [2]:
numerical_qis = ['age']
ordinal_qis = ['education', 'gender']
categorical_qis = ['city', 'profession']
all_qis = categorical_qis + numerical_qis + ordinal_qis

Original dataset age average and anonymized dataset age average: 

In [3]:
import numpy as np

def calculate_mean(column):
    try:
        col_no_brackets = column.strip('[]')
        if '-' in col_no_brackets:
            # statistic was a range so we calculate the mean
            col_split = col_no_brackets.split('-')
            return np.mean([int(col_split[0]), int(col_split[1])])
        else:
            # statistic was mean so it is a float
            return float(col_no_brackets)
    except (IndexError, ValueError):
        return np.nan


anonymized_df['age_mean'] = anonymized_df['age'].apply(calculate_mean)
anonymized_df_mean = anonymized_df['age_mean'].mean()

original_age_mean = original_df['age'].mean()

print("Media dei range di età in anonymized_df:")
print(anonymized_df[['age', 'age_mean']])
print("\nMedia complessiva in anonymized_df :", anonymized_df_mean)

print("\nMedia dell'età in original_df:")
print(original_age_mean)


Media dei range di età in anonymized_df:
         age  age_mean
0    [66-81]      73.5
1    [66-81]      73.5
2    [66-81]      73.5
3    [66-81]      73.5
4    [66-81]      73.5
..       ...       ...
995  [59-82]      70.5
996  [59-82]      70.5
997  [22-81]      51.5
998  [22-81]      51.5
999  [22-81]      51.5

[1000 rows x 2 columns]

Media complessiva in anonymized_df : 54.2115

Media dell'età in original_df:
54.464


Education level average e standard deviation:  

In [4]:
import json
import pandas as pd

with open('generation\\json\\educations.json', 'r') as f:
    education_levels = json.load(f)

def create_mapping(levels, start_value=1):  # starting from 1, so we exclude ANY in json files"
    mapping = {}
    def recursive_map(levels, current_value):
        for level in levels:
            if 'categories' in level and level['categories']:
                mapping[level['name']] = current_value
                for sub_level in level['categories']:
                    mapping[sub_level['name']] = current_value
                current_value += 1
            else:
                mapping[level['name']] = current_value
                current_value += 1
        return current_value

    recursive_map(levels['categories'], start_value)
    return mapping

education_mapping = create_mapping(education_levels)

print("Mapping of educations:")
for key, value in education_mapping.items():
    print(f"{key}: {value}")

df = pd.read_csv('generation\\database.csv')

if 'education' not in df.columns:
    raise KeyError("'education' column not found in the dataset")

df['education'] = df['education'].map(education_mapping)

df_filtered = df[df['education'].notna()]

mean_education = df_filtered['education'].mean()
var_education = df_filtered['education'].var()

print(f"\nMean of educations: {mean_education}")
print(f"Variance of educations: {var_education}")

Mapping of educations:
High School: 1
Bachelor's Degree: 2
Graduate School: 3
Master's Degree: 3
Doctoral Degree: 3

Mean of educations: 1.751
Variance of educations: 0.7357347347347346


Gender average e standard deviation:  

In [5]:
import pandas as pd

gender_mapping = {
    "male": 0,
    "female": 1
}

df = pd.read_csv('generation/database.csv')

if 'gender' not in df.columns:
    raise KeyError("'gender' column not found in the dataset")

df['gender'] = df['gender'].map(gender_mapping)

mean_gender = df['gender'].mean()
var_gender = df['gender'].var()

print(f"Mean of gender: {mean_gender}")
print(f"Variance of gender: {var_gender}")


Mean of gender: 0.515
Variance of gender: 0.250025025025025


In [6]:
def mapping(col):
    if col == 'education':
        return education_mapping
    elif col == 'gender':
        return gender_mapping

Complete Statistical Analysis:

In [7]:
def statistical_analysis(original_df, anonymized_df, numerical_qis, categorical_qis, ordinal_qis):
   
    original_df = original_df[all_qis]
    anonymized_df = anonymized_df[all_qis]

    print("Analysis of numerical columns:")
    for col in numerical_qis:
        anonymized_df[col] = anonymized_df[col].apply(calculate_mean)
        
        mean_original = original_df[col].mean()
        mean_anonymized = anonymized_df[col].mean()
        std_original = original_df[col].std()
        std_anonymized = anonymized_df[col].std()

        print(f"Column: {col}")
        print(f"  Original mean: {mean_original}")
        print(f"  Anonymized mean: {mean_anonymized}")
        print(f"  Original standard deviation: {std_original}")
        print(f"  Anonymized standard deviation: {std_anonymized}")
        
    print("Analysis of ordinal categorical columns:")
    for col in ordinal_qis:
        col_mapping = mapping(col)
        anonymized_df[col] = anonymized_df[col].map(col_mapping)
        original_df[col] = original_df[col].map(col_mapping)
        
        anonymized_filtered = anonymized_df[anonymized_df[col].notna()]
        original_filtered = original_df[original_df[col].notna()]
        
        mean_anonymized = anonymized_filtered[col].mean()
        std_anonymized = anonymized_filtered[col].var()
        mean_original = original_filtered[col].mean()
        std_original = original_filtered[col].var()
        
        print(f"Column: {col}")
        print(f"  Original mean: {mean_original}")
        print(f"  Anonymized mean: {mean_anonymized}")
        print(f"  Original variance: {std_original}")
        print(f"  Anonymized variance: {std_anonymized}")


In [8]:
statistical_analysis(original_df, anonymized_df, numerical_qis, categorical_qis, ordinal_qis)

Analysis of numerical columns:
Column: age
  Original mean: 54.464
  Anonymized mean: 54.2115
  Original standard deviation: 18.735253233242236
  Anonymized standard deviation: 10.013330341826068
Analysis of ordinal categorical columns:
Column: education
  Original mean: 1.751
  Anonymized mean: 1.1724137931034482
  Original variance: 0.7357347347347346
  Anonymized variance: 0.212876220849113
Column: gender
  Original mean: 0.515
  Anonymized mean: 0.563265306122449
  Original variance: 0.250025025025025
  Anonymized variance: 0.24700568752091007


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anonymized_df[col] = anonymized_df[col].apply(calculate_mean)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anonymized_df[col] = anonymized_df[col].map(col_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  original_df[col] = original_df[col].map(col_mapping)
A value is trying to be set on a 