In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def check_k_anonymity(df, quasi_identifiers, k):
    group_sizes = df.groupby(quasi_identifiers).size()
    num_partitions = len(group_sizes)
    total_rows = len(df)
    
    is_k_anonymous = ((group_sizes >= k)).all()

    
    if is_k_anonymous:
        print(f"Il dataset rispetta la k-anonimity per k = {k} ")
    else:
        print(f"Il dataset non rispetta la k-anonimity per k = {k} ")
        print("\nLe seguenti partizioni non rispettano la k-anonimity:")
        print(group_sizes[(group_sizes < k) | (group_sizes > k*2)])
    
    return is_k_anonymous

anonymized_df = pd.read_csv('anonymized.csv')
original_df = pd.read_csv('generation/database.csv')   

quasi_identifiers = ['age', 'gender', 'city', 'education', 'profession']
k = 3  
check_k_anonymity(anonymized_df, quasi_identifiers, k)


Il dataset rispetta la k-anonimity per k = 3 


True

In [2]:
numerical_qis = ['age']
ordinal_qis = ['education', 'gender']
categorical_qis = ['city', 'profession']
all_qis = categorical_qis + numerical_qis + ordinal_qis

Original dataset age average and anonymized dataset age average: 

In [3]:
import numpy as np

def calculate_mean(column):
    try:
        col_range = column.strip('[]')
        # Dividiamo il range in due numeri
        col_split = col_range.split('-')
        return np.mean([int(col_split[0]), int(col_split[1])])
    except (IndexError, ValueError):
        return np.nan


anonymized_df['age_mean'] = anonymized_df['age'].apply(calculate_mean)

# Calcoliamo la media dell'età in original_df
original_age_mean = original_df['age'].mean()

print("Media dei range di età in anonymized_df:")
print(anonymized_df[['age', 'age_mean']])
print("\nMedia complessiva in anonymized_df :", anonymized_df['age_mean'].mean())

print("\nMedia dell'età in original_df:")
print(original_age_mean)


Media dei range di età in anonymized_df:
         age  age_mean
0    [20-66]      43.0
1    [20-66]      43.0
2    [20-66]      43.0
3    [22-66]      44.0
4    [22-66]      44.0
..       ...       ...
995  [24-86]      55.0
996  [24-86]      55.0
997  [27-67]      47.0
998  [27-67]      47.0
999  [27-67]      47.0

[1000 rows x 2 columns]

Media complessiva in anonymized_df : 53.003

Media dell'età in original_df:
52.984


Education level average e standard deviation:  

In [4]:
import json
import pandas as pd

with open('generation\\json\\educations.json', 'r') as f:
    education_levels = json.load(f)

def create_mapping(levels, start_value=1):  # Partiamo da 1, escludendo "ANY-EDUCATION"
    mapping = {}
    def recursive_map(levels, current_value):
        for level in levels:
            if 'categories' in level and level['categories']:
                mapping[level['name']] = current_value
                for sub_level in level['categories']:
                    mapping[sub_level['name']] = current_value
                current_value += 1
            else:
                mapping[level['name']] = current_value
                current_value += 1
        return current_value

    recursive_map(levels['categories'], start_value)
    return mapping

education_mapping = create_mapping(education_levels)

print("Mappatura dei livelli di istruzione:")
for key, value in education_mapping.items():
    print(f"{key}: {value}")

df = pd.read_csv('generation\\database.csv')

if 'education' not in df.columns:
    raise KeyError("'education' column not found in the dataset")

df['education'] = df['education'].map(education_mapping)

df_filtered = df[df['education'].notna()]

mean_education = df_filtered['education'].mean()
var_education = df_filtered['education'].var()

print(f"\nMedia del livello di istruzione: {mean_education}")
print(f"Varianza del livello di istruzione: {var_education}")

Mappatura dei livelli di istruzione:
High School: 1
Bachelor's Degree: 2
Graduate School: 3
Master's Degree: 3
Doctoral Degree: 3

Media del livello di istruzione: 1.749
Varianza del livello di istruzione: 0.7247237237237237


Gender average e standard deviation:  

In [5]:
import pandas as pd

gender_mapping = {
    "male": 0,
    "female": 1
}

df = pd.read_csv('generation/database.csv')

if 'gender' not in df.columns:
    raise KeyError("'gender' column not found in the dataset")

df['gender'] = df['gender'].map(gender_mapping)

mean_gender = df['gender'].mean()
var_gender = df['gender'].var()

print(f"Media del genere: {mean_gender}")
print(f"Varianza del genere: {var_gender}")


Media del genere: 0.517
Varianza del genere: 0.249960960960961


In [6]:
def mapping(col):
    if col == 'education':
        return education_mapping
    elif col == 'gender':
        return gender_mapping

Complete Statistical Analysis:

In [7]:
def statistical_analysis(original_df, anonymized_df, numerical_qis, categorical_qis, ordinal_qis):
   
    original_df = original_df[all_qis]
    anonymized_df = anonymized_df[all_qis]

    print("Analisi delle colonne numeriche:")
    for col in numerical_qis:
        anonymized_df[col] = anonymized_df[col].apply(calculate_mean)
        
        mean_original = original_df[col].mean()
        mean_anonymized = anonymized_df[col].mean()
        std_original = original_df[col].std()
        std_anonymized = anonymized_df[col].std()

        print(f"Colonna: {col}")
        print(f"  Media originale: {mean_original}")
        print(f"  Media anonimizzata: {mean_anonymized}")
        print(f"  Deviazione standard originale: {std_original}")
        print(f"  Deviazione standard anonimizzata: {std_anonymized}")
        
    print("Analisi delle colonne categoriche ordinali:")
    for col in ordinal_qis:
        col_mapping = mapping(col)
        anonymized_df[col] = anonymized_df[col].map(col_mapping)
        original_df[col] = original_df[col].map(col_mapping)
        
        anonymized_filtered = anonymized_df[anonymized_df[col].notna()]
        original_filtered = original_df[original_df[col].notna()]
        
        mean_anonymized = anonymized_filtered[col].mean()
        std_anonymized = anonymized_filtered[col].var()
        mean_original = original_filtered[col].mean()
        std_original = original_filtered[col].var()
        
        print(f"Colonna: {col}")
        print(f"  Media originale: {mean_original}")
        print(f"  Media anonimizzata: {mean_anonymized}")
        print(f"  Deviazione standard originale: {std_original}")
        print(f"  Deviazione standard anonimizzata: {std_anonymized}")

In [8]:
statistical_analysis(original_df, anonymized_df, numerical_qis, categorical_qis, ordinal_qis)

Analisi delle colonne numeriche:
Colonna: age
  Media originale: 52.984
  Media anonimizzata: 53.003
  Deviazione standard originale: 20.807996460897353
  Deviazione standard anonimizzata: 11.437648832291137
Analisi delle colonne categoriche ordinali:
Colonna: education
  Media originale: 1.749
  Media anonimizzata: 1.1153846153846154
  Deviazione standard originale: 0.7247237237237237
  Deviazione standard anonimizzata: 0.14143920595533505
Colonna: gender
  Media originale: 0.517
  Media anonimizzata: 0.5172413793103449
  Deviazione standard originale: 0.249960960960961
  Deviazione standard anonimizzata: 0.25066312997347473


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anonymized_df[col] = anonymized_df[col].apply(calculate_mean)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anonymized_df[col] = anonymized_df[col].map(col_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  original_df[col] = original_df[col].map(col_mapping)
A value is trying to be set on a 