## 3.3 Experiments – 20 marks

### 3.3.1 Design experiments to test the following: The utility of the data that you have generated using your proposed anonymisation scheme (algorithms) for Q2.c.

First, we start with preprocessing measures to ensure we can compare the utility of our data accordingly. 

In [301]:
import pandas as pd
import numpy as np 

def interval_to_middle(value):
    if pd.isna(value):
        return np.nan  
    start, end = value.split('-')
    start = int(start)
    end = int(end)
    # Calculate the middle value of the interval
    middle = (start + end) / 2
    return middle

# Apply preprocessing measures to be able to compare our two datasets
orig_df = pd.read_csv("police-shooting.csv")
orig_df = orig_df.drop(['city', 'name', 'longitude', 'latitude', 'is_geocoding_exact', 'id'], axis=1)

orig_df['year'] = pd.to_datetime(orig_df['date']).dt.year
orig_df = orig_df.drop(['date'], axis=1)

anon_df = pd.read_csv("k_anon_police.csv")
anon_df['year'] = anon_df['year_range'].apply(interval_to_middle)
anon_df['age'] = anon_df['age_range'].apply(interval_to_middle)
anon_df = anon_df.drop(['year_range', 'age_range', 'id'], axis=1)

print(anon_df.iloc[:5])

    manner_of_death       armed gender race state  signs_of_mental_illness  \
0              shot         gun      M    A    WA                     True   
1              shot         gun      M    W    OR                    False   
2  shot and Tasered     unarmed      M    H    KS                    False   
3              shot  toy weapon      M    W    CA                     True   
4              shot    nail gun      M    H    CO                    False   

  threat_level         flee  body_camera    year   age  
0       attack  Not fleeing        False  2016.5  45.5  
1       attack  Not fleeing        False  2016.5  45.5  
2        other  Not fleeing        False  2016.5  15.0  
3       attack  Not fleeing        False  2016.5  45.5  
4       attack  Not fleeing        False  2016.5  45.5  


In [302]:
print(orig_df.iloc[:5])

    manner_of_death       armed   age gender race state  \
0              shot         gun  53.0      M    A    WA   
1              shot         gun  47.0      M    W    OR   
2  shot and Tasered     unarmed  23.0      M    H    KS   
3              shot  toy weapon  32.0      M    W    CA   
4              shot    nail gun  39.0      M    H    CO   

   signs_of_mental_illness threat_level         flee  body_camera  year  
0                     True       attack  Not fleeing        False  2015  
1                    False       attack  Not fleeing        False  2015  
2                    False        other  Not fleeing        False  2015  
3                     True       attack  Not fleeing        False  2015  
4                    False       attack  Not fleeing        False  2015  


When preprocessing is done, we compare the cardinality of each of the attributes.

In [303]:
cardinalities_orig = {}
cardinalities_anon = {}

for column in orig_df.columns:
    u = orig_df[column].nunique() 
    n = orig_df[column].count()  
    c = u / n  # Cardinality calculation
    cardinalities_orig[column] = c
    
for column in anon_df.columns:
    u = anon_df[column].nunique() 
    n = anon_df[column].count()  
    c = u / n  # Cardinality calculation
    cardinalities_anon[column] = c

    
df_orig_cardinalities = pd.DataFrame(list(cardinalities_orig.items()), columns=['Attribute', 'Original Cardinality'])
df_anon_cardinalities = pd.DataFrame(list(cardinalities_anon.items()), columns=['Attribute', 'Anonymized Cardinality'])
combined_cardinalities = pd.merge(df_orig_cardinalities, df_anon_cardinalities, on='Attribute')
print(combined_cardinalities.to_string(index=False))

              Attribute  Original Cardinality  Anonymized Cardinality
        manner_of_death              0.000250                0.000250
                  armed              0.013628                0.013628
                    age              0.010816                0.000401
                 gender              0.000251                0.000251
                   race              0.000927                0.000927
                  state              0.006384                0.006384
signs_of_mental_illness              0.000250                0.000250
           threat_level              0.000376                0.000376
                   flee              0.000569                0.000569
            body_camera              0.000250                0.000250
                   year              0.001001                0.000376


Then, we proceed by displaying the minimum value, mean value, 25th percentile value, 75th percentile value, standard derivation and the total value count for the numerical values in both of the datasets.

In [304]:
stats_original = orig_df.describe().transpose()
stats_anonymized = anon_df.describe().transpose()

stats_original.columns = ['orig_' + col for col in stats_original.columns]
stats_anonymized.columns = ['anon_' + col for col in stats_anonymized.columns]

combined_stats = pd.merge(stats_original, stats_anonymized, left_index=True, right_index=True)

combined_stats

Unnamed: 0,orig_count,orig_mean,orig_std,orig_min,orig_25%,orig_50%,orig_75%,orig_max,anon_count,anon_mean,anon_std,anon_min,anon_25%,anon_50%,anon_75%,anon_max
age,7489.0,37.215917,12.986545,2.0,27.0,35.0,45.0,92.0,7489.0,38.072573,17.868025,15.0,15.0,45.5,45.5,80.5
year,7989.0,2018.536863,2.290178,2015.0,2017.0,2019.0,2021.0,2022.0,7989.0,2019.483352,2.55659,2016.5,2016.5,2020.0,2023.0,2023.0


We now progress in comparing if there is a difference in the number of missing values in the two datasets. 

In [305]:
missing_original = orig_df.isnull().sum()
missing_anonymized = anon_df.isnull().sum()

original_missing_percentage = (missing_original / len(orig_df)) * 100
anonymized_missing_percentage = (missing_anonymized / len(anon_df)) * 100

completeness_comparison = pd.DataFrame({
    'original_missing_perc': original_missing_percentage,
    'anonymized_missing_percentage': anonymized_missing_percentage
})

completeness_comparison

Unnamed: 0,original_missing_perc,anonymized_missing_percentage
age,6.258606,6.258606
armed,2.641132,2.641132
body_camera,0.0,0.0
flee,11.978971,11.978971
gender,0.388034,0.388034
manner_of_death,0.0,0.0
race,18.976092,18.976092
signs_of_mental_illness,0.0,0.0
state,0.0,0.0
threat_level,0.0,0.0


We now proceed with calculating the pearson coefficient in numerical attributes of our two datasets. The Pearson correlation coefficient measures the linear correlation between two sets of data, ranging from -1 to 1. A coefficient of 1 indicates a perfect positive linear relationship, -1 indicates a perfect negative linear relationship, and 0 indicates no linear relationship.

In [306]:
from scipy.stats import pearsonr
# Initialize an empty dictionary to store the correlations
correlations = {}

# Iterate through each column in the original DataFrame
for column in orig_df.columns:
    # Check if the column exists in the anonymized DataFrame and is numeric
    if column in anon_df.columns and orig_df[column].dtype in [np.float64, np.int64, np.int32]:
        # Combine the current column's data from both DataFrames and drop missing values
        combined = pd.concat([orig_df[column], anon_df[column]], axis=1, keys=['original', 'anonymized']).dropna()
        # Calculate the Pearson correlation coefficient for the combined data
        correlation = pearsonr(combined['original'], combined['anonymized'])[0]
        # Store the correlation coefficient in the dictionary
        correlations[column] = correlation

correlations

{'age': 0.8408002425947383, 'year': 0.9447358199395686}

In [307]:
def correlation_analysis(original, anonymized):
    correlations = {}
    for column in original.columns:
        if column in anonymized.columns and original[column].dtype in [np.float64, np.int64, np.int32]:
            combined = pd.concat([original[column], anonymized[column]], axis=1, keys=['original', 'anonymized']).dropna()
            correlation = pearsonr(combined['original'], combined['anonymized'])[0]
            correlations[column] = correlation
    return correlations

import numpy as np


# Initialize dictionaries to store results
similarity_results = {}
mae_results = {}
rmse_results = {}

# Compute correlations - If correlation_analysis is not defined, you could use pandas' built-in method
# For example, if correlation_analysis calculates Pearson correlation:
correlations = orig_df.corrwith(anon_df, method='pearson')
similarity_results['pearson_correlation'] = correlations

# Loop through columns to calculate MAE and RMSE for numeric columns
for column in orig_df.columns:
    if column in anon_df.columns and orig_df[column].dtype in [np.float64, np.int64, np.int32]:
        original_col_data = orig_df[column].dropna()
        anonymized_col_data = anon_df.loc[original_col_data.index, column]
        mae = np.mean(np.abs(original_col_data - anonymized_col_data))
        rmse = np.sqrt(np.mean((original_col_data - anonymized_col_data) ** 2))
        mae_results[column] = mae
        rmse_results[column] = rmse

# Compile final results
final_results = {'MAE': mae_results, 'RMSE': rmse_results}

# Output results
final_results

TypeError: can only concatenate str (not "int") to str

### 3.3.2 Design experiments to test the following: Analyse the new (anonymized) dataset for risks of de-anonymization.
We are using the same Algorithm proposed in Task 3.2

In [None]:
import pandas as pd
from scipy import stats
import numpy as np
import prince
import warnings
import altair as alt
alt.data_transformers.enable("vegafusion")

df = pd.read_csv('k_anon_police.csv')
# Ignore specific PerformanceWarnings from pandas
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)
# Load the dataset

mca = prince.MCA(
    n_components=3,
    n_iter=3,
    copy=True,
    check_input=True,
    engine='sklearn',
    random_state=42
)
# Fit FAMD on the dataset
mca = mca.fit(df)

df_transformed = mca.transform(df)
z_scores = np.abs(stats.zscore(df_transformed))
outliers = np.where(z_scores > 3) 

outlier_rows = df.iloc[outliers[0]]
outlier_rows.to_csv('athletes_outliers_MCA.csv', index=False)

mca.plot(
    df,
    x_component=0,
    y_component=1,
    show_column_markers=True,
    show_row_markers=True,
    show_column_labels=False,
    show_row_labels=False
)

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))  
plt.scatter(df_transformed[0], df_transformed[1]) 

plt.title('FAMD Results')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.grid(True)

plt.show() 

### 3.3.3 Design experiments to test the following: Propose a method of assessing the risk of disclosure (de-anonymisation) and use this metric to evaluate your anonymised datasets (from Assignments #1, and #2-3), the anonymised dataset received from your colleague, and your version of the anonymised dataset that you obtained in Q2.c.
