# Simple Reidentification Attack

Simple example that performs a reidentification attack on network data and synthetically generated network data.<br>
Example based on smartnoise-samples/whitepaper-demos

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
# Path to the folder containing the data CSV files; the file names used
# in the cells below are appended to this prefix.
dataPath = '../data/New_NETWORK_TEST/NEW_NETWORK_TEST/'

In [3]:
# Read files
# Assume the attacker has the entire original network dataset
def readCSV(data, drop=None):
    """Load a comma-separated UTF-8 file into a DataFrame.

    Parameters
    ----------
    data : str or path-like
        Path of the CSV file.
    drop : column label or list of labels, optional
        Column(s) to remove after loading. If a requested column does not
        exist, a message is printed and the frame is returned with all of
        its columns.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` when the file could not be read (the
        underlying error is printed rather than raised).
    """
    try:
        df_data = pd.read_csv(data, sep=",", encoding="utf-8").infer_objects()
    except Exception as e:
        # Best-effort loading: report the problem and signal failure with None.
        print(e)
        return None

    if drop:
        try:
            df_data = df_data.drop(drop, axis=1)
        except KeyError:
            # Only a missing label is expected here; anything else is a real
            # error and should not be hidden behind this message.
            print(f'Cannot drop column {drop} from dataframe. Column {drop} not found')

    return df_data

In [4]:
# Given a df of indices of potential matches (matches), the original and synthetic data,
# return a dataframe of the appropriate rows from original and synth
def get_matches(matches, original, synth):
    """Expand index pairs into side-by-side original / synthetic rows.

    Parameters
    ----------
    matches : pandas.DataFrame
        One row per candidate pair. The first column holds the row number in
        ``original`` and the second the row number in ``synth``. Both are
        used positionally (``iloc``).
    original : pandas.DataFrame
        The original dataset.
    synth : pandas.DataFrame
        The synthetic dataset.

    Returns
    -------
    pandas.DataFrame
        One row per pair, in the order of ``matches``, with columns
        ``index_original``, ``<col>_original`` for every original column,
        ``index_synth``, then ``<col>_synth`` for every synthetic column.
        Rows are numbered 0..n-1. An empty ``matches`` yields an empty frame
        with the same columns.
    """
    columns = (
        ['index_original']
        + [f'{name}_original' for name in original.columns]
        + ['index_synth']
        + [f'{name}_synth' for name in synth.columns]
    )

    # The pair frames can carry object dtype, so coerce to integers before
    # using the values as positions.
    original_pos = matches.iloc[:, 0].astype('int64').to_numpy()
    synth_pos = matches.iloc[:, 1].astype('int64').to_numpy()

    # Take all rows at once and glue the four pieces together column-wise;
    # this replaces a per-pair concat whose cost grew quadratically.
    match_data = pd.concat(
        [
            pd.Series(original_pos, name='index_original'),
            original.iloc[original_pos].reset_index(drop=True),
            pd.Series(synth_pos, name='index_synth'),
            synth.iloc[synth_pos].reset_index(drop=True),
        ],
        axis=1,
    )
    match_data.columns = columns
    return match_data

In [23]:
def try_reidentification_noise(synth, net):
    """Score every (original, synthetic) row pair by number of matching columns.

    For each row of ``net`` and each row of ``synth``, count the columns of
    ``net`` whose values are equal. A missing value (NaN) in the ``net`` row
    acts as a wildcard and counts as a match. Pairs are then placed into
    exactly one tier, highest first, so no pair is counted twice.

    Parameters
    ----------
    synth : pandas.DataFrame
        Synthetic data. It must contain every column of ``net``; any extra
        columns are ignored.
    net : pandas.DataFrame
        The data the attacker holds (the original dataset).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(reident_100, reident_75, reident_50)``. Each frame has the columns
        ``index_original`` and ``index_synth`` holding the index labels of the
        paired rows, ordered by original row and then by synthetic row.

        * ``reident_100`` - every compared column matches.
        * ``reident_75``  - at least ``round(3 * n_columns / 4)`` match.
        * ``reident_50``  - at least ``round(n_columns / 2)`` match.
    """
    pair_columns = ['index_original', 'index_synth']

    compared_columns = list(net.columns)
    num_columns = len(compared_columns)
    percent_50 = int(np.round(num_columns / 2))  # num of matched columns for 50%
    percent_75 = int(np.round((3 * num_columns) / 4))  # num of matched columns for 75%

    # Loop-invariant: the synthetic values as one object array, so that each
    # comparison below is the same element-wise Python `==` as before but is
    # applied to all synthetic rows at once.
    synth_values = synth[compared_columns].to_numpy(dtype=object)
    synth_labels = synth.index

    pairs_100, pairs_75, pairs_50 = [], [], []

    for index1, row1 in tqdm(net.iterrows(), total=net.shape[0]):
        row_values = row1.to_numpy(dtype=object)

        # NaN in the attacker's row is a wildcard that matches anything.
        wildcard = np.asarray(pd.isna(row_values), dtype=bool)
        equal = np.asarray(synth_values == row_values, dtype=bool)
        match = (equal | wildcard).sum(axis=1)

        # Determine possible reidentification.
        # Don't double count, i.e. a 100% match does not also appear as a
        # 75% or 50% match.
        is_100 = match == num_columns
        is_75 = ~is_100 & (match >= percent_75)
        is_50 = ~is_100 & ~is_75 & (match >= percent_50)

        pairs_100.extend((index1, index2) for index2 in synth_labels[is_100])
        pairs_75.extend((index1, index2) for index2 in synth_labels[is_75])
        pairs_50.extend((index1, index2) for index2 in synth_labels[is_50])

    # Build each result once instead of concatenating one row per pair.
    reident_100 = pd.DataFrame(pairs_100, columns=pair_columns)  # 100% match
    reident_75 = pd.DataFrame(pairs_75, columns=pair_columns)  # 75% match
    reident_50 = pd.DataFrame(pairs_50, columns=pair_columns)  # 50% match

    print(f"Identified {len(reident_100)} potential matches (100%)!")
    print(f"Identified {len(reident_75)} potential matches (75%)!")
    print(f"Identified {len(reident_50)} potential matches (50%)!")
    return reident_100, reident_75, reident_50

In [24]:
# Load the original dataset (the data the attacker is assumed to hold),
# removing its 'Attack' column.
original = readCSV(dataPath+'original.csv', drop='Attack')

### ctgan_1 Model

In [25]:
# Load the ctgan_1 synthetic data without its 'Attack' column, then score
# every original/synthetic row pair into the 100% / 75% / 50% match tiers.
ctgan_1 = readCSV(dataPath+'ctgan_1.csv', drop='Attack')
ctgan_1_attack_100, ctgan_1_attack_75, ctgan_1_attack_50 = try_reidentification_noise(ctgan_1, original)

  0%|          | 0/1028 [00:00<?, ?it/s]

Identified 0 potential matches (100%)!
Identified 103 potential matches (75%)!
Identified 13625 potential matches (50%)!


In [26]:
# Expand each tier's index pairs into side-by-side original/synthetic rows.
# NOTE(review): these result names use the prefix `ct_gan_1_` while the
# inputs use `ctgan_1_` - consider unifying the spelling.
ct_gan_1_attack_50_matches = get_matches(ctgan_1_attack_50, original, ctgan_1)
ct_gan_1_attack_75_matches = get_matches(ctgan_1_attack_75, original, ctgan_1)
ct_gan_1_attack_100_matches = get_matches(ctgan_1_attack_100, original, ctgan_1)

### ctgan_2 Model

In [27]:
# Load the ctgan_2 synthetic data and run the attack against the original data.
# NOTE(review): 'Attack' is not dropped here, unlike the ctgan_1 load -
# confirm whether this file contains that column.
ctgan_2 = readCSV(dataPath+'ctgan_2.csv')
ctgan_2_attack_100, ctgan_2_attack_75, ctgan_2_attack_50 = try_reidentification_noise(ctgan_2, original)

  0%|          | 0/1028 [00:00<?, ?it/s]

Identified 0 potential matches (100%)!
Identified 217 potential matches (75%)!
Identified 27888 potential matches (50%)!


In [28]:
# Expand each tier's index pairs into side-by-side original/synthetic rows.
ctgan_2_attack_50_matches = get_matches(ctgan_2_attack_50, original, ctgan_2)
ctgan_2_attack_75_matches = get_matches(ctgan_2_attack_75, original, ctgan_2)
ctgan_2_attack_100_matches = get_matches(ctgan_2_attack_100, original, ctgan_2)

### ctgan_dp_clip Model

In [29]:
# Load the ctgan_dp_clip synthetic data and run the attack against the original data.
# NOTE(review): 'Attack' is not dropped here, unlike the ctgan_1 load -
# confirm whether this file contains that column.
ctgan_dp_clip = readCSV(dataPath+'ctgan_dp_clip.csv')
ctgan_dp_clip_attack_100, ctgan_dp_clip_attack_75, ctgan_dp_clip_attack_50 = try_reidentification_noise(ctgan_dp_clip, original)

  0%|          | 0/1028 [00:00<?, ?it/s]

Identified 0 potential matches (100%)!
Identified 81 potential matches (75%)!
Identified 12549 potential matches (50%)!


In [30]:
# Expand each tier's index pairs into side-by-side original/synthetic rows.
ctgan_dp_clip_attack_50_matches = get_matches(ctgan_dp_clip_attack_50, original, ctgan_dp_clip)
ctgan_dp_clip_attack_75_matches = get_matches(ctgan_dp_clip_attack_75, original, ctgan_dp_clip)
ctgan_dp_clip_attack_100_matches = get_matches(ctgan_dp_clip_attack_100, original, ctgan_dp_clip)

### ctgan_dp_gan Model

Did not Run: Very slow due to the size of the file

In [None]:
# Load the ctgan_dp_gan synthetic data and run the attack against the original data.
# This cell has no recorded execution count or output in the saved notebook.
# NOTE(review): 'Attack' is not dropped here, unlike the ctgan_1 load -
# confirm whether this file contains that column.
ctgan_dp_gan = readCSV(dataPath+'ctgan_dp_gan.csv')
ctgan_dp_gan_attack_100, ctgan_dp_gan_attack_75, ctgan_dp_gan_attack_50 = try_reidentification_noise(ctgan_dp_gan, original)

In [None]:
# Expand each tier's index pairs into side-by-side original/synthetic rows.
# This cell has no recorded execution count in the saved notebook.
ctgan_dp_gan_attack_50_matches = get_matches(ctgan_dp_gan_attack_50, original, ctgan_dp_gan)
ctgan_dp_gan_attack_75_matches = get_matches(ctgan_dp_gan_attack_75, original, ctgan_dp_gan)
ctgan_dp_gan_attack_100_matches = get_matches(ctgan_dp_gan_attack_100, original, ctgan_dp_gan)

### ctgan_dp_sgd Model

Did not Run: very slow due to the size of the file

In [None]:
# Load the ctgan_dp_sgd synthetic data and run the attack against the original data.
# This cell has no recorded execution count or output in the saved notebook.
# NOTE(review): 'Attack' is not dropped here, unlike the ctgan_1 load -
# confirm whether this file contains that column.
ctgan_dp_sgd = readCSV(dataPath+'ctgan_dp_sgd.csv')
ctgan_dp_sgd_attack_100, ctgan_dp_sgd_attack_75, ctgan_dp_sgd_attack_50 = try_reidentification_noise(ctgan_dp_sgd, original)

In [None]:
# Expand each tier's index pairs into side-by-side original/synthetic rows.
# This cell has no recorded execution count in the saved notebook.
ctgan_dp_sgd_attack_50_matches = get_matches(ctgan_dp_sgd_attack_50, original, ctgan_dp_sgd)
ctgan_dp_sgd_attack_75_matches = get_matches(ctgan_dp_sgd_attack_75, original, ctgan_dp_sgd)
ctgan_dp_sgd_attack_100_matches = get_matches(ctgan_dp_sgd_attack_100, original, ctgan_dp_sgd)

### kg_ctgan Model

In [31]:
# Load the kg_ctgan synthetic data and run the attack against the original data.
# NOTE(review): 'Attack' is not dropped here, unlike the ctgan_1 load -
# confirm whether this file contains that column.
kg_ctgan = readCSV(dataPath+'kg_ctgan.csv')
kg_ctgan_attack_100, kg_ctgan_attack_75, kg_ctgan_attack_50 = try_reidentification_noise(kg_ctgan, original)

  0%|          | 0/1028 [00:00<?, ?it/s]

Identified 0 potential matches (100%)!
Identified 1248 potential matches (75%)!
Identified 77436 potential matches (50%)!


In [32]:
# Expand each tier's index pairs into side-by-side original/synthetic rows.
kg_ctgan_attack_50_matches = get_matches(kg_ctgan_attack_50, original, kg_ctgan)
kg_ctgan_attack_75_matches = get_matches(kg_ctgan_attack_75, original, kg_ctgan)
kg_ctgan_attack_100_matches = get_matches(kg_ctgan_attack_100, original, kg_ctgan)

### Octgan Model

In [33]:
# Load the octgan synthetic data and run the attack against the original data.
# NOTE(review): 'Attack' is not dropped here, unlike the ctgan_1 load -
# confirm whether this file contains that column.
octgan = readCSV(dataPath+'octgan.csv')
octgan_attack_100, octgan_attack_75, octgan_attack_50 = try_reidentification_noise(octgan, original)

  0%|          | 0/1028 [00:00<?, ?it/s]

Identified 0 potential matches (100%)!
Identified 810 potential matches (75%)!
Identified 65779 potential matches (50%)!


In [34]:
# Expand each tier's index pairs into side-by-side original/synthetic rows.
octgan_attack_50_matches = get_matches(octgan_attack_50, original, octgan)
octgan_attack_75_matches = get_matches(octgan_attack_75, original, octgan)
octgan_attack_100_matches = get_matches(octgan_attack_100, original, octgan)

### pategan Model

In [35]:
# Load the pategan synthetic data and run the attack against the original data.
# NOTE(review): 'Attack' is not dropped here, unlike the ctgan_1 load -
# confirm whether this file contains that column.
pategan = readCSV(dataPath+'pategan.csv')
pategan_attack_100, pategan_attack_75, pategan_attack_50 = try_reidentification_noise(pategan, original)

  0%|          | 0/1028 [00:00<?, ?it/s]

Identified 0 potential matches (100%)!
Identified 121 potential matches (75%)!
Identified 13553 potential matches (50%)!


In [36]:
# Expand each tier's index pairs into side-by-side original/synthetic rows.
pategan_attack_50_matches = get_matches(pategan_attack_50, original, pategan)
pategan_attack_75_matches = get_matches(pategan_attack_75, original, pategan)
pategan_attack_100_matches = get_matches(pategan_attack_100, original, pategan)

### tablegan Model

In [37]:
# Load the tablegan synthetic data and run the attack against the original data.
# NOTE(review): 'Attack' is not dropped here, unlike the ctgan_1 load -
# confirm whether this file contains that column.
tablegan = readCSV(dataPath+'tablegan.csv')
tablegan_attack_100, tablegan_attack_75, tablegan_attack_50 = try_reidentification_noise(tablegan, original)

  0%|          | 0/1028 [00:00<?, ?it/s]

Identified 0 potential matches (100%)!
Identified 428 potential matches (75%)!
Identified 51065 potential matches (50%)!


In [38]:
# Expand each tier's index pairs into side-by-side original/synthetic rows.
tablegan_attack_50_matches = get_matches(tablegan_attack_50, original, tablegan)
tablegan_attack_75_matches = get_matches(tablegan_attack_75, original, tablegan)
tablegan_attack_100_matches = get_matches(tablegan_attack_100, original, tablegan)

### tvae_1 Model

In [39]:
# Load the tvae_1 synthetic data and run the attack against the original data.
# NOTE(review): 'Attack' is not dropped here, unlike the ctgan_1 load -
# confirm whether this file contains that column.
tvae_1 = readCSV(dataPath+'tvae_1.csv')
tvae_1_attack_100, tvae_1_attack_75, tvae_1_attack_50 = try_reidentification_noise(tvae_1, original)

  0%|          | 0/1028 [00:00<?, ?it/s]

Identified 0 potential matches (100%)!
Identified 151111 potential matches (75%)!
Identified 85524 potential matches (50%)!


In [None]:
# Expand each tier's index pairs into side-by-side original/synthetic rows.
# This cell has no recorded execution count in the saved notebook.
tvae_1_attack_50_matches = get_matches(tvae_1_attack_50, original, tvae_1)
tvae_1_attack_75_matches = get_matches(tvae_1_attack_75, original, tvae_1)
tvae_1_attack_100_matches = get_matches(tvae_1_attack_100, original, tvae_1)

### tvae_2 Model

In [41]:
# Load the tvae_2 synthetic data and run the attack against the original data.
# NOTE(review): 'Attack' is not dropped here, unlike the ctgan_1 load -
# confirm whether this file contains that column.
tvae_2 = readCSV(dataPath+'tvae_2.csv')
tvae_2_attack_100, tvae_2_attack_75, tvae_2_attack_50 = try_reidentification_noise(tvae_2, original)

  0%|          | 0/1028 [00:00<?, ?it/s]

Identified 0 potential matches (100%)!
Identified 258031 potential matches (75%)!
Identified 225823 potential matches (50%)!


In [None]:
# Expand each tier's index pairs into side-by-side original/synthetic rows.
# This cell has no recorded execution count in the saved notebook.
tvae_2_attack_50_matches = get_matches(tvae_2_attack_50, original, tvae_2)
tvae_2_attack_75_matches = get_matches(tvae_2_attack_75, original, tvae_2)
tvae_2_attack_100_matches = get_matches(tvae_2_attack_100, original, tvae_2)

### OLD Code

In [14]:
def try_reidentification_noise2(synth, net):
    """Find synthetic rows that exactly match an original row on four key columns.

    For every row of ``net``, select the rows of ``synth`` whose 'Protocol',
    'Destination Port', 'Source Port' and 'Destination' values are all equal
    to that row's values. 'Time' is not part of the key.

    Parameters
    ----------
    synth : pandas.DataFrame
        Synthetic data; must contain the four key columns.
    net : pandas.DataFrame
        The data the attacker holds; must contain the four key columns.

    Returns
    -------
    pandas.DataFrame
        The matching synthetic rows with their index labels from ``synth``.
        A synthetic row appears once for each original row it matches. When
        nothing matches, an empty frame with the columns of ``synth`` is
        returned.
    """
    key_columns = ['Protocol', 'Destination Port', 'Source Port', 'Destination']

    hits = []
    for index, row in tqdm(net.iterrows(), total=net.shape[0]):
        # Here list all columns that are needed for reidentification.
        # Will try to find a match between rows in the synthetic data and the
        # information the attacker has (original network dataset).
        mask = synth[key_columns[0]] == row[key_columns[0]]
        for column in key_columns[1:]:
            mask &= synth[column] == row[column]

        filtered = synth.loc[mask]
        # Potential match. Every original row is examined; the previous
        # version stopped after the first row because of a leftover
        # debugging print/break.
        if len(filtered) != 0:
            hits.append(filtered)

    # Concatenate once at the end rather than once per matching row.
    reident = pd.concat(hits) if hits else pd.DataFrame(columns=synth.columns)

    print(f"Identified {len(reident)} potential matches!")
    return reident

In [5]:
# Perform the attack - No matches found
# NOTE(review): `df_synth` and `df_network` are not defined in any cell
# visible in this notebook - confirm where they come from before re-running.
reident_attack = try_reidentification_noise2(df_synth, df_network)

100%|██████████| 778792/778792 [6:52:28<00:00, 31.47it/s]       


Identified 0 potential matches!


In [6]:
# Show the rows returned by the old exact-match attack.
print(reident_attack)

Empty DataFrame
Columns: [No., Time, Source, Source Port, Destination, Destination Port, Protocol, Length, Info]
Index: []
