# Simple Reidentification Attack

Simple example that performs a reidentification attack on network data and synthetically generated network data.<br>
Example based on smartnoise-samples/whitepaper-demos

In [182]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [183]:
# Path to the folder containing the data CSV files
dataPath = '../data/New_NETWORK_TEST/NEW_NETWORK_TEST/'

In [184]:
# Read files
# assume the attacker has the entire original network dataset
def readCSV(data, drop=None):
    """Load a comma-separated, UTF-8 CSV file into a DataFrame.

    Parameters
    ----------
    data : str or path-like
        Path of the CSV file.
    drop : label or list of labels, optional
        Column(s) to remove after loading. If a requested column does not
        exist, a message is printed and the frame is returned undropped.

    Returns
    -------
    pandas.DataFrame or None
        The loaded data, or ``None`` when loading failed (the error is
        printed instead of raised so the notebook keeps running).
    """
    try:
        df_data = pd.read_csv(data, sep=",", encoding="utf-8").infer_objects()
        if drop:
            try:
                df_data = df_data.drop(drop, axis=1)
            except KeyError:
                # DataFrame.drop raises KeyError for labels that are absent.
                # Only that case is reported as "not found"; anything else
                # falls through to the generic handler below.
                print(f'Cannot drop column {drop} from dataframe. Column {drop} not found')

    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(e)
        df_data = None

    return df_data

In [185]:
def get_matches(matches, original, synth):
    """Expand index pairs of potential matches into the full matching rows.

    Parameters
    ----------
    matches : pandas.DataFrame
        One row per candidate pair. The first column holds the row position
        in ``original`` and the second the row position in ``synth``
        (both are looked up with ``iloc``).
    original : pandas.DataFrame
        The original data.
    synth : pandas.DataFrame
        The synthetic data.

    Returns
    -------
    pandas.DataFrame
        One row per pair with the columns ``index_original``,
        ``<col>_original`` for every column of ``original``, ``index_synth``
        and ``<col>_synth`` for every column of ``synth``. Rows are numbered
        0..n-1. The frame is empty, with the same columns, when ``matches``
        has no rows.
    """
    columns = (
        ['index_original']
        + [f'{name}_original' for name in original.columns]
        + ['index_synth']
        + [f'{name}_synth' for name in synth.columns]
    )

    # Collect plain row lists and build the frame once; concatenating inside
    # the loop copies the accumulated frame on every iteration.
    records = []
    for original_pos, synth_pos in zip(matches.iloc[:, 0], matches.iloc[:, 1]):
        # The index columns may be stored as object dtype; iloc needs ints.
        original_pos = int(original_pos)
        synth_pos = int(synth_pos)
        records.append(
            [original_pos] + original.iloc[original_pos, :].tolist()
            + [synth_pos] + synth.iloc[synth_pos, :].tolist()
        )

    return pd.DataFrame(records, columns=columns)

In [187]:
def try_reidentification_noise2(synth, net):
    """Compare every original row with every synthetic row and bucket the pairs.

    For each (original row, synthetic row) pair the number of columns with
    equal values is counted. Missing values never count as equal. A pair is
    reported in exactly one bucket, the highest one it qualifies for.

    Parameters
    ----------
    synth : pandas.DataFrame
        Synthetic data.
    net : pandas.DataFrame
        Original network data the attacker is assumed to hold.
        Must have the same columns as ``synth``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(reident_100, reident_75, reident_50)``. Each frame has the columns
        ``index_original`` and ``index_synth`` (index labels of the matching
        rows) and rows numbered 0..n-1, ordered by original row and then by
        synthetic row.

    Raises
    ------
    ValueError
        If ``synth`` and ``net`` do not have the same set of columns.
    """
    pair_columns = ['index_original', 'index_synth']

    # Values are compared position by position, so both frames need the same
    # columns in the same order. A pure reordering is fixed up here.
    net_columns = list(net.columns)
    if list(synth.columns) != net_columns:
        if len(synth.columns) != len(net_columns) or set(synth.columns) != set(net_columns):
            raise ValueError('synth and net must have the same columns to be compared')
        synth = synth[net_columns]

    num_columns = len(net_columns)
    # NOTE: np.round rounds exact halves to the nearest even integer, so for
    # some column counts a threshold sits slightly below the nominal percentage.
    percent_50 = int(np.round(num_columns/2)) # num of matched columns for 50%
    percent_75 = int(np.round((3*num_columns)/4)) # num of matched columns for 75%

    # Object arrays let one original row be compared against all synthetic
    # rows at once, whatever the column dtypes are.
    net_values = net.to_numpy(dtype=object)
    synth_values = synth.to_numpy(dtype=object)
    synth_present = ~pd.isna(synth_values)
    synth_labels = synth.index

    pairs_100, pairs_75, pairs_50 = [], [], []
    for position, index1 in tqdm(enumerate(net.index), total=net.shape[0]):
        # number of matching columns between this original row and each synthetic row
        equal = (synth_values == net_values[position]) & synth_present
        match = equal.sum(axis=1)

        # Determine possible reidentification.
        # Don't double count, i.e. a 100% match does not also appear as a 75% match.
        full = match == num_columns
        high = ~full & (match >= percent_75)
        half = ~full & ~high & (match >= percent_50)

        pairs_100.extend((index1, index2) for index2 in synth_labels[full])
        pairs_75.extend((index1, index2) for index2 in synth_labels[high])
        pairs_50.extend((index1, index2) for index2 in synth_labels[half])

    # Build each result frame once instead of concatenating inside the loops.
    reident_100 = pd.DataFrame(pairs_100, columns=pair_columns) # 100% match
    reident_75 = pd.DataFrame(pairs_75, columns=pair_columns) # 75% match
    reident_50 = pd.DataFrame(pairs_50, columns=pair_columns) # 50% match

    print(f"Identified {len(reident_100)} potential matches (100%)!")
    print(f"Identified {len(reident_75)} potential matches (75%)!")
    print(f"Identified {len(reident_50)} potential matches (50%)!")
    return reident_100, reident_75, reident_50

In [188]:
# Original network dataset; the 'Attack' column is removed on load.
original = readCSV(data=dataPath + 'original.csv', drop='Attack')

In [189]:
# Load the first CTGAN sample (without 'Attack') and attack it using the original data.
ctgan_1 = readCSV(data=dataPath + 'ctgan_1.csv', drop='Attack')
(
    ct_gan_1_attack_100,
    ct_gan_1_attack_75,
    ct_gan_1_attack_50,
) = try_reidentification_noise2(synth=ctgan_1, net=original)

  0%|          | 0/1028 [00:00<?, ?it/s]

100%|██████████| 1028/1028 [02:01<00:00,  8.48it/s]

Identified 0 potential matches (100%)!
Identified 0 potential matches (75%)!
Identified 9 potential matches (50%)!





In [190]:
# Pull the full original/synthetic rows behind each pair in the 50% bucket.
ct_gan_1_attack_50_matches = get_matches(
    matches=ct_gan_1_attack_50,
    original=original,
    synth=ctgan_1,
)
print(ct_gan_1_attack_50_matches)

  index_original  Time_original Source_original Source Port_original  \
0             10          66.15       CAMERA_IP      CAM_TO_DOM_PORT   
0             11           0.05       CAMERA_IP      CAM_TO_DOM_PORT   
0             12          28.72       CAMERA_IP      CAM_TO_DOM_PORT   
0             15           0.02       DOMAIN_IP             DOM_PORT   
0             21           0.03       DOMAIN_IP             DOM_PORT   
0             45           0.03       CAMERA_IP      CAM_TO_DOM_PORT   
0             50           0.13       CAMERA_IP      CAM_TO_DOM_PORT   
0             68           0.30       CAMERA_IP      CAM_TO_DOM_PORT   
0            555           0.22       CAMERA_IP      CAM_TO_DOM_PORT   

  Destination_original Destination Port_original Protocol_original  \
0            DOMAIN_IP                  DOM_PORT               TCP   
0            DOMAIN_IP                  DOM_PORT               TCP   
0            DOMAIN_IP                  DOM_PORT               TCP   

In [None]:
# Drop 'Attack' as for `original` and `ctgan_1`: the row comparison needs identical columns.
ctgan_2 = readCSV(dataPath+'ctgan_2.csv', drop='Attack')

In [None]:
# Drop 'Attack' as for `original` and `ctgan_1`: the row comparison needs identical columns.
ctgan_dp_clip = readCSV(dataPath+'ctgan_dp_clip.csv', drop='Attack')

In [None]:
# Drop 'Attack' as for `original` and `ctgan_1`: the row comparison needs identical columns.
ctgan_dp_gan = readCSV(dataPath+'ctgan_dp_gan.csv', drop='Attack')

In [None]:
# Drop 'Attack' as for `original` and `ctgan_1`: the row comparison needs identical columns.
ctgan_dp_sgd = readCSV(dataPath+'ctgan_dp_sgd.csv', drop='Attack')

In [None]:
# Drop 'Attack' as for `original` and `ctgan_1`: the row comparison needs identical columns.
kg_ctgan = readCSV(dataPath+'kg_ctgan.csv', drop='Attack')

In [None]:
# Drop 'Attack' as for `original` and `ctgan_1`: the row comparison needs identical columns.
octgan = readCSV(dataPath+'octgan.csv', drop='Attack')

In [None]:
# Drop 'Attack' as for `original` and `ctgan_1`: the row comparison needs identical columns.
pategan = readCSV(dataPath+'pategan.csv', drop='Attack')

In [None]:
# Drop 'Attack' as for `original` and `ctgan_1`: the row comparison needs identical columns.
tablegan = readCSV(dataPath+'tablegan.csv', drop='Attack')

In [None]:
# Drop 'Attack' as for `original` and `ctgan_1`: the row comparison needs identical columns.
tvae_1 = readCSV(dataPath+'tvae_1.csv', drop='Attack')

In [None]:
# Drop 'Attack' as for `original` and `ctgan_1`: the row comparison needs identical columns.
tvae_2 = readCSV(dataPath+'tvae_2.csv', drop='Attack')

## OLD: earlier version of the attack (exact match on selected columns)

In [None]:
def try_reidentification_noise(synth, net):
    """Find synthetic rows that exactly match an original row on key columns.

    For every row of ``net`` the synthetic rows with identical 'Protocol',
    'Destination Port', 'Source Port' and 'Destination' values are collected.
    'Time' is not part of the comparison.

    Parameters
    ----------
    synth : pandas.DataFrame
        Synthetic data containing the four key columns.
    net : pandas.DataFrame
        Original network data the attacker is assumed to hold.

    Returns
    -------
    pandas.DataFrame
        The matching synthetic rows, with the columns of ``synth``, in the
        order the original rows were processed. A synthetic row appears once
        for every original row it matches. Empty when nothing matches.
    """
    hits = []
    for _, row in tqdm(net.iterrows(), total=net.shape[0]):
        # Here list all columns that are needed for reidentification;
        # will try to find a match between rows in the synthetic data and the
        # information the attacker has (original network dataset).
        same_key = (
            (synth['Protocol'] == row['Protocol'])
            & (synth['Destination Port'] == row['Destination Port'])
            & (synth['Source Port'] == row['Source Port'])
            & (synth['Destination'] == row['Destination'])
        )
        filtered = synth.loc[same_key]
        # potential match
        if len(filtered) != 0:
            hits.append(filtered)

    # Concatenate once at the end; every original row is examined (the
    # earlier debug print/break stopped the loop after the first row).
    if hits:
        reident = pd.concat(hits)
    else:
        reident = pd.DataFrame(columns=synth.columns)

    print(f"Identified {len(reident)} potential matches!")
    return reident

In [5]:
# Perform the attack - No matches found
# NOTE(review): df_synth and df_network are not defined in any cell shown in this
# notebook; this call presumably relies on variables from an earlier session — confirm.
reident_attack = try_reidentification_noise(df_synth, df_network)

100%|██████████| 778792/778792 [6:52:28<00:00, 31.47it/s]       


Identified 0 potential matches!


In [6]:
# Show the rows returned by the legacy attack.
print(reident_attack)

Empty DataFrame
Columns: [No., Time, Source, Source Port, Destination, Destination Port, Protocol, Length, Info]
Index: []
