## Reading datasets, preprocessing, and combining

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Original datasets
original_df1 = pd.read_csv('data/USCensus1990_1.csv')
original_df2 = pd.read_csv('data/USCensus1990_2.csv')

# We decided that the caseid was not an important field for the model to pick up on
#     and the resulting synthetic datasets do not have this field.
original_df1 = original_df1.drop('caseid', axis=1)
original_df2 = original_df2.drop('caseid', axis=1)

# Synthetic datasets
synth_df1 = pd.read_csv('data/synthetic_data.csv')
synth_df2 = pd.read_csv('data/synthetic_data_2.csv')

# Because both of these data sets are large, we will just sample 10000 entries from
#     each dataset to avoid jupyter complaining/crashing due to lack of memory.
original_df1 = original_df1.sample(n=10000, random_state=420)
original_df2 = original_df2.sample(n=10000, random_state=420)
synth_df1    = synth_df1.sample(n=10000, random_state=0xbeef)
synth_df2    = synth_df2.sample(n=10000, random_state=0xbeef)

# Lastly, the synthetic datasets are floats, and the original datasets are integers so
#     just adjust them so they are all integers
synth_df1 = synth_df1.astype(int)
synth_df2 = synth_df2.astype(int)

### Combining both synthetic datasets and both original datasets
Keep in mind, the combination of the original datasets is not something which would be done in a real life scenario if this method was used. This is only being done for testing purposes

In [3]:
columns = original_df1.columns # They're all the same

# Using numpy to do this because pd.merge didn't want to cooperate
original_df = np.vstack((original_df1, original_df2))
synth_df = np.vstack((synth_df1, synth_df2))

# Finally, conver them back into dataframes
original_df = pd.DataFrame(original_df, columns=columns)
synth_df = pd.DataFrame(synth_df, columns=columns)

In [4]:
original_df

Unnamed: 0,dAge,dAncstry1,dAncstry2,iAvail,iCitizen,iClass,dDepart,iDisabl1,iDisabl2,iEnglish,...,iTmpabsnt,dTravtime,iVietnam,dWeek89,iWork89,iWorklwk,iWWII,iYearsch,iYearwrk,dYrsserv
0,1,1,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,5,0,0
1,7,1,2,0,0,1,3,2,2,0,...,0,5,0,1,1,1,0,11,1,0
2,4,1,1,0,0,2,3,2,2,0,...,0,1,0,2,1,1,0,14,1,0
3,5,11,1,0,0,3,4,2,2,0,...,0,3,0,2,1,1,0,11,1,0
4,3,1,1,0,0,3,4,2,2,0,...,0,4,0,2,1,1,0,10,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,2,11,2,0,0,1,0,2,2,0,...,3,0,0,1,1,2,0,10,1,0
19996,1,1,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
19997,3,1,1,0,0,1,3,2,2,0,...,0,4,0,2,1,1,0,10,1,0
19998,3,11,1,0,0,1,4,1,2,0,...,0,3,0,2,1,1,0,10,1,2


In [5]:
synth_df

Unnamed: 0,dAge,dAncstry1,dAncstry2,iAvail,iCitizen,iClass,dDepart,iDisabl1,iDisabl2,iEnglish,...,iTmpabsnt,dTravtime,iVietnam,dWeek89,iWork89,iWorklwk,iWWII,iYearsch,iYearwrk,dYrsserv
0,4,9,1,0,0,2,5,2,2,0,...,0,4,0,2,1,1,0,12,1,0
1,5,2,0,0,0,0,0,2,2,0,...,1,0,0,0,2,2,0,15,5,0
2,8,4,2,0,0,0,0,2,2,1,...,4,0,0,0,2,2,0,7,4,0
3,7,0,1,0,0,0,0,2,2,0,...,2,0,0,0,2,2,0,3,6,0
4,2,0,4,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,6,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,5,1,4,0,0,3,3,2,2,0,...,0,3,0,2,1,1,0,15,1,0
19996,8,0,4,0,0,0,0,1,1,0,...,3,0,0,0,2,3,0,6,4,0
19997,6,6,4,0,0,1,2,2,2,1,...,0,6,0,2,1,1,0,9,1,0
19998,6,1,4,0,0,4,3,2,2,0,...,0,5,0,2,1,1,0,14,1,1


## Reasoning


The purpose of this notebook is to determine the feasibility of using the synthetic datasets to derrive target attributes using known input attributes. A real world example might consist of knowing attributes A, B and C, querying the synthetic data to find a sample which matches as closely as possible to these input attributes, and using the sample found in the synthetic data to deduce a value for an unknown target attribute. Ideally, the percent errors calculated will be large.

TLDR: We don't want to be able to query the synthetic data with known attributes to infer unknown attributes, and that is what we are testing here

In [6]:
def percent_difference(x, y):
    return abs(x - y) / ((x + y) / 2) * 100

## Preparation

In [7]:
# First we chose a random sample from the original data. Keep in mind, in a real scenario
#     that we will not know every single attribute like we do now. Known input attributes
#     and a target output sample will also be defined below.
random_sample = original_df.sample()

In [8]:
random_sample

Unnamed: 0,dAge,dAncstry1,dAncstry2,iAvail,iCitizen,iClass,dDepart,iDisabl1,iDisabl2,iEnglish,...,iTmpabsnt,dTravtime,iVietnam,dWeek89,iWork89,iWorklwk,iWWII,iYearsch,iYearwrk,dYrsserv
7982,7,2,3,0,0,0,0,2,2,0,...,3,0,0,0,2,2,0,10,6,0


In [9]:
known_attributes_names = [
    'iDisabl1', 'iDisabl2', 'iMilitary', 'dOccup'
]
known_attributes = dict()

for name in known_attributes_names:
    known_attributes[name] = random_sample[name].values[0]

# Assume that this, somehow, is known. We will query the synthetic data using this info
#     to try to determine the random sample's iPerscare value. This is likely the worst
#     case scenario because, referencing the heatmaps, these attributes have pretty strong
#     correlations amongst eachother.
known_attributes

{'iDisabl1': 2, 'iDisabl2': 2, 'iMilitary': 4, 'dOccup': 0}

## Attacking

In [10]:
mask = True
for key, value in known_attributes.items():
    mask &= synth_df[key] == value
attack_sample = synth_df[mask].sample()

In [11]:
# This is probably a little overkill. With this dataset, all values are integers. In
#     reality, you could probably have a little tolerance rather than checking for
#     exact equality.
for name in known_attributes_names:
    assert(attack_sample[name].values[0] == random_sample[name].values[0])

In [12]:
target_attribute = 'iPerscare'
print(f'Random sample target attribute: {random_sample[target_attribute].values[0]}')
print(f'Attack sample target attribute: {attack_sample[target_attribute].values[0]}')

Random sample target attribute: 2
Attack sample target attribute: 2


In a real scenario, the first value shown above would be the value we would ideally want to protect, and the second value shown above is the value that would have been leaked by the attack. Ideally, these two should not have matched.

## Just for the record

Here, we re-run the attack on more random samples to gauge it's accuracy. It doesn't work every time, however the amount of times it does work against the amount of times attempted is still considerable.

In [13]:
total_matches, total_attempts = 0, 100

for i in range(total_attempts):
    
    random_sample = original_df.sample()
    mask = True
    
    for key, value in known_attributes.items():
        mask &= synth_df[key] == value
    attack_sample = synth_df[mask].sample()
    
    value_1 = random_sample[target_attribute].values[0]
    value_2 = attack_sample[target_attribute].values[0]
    
    if value_1 == value_2:
        total_matches = total_matches + 1

In [14]:
print(f'This attack worked {total_matches} times out of {total_attempts}')

This attack worked 77 times out of 100
