## Generate a sample database
Create a sample database with a given number of records, randomly assign each record a binary attribute of concern, and generate parallel databases from the original, each of which has exactly one record removed.
With these parallel databases, we can then perform similarity checks.

In [188]:
# Import modules
import torch

In [189]:
# Create a parallel db generator which removes one record from the original db and keeps the rest of the records
def par_db_generator(db, idx):
    """Return a new tensor holding every record of ``db`` except the one at ``idx``.

    The records before ``idx`` and the records after it are concatenated;
    ``db`` itself is left unmodified.
    """
    before = db[:idx]
    after = db[idx + 1:]
    return torch.cat([before, after])

In [190]:
# Bundle the parallel databases together
# The number of parallel databases depends entirely on the length of the original db
def par_dbs(db):
    """Build every leave-one-out version of ``db``.

    Returns a list with ``len(db)`` entries; entry ``i`` is ``db`` with the
    record at position ``i`` removed, so each entry holds ``len(db) - 1``
    records.
    """
    return [par_db_generator(db, position) for position in range(len(db))]

In [191]:
# Create a function to generate the required data based on a given sample of users or records
def generate_db(rec_no):
    """Create a random boolean database and its parallel databases.

    ``rec_no`` uniform random values are drawn and each becomes ``True`` when
    it is greater than 0.5. Returns a ``(db, pdbs)`` pair where ``pdbs`` is the
    list produced by ``par_dbs(db)``.
    """
    original = torch.rand(rec_no) > 0.5
    return original, par_dbs(original)

In [192]:
# torch.manual_seed(500)
# db, pdbs = generate_db(50000)
# db

In [193]:
# Query the databases (for this case, the sum)
def query_threshold(db, threshold):
    """Report whether the sum of ``db`` is strictly greater than ``threshold``.

    The answer is returned as a float tensor: 1.0 when the sum exceeds the
    threshold, 0.0 otherwise.
    """
    total = db.sum()
    exceeds = total > threshold
    return exceeds.float()

In [194]:
def db_query(db):
    """Return the mean of ``db`` after casting its entries to float."""
    as_float = db.float()
    return torch.mean(as_float)

In [195]:
# db_result = db_query(db)

In [196]:
# db_result

In [197]:
# Query all the other parallel DBs and compare with the full database query
def sensitivity(query, num_entries=1000, threshold=5, verbose=True):
    """Empirically measure how much ``query`` changes when one record is removed.

    A fresh random database with ``num_entries`` records and all of its
    parallel (one-record-removed) databases are generated. A parallel database
    only contributes to the result when both it and the full database pass the
    ``query_threshold`` gate; for those, the absolute difference between
    ``query(pdb)`` and ``query(db)`` is computed and the largest one is kept.

    Args:
        query: Callable applied to the full database and to each parallel
            database; its results are subtracted and passed to ``torch.abs``.
        num_entries: Number of records in the generated database.
        threshold: Value handed to ``query_threshold`` for the gate
            (defaults to 5, the previously hard-coded value).
        verbose: When true, print the gate result for the full database and
            for every parallel database.

    Returns:
        The largest distance found. A zero tensor is returned when no
        parallel database passed the gate, so the return type no longer
        switches between a Python int and a tensor.
    """
    db, pdbs = generate_db(num_entries)

    full_db_threshold = query_threshold(db, threshold)
    if verbose:
        print(full_db_threshold)

    # The full-database result does not change inside the loop, so compute it
    # once, and only when the full database passes the gate at all.
    full_db_query = query(db) if full_db_threshold else None

    max_dist = torch.tensor(0.)
    for ii, pdb in enumerate(pdbs, start=1):
        pdb_threshold = query_threshold(pdb, threshold)
        if verbose:
            print('pdb', ii, pdb_threshold)

        # Both the full database and this parallel database must pass the gate.
        if full_db_query is None or not pdb_threshold:
            continue

        # L1 distance between the parallel and the full query results.
        db_distance = torch.abs(query(pdb) - full_db_query)
        if db_distance > max_dist:
            max_dist = db_distance

    return max_dist

In [198]:
# Run one trial on a freshly generated 10-record database and print the largest distance found
print(sensitivity(db_query, 10))

tensor(0.)
pdb 1 tensor(0.)
pdb 2 tensor(0.)
pdb 3 tensor(0.)
pdb 4 tensor(0.)
pdb 5 tensor(0.)
pdb 6 tensor(0.)
pdb 7 tensor(0.)
pdb 8 tensor(0.)
pdb 9 tensor(0.)
pdb 10 tensor(0.)
0


In [199]:
# Repeat the trial 10 times; each call generates a new random database of 10 records
for dd in range(10):
    print(sensitivity(db_query, 10))

tensor(0.)
pdb 1 tensor(0.)
pdb 2 tensor(0.)
pdb 3 tensor(0.)
pdb 4 tensor(0.)
pdb 5 tensor(0.)
pdb 6 tensor(0.)
pdb 7 tensor(0.)
pdb 8 tensor(0.)
pdb 9 tensor(0.)
pdb 10 tensor(0.)
0
tensor(1.)
pdb 1 tensor(1.)
pdb 2 tensor(0.)
pdb 3 tensor(0.)
pdb 4 tensor(0.)
pdb 5 tensor(0.)
pdb 6 tensor(0.)
pdb 7 tensor(1.)
pdb 8 tensor(1.)
pdb 9 tensor(0.)
pdb 10 tensor(1.)
tensor(0.0667)
tensor(1.)
pdb 1 tensor(0.)
pdb 2 tensor(1.)
pdb 3 tensor(0.)
pdb 4 tensor(1.)
pdb 5 tensor(0.)
pdb 6 tensor(0.)
pdb 7 tensor(0.)
pdb 8 tensor(0.)
pdb 9 tensor(1.)
pdb 10 tensor(1.)
tensor(0.0667)
tensor(1.)
pdb 1 tensor(1.)
pdb 2 tensor(1.)
pdb 3 tensor(1.)
pdb 4 tensor(1.)
pdb 5 tensor(1.)
pdb 6 tensor(1.)
pdb 7 tensor(1.)
pdb 8 tensor(1.)
pdb 9 tensor(1.)
pdb 10 tensor(1.)
tensor(0.0889)
tensor(1.)
pdb 1 tensor(1.)
pdb 2 tensor(1.)
pdb 3 tensor(0.)
pdb 4 tensor(0.)
pdb 5 tensor(0.)
pdb 6 tensor(1.)
pdb 7 tensor(0.)
pdb 8 tensor(0.)
pdb 9 tensor(1.)
pdb 10 tensor(0.)
tensor(0.0667)
tensor(1.)
pdb 1 tensor(1.)
