In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
import itertools
from collections import defaultdict
import io

In [2]:
def gen_data_set(n, atts, m):
    """n is number of people, atts is dict of attribute, m is number of attributes used"""
    db = []
    heads = list(atts.keys())
    for i in range(min(len(heads), m)):
        db.append(np.random.choice(list(atts[heads[i]]), n))
        
    out_db = pd.DataFrame(np.array(db).T.tolist(), columns = heads[0:min(len(heads), m)])
    out_db.insert(0, "row_key",
                np.random.randint(0, 4, out_db.shape[0]))
    return out_db

In [3]:
np.random.seed(0)

bin_atts = {'Income': {'rich', 'poor'}, 
        'Sex': {'male', 'female'}, 
        'Height': {'tall', 'short'},
        'Age': {'old', 'young'},
        'Race': {'a', 'b'},
        'Married': {'married', 'unmarried'},
        'Work': {'working', 'not_working'}
        }
db = gen_data_set(10, bin_atts, 20)
display(db)

Unnamed: 0,row_key,Income,Sex,Height,Age,Race,Married,Work
0,1,poor,male,tall,young,b,unmarried,not_working
1,1,rich,female,short,old,a,married,working
2,2,rich,female,short,young,b,married,not_working
3,0,poor,male,tall,old,a,married,working
4,0,rich,female,tall,young,a,married,not_working
5,1,rich,female,short,young,a,unmarried,working
6,3,rich,female,short,old,a,married,working
7,0,rich,female,short,young,a,unmarried,working
8,1,rich,female,short,young,b,unmarried,working
9,2,rich,male,tall,old,a,married,working


In [4]:
data = io.StringIO('''
value,cell_key,peturbation
0    ,0       ,-1
0    ,1       ,1
0    ,2       ,0
0    ,3       ,1
1    ,0       ,0
1    ,1       ,1
1    ,2       ,0
1    ,3       ,1
2    ,0       ,-1
2    ,1       ,0
2    ,2       ,-1
2    ,3       ,1
3    ,0       ,0
3    ,1       ,1
3    ,2       ,0
3    ,3       ,-1
''')

ptable = pd.read_csv(data).set_index(["value", "cell_key"])
# display(ptable)

In [5]:
def gen_tables_cell_key(db,atts1,atts2,ptable):
    
    counts = pd.crosstab(
    [db[v] for v in atts1],
    [db[v] for v in atts2],
    dropna=False,
    margins=True)
    
    sum_rkey = pd.pivot_table(
        db,
        values=['row_key'],
        index=atts1,
        columns=atts2,
        aggfunc=np.sum,
        fill_value=0,
        dropna=False,
        margins=True)

    cell_keys = sum_rkey.mod(4)
    
    perturbations = counts * 0
    
    for col_idx in range(counts.shape[0]):
        for row_idx in range(counts.shape[1]):
            cell_key = cell_keys.iat[col_idx, row_idx]
            count = counts.iat[col_idx,row_idx]
            perturbation = 0
            if count > 0:
                perturbation = ptable.loc[count%4, cell_key]
            perturbations.iat[col_idx, row_idx] += perturbation
    perturbed_counts = counts + perturbations
    def apply_color(x):
        colors = {-1: 'lightcoral', 0: 'white', 1: 'lightgreen'}
        return perturbations.applymap(lambda val: 'background-color: {}'.format(colors.get(val,'')))
    
    return perturbed_counts.style.apply(apply_color, axis=None)

In [6]:
def gen_tables(db,in_att1,in_att2,in_att3,noise):
    
    dbs = []
    lst = []
    
    att_dict = defaultdict(list)
    att_noise_dict = defaultdict(lambda: defaultdict(int))
    
    total_noise = np.random.randint(-noise,noise+1)

    for att in [in_att1,in_att2,in_att3]:
        cats = list(set(db[att]))
        att_dict[att] = cats
        inner_dict = defaultdict(int)
        for cat in cats:
            inner_dict[cat] = np.random.randint(-noise,noise+1)
        att_noise_dict[att] = inner_dict
        
    for subset in itertools.combinations([in_att1,in_att2,in_att3], 2):
        lst.append(subset)
    
    for comb in lst:
        att1 = comb[0]
        att2 = comb[1]
        true_db = pd.crosstab(index=db[att1], columns = db[att2], margins = True)
        true_db.style.set_caption(f"True: {att1} vs {att2}")
        copy = pd.DataFrame().reindex_like(true_db)
        noise_db = copy.applymap(lambda x: np.random.randint(-noise,noise+1))
        
        i = 1
        for att in [att1,att2]:
            # print(att)
            for cat in att_dict[att]:
                # print(cat)
                if i == 1: 
                    noise_db.at[cat,'All'] = att_noise_dict[att][cat]
                elif i == -1:
                    noise_db.at['All',cat] = att_noise_dict[att][cat]
            i*=-1
        
        
        noise_db.at['All','All'] = total_noise
        
        print(noise_db)
        
        perturbed_db = true_db.add(noise_db)
        perturbed_db.style.set_caption(f"Perturbed: {att1} vs {att2}")
        # dbs.append([true_db,noise_db,perturbed_db])
        dbs.append([true_db,perturbed_db])
        
    return(dbs)

In [7]:
display(gen_tables_cell_key(db,['Income'],['Sex'],ptable))
display(gen_tables_cell_key(db,['Income','Race'],['Sex'],ptable))

Sex,female,male,All
Income,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
poor,0,2,2
rich,7,1,8
All,7,2,11


Unnamed: 0_level_0,Sex,female,male,All
Income,Race,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
poor,a,0,1,1
poor,b,0,2,2
rich,a,6,1,7
rich,b,3,0,3
All,,7,2,11


In [223]:
for i in gen_tables(db,'Income',"Sex","Race",1):
    print("____________________")
    for j in i:
        display(j)

Sex     female  male  All
Income                   
poor        -1     0    0
rich         0     1   -1
All          0     0   -1
Race    a  b  All
Income           
poor    0  0    0
rich    1  1   -1
All    -1 -1   -1
Race    a  b  All
Sex              
female  1  0    0
male   -1  0    0
All    -1 -1   -1
____________________


Sex,female,male,All
Income,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
poor,2,3,5
rich,2,3,5
All,4,6,10


Sex,female,male,All
Income,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
poor,1,3,5
rich,2,4,4
All,4,6,9


____________________


Race,a,b,All
Income,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
poor,2,3,5
rich,1,4,5
All,3,7,10


Race,a,b,All
Income,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
poor,2,3,5
rich,2,5,4
All,2,6,9


____________________


Race,a,b,All
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,2,2,4
male,1,5,6
All,3,7,10


Race,a,b,All
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,3,2,4
male,0,5,6
All,2,6,9
