In [1]:
import numpy as np
import galois
import random
import pandas as pd
# Presumably the intended key size in bits.
# NOTE(review): no cell visible in this notebook reads this global -- `keygen` takes
# its own `bits` parameter and the survey server calls `keygen(32)` directly.
# Confirm whether this constant is still needed.
bits = 128

from collections import defaultdict
from collections import Counter
from collections import namedtuple
from operator import truediv

In [2]:
def keygen(bits):
    """Generates a Paillier key pair. Returns a pair: (secret key, public key).

    Two primes are requested from `galois.random_prime` with `int(bits/2)` bits each,
    so `bits` controls the size of the modulus `n = p*q`. It is NOT a security level:
    the security of the scheme rests on the difficulty of factoring `n`.

    Returns:
        sk: `(lamb, mu)` with `lamb = (p-1)*(q-1)` and `mu = lamb^-1 mod n`.
        pk: `(n, g)` with `g = n + 1`.

    Raises:
        AssertionError: if `lamb` is not invertible modulo `n`.
    """
    def invmod(x, m):
        """Return the inverse of `x` modulo `m`, normalised into `[0, m)`."""
        gcd, s, t = galois.egcd(x, m)
        assert gcd == 1
        # The Bezout coefficient may be negative; reduce it to the canonical residue.
        return s % m

    prime_bits = int(bits/2)
    p = galois.random_prime(prime_bits)
    q = galois.random_prime(prime_bits)
    # The two primes must be distinct: with p == q the modulus is a prime square,
    # `lamb` is no longer a multiple of the relevant group order and decryption
    # returns garbage. For small prime sizes a collision is a realistic event.
    while q == p:
        q = galois.random_prime(prime_bits)

    n = p*q
    g = n+1
    lamb = (p-1) * (q-1)
    mu = invmod(lamb, n)

    sk = (lamb, mu)
    pk = (n, g)
    return sk, pk

def encrypt(m, pk):
    """Encrypts the message `m` with public key `pk`.

    Computes `g^m * r^n mod n^2` for a fresh random `r` that is a unit modulo `n`.

    Args:
        m: integer plaintext; decryption recovers it modulo `n`.
        pk: public key `(n, g)`.

    Returns:
        The ciphertext as an integer in `[1, n^2)`.
    """
    from math import gcd  # local import keeps this cell self-contained

    n, g = pk
    n_sq = n**2
    # The blinding factor must be invertible modulo n. `random.randint` is inclusive
    # on both ends, so the upper bound is n - 1 (r == n would zero the ciphertext),
    # and any r sharing a factor with n is rejected because r^(n*lamb) would then
    # not reduce to 1 during decryption, corrupting the recovered plaintext.
    # NOTE(review): `random` is not a cryptographically secure generator; that is
    # acceptable for a demonstration only.
    while True:
        r = random.randint(1, n - 1)
        if gcd(r, n) == 1:
            break
    c = (pow(g, m, n_sq) * pow(r, n, n_sq)) % n_sq
    return c

def decrypt(c, sk, pk):
    """Recovers the plaintext behind ciphertext `c`.

    `sk` is the pair `(lamb, mu)` and `pk` the pair `(n, g)`. The result is
    `L(c^lamb mod n^2) * mu mod n`, where `L(x) = (x - 1) // n`.
    """
    exponent, scale = sk
    modulus, _generator = pk
    raised = pow(c, exponent, modulus * modulus)
    quotient = (raised - 1) // modulus
    return quotient * scale % modulus

def e_add_vec(ct, pk):
    """Homomorphically adds all ciphertexts in `ct` and returns the combined ciphertext.

    Multiplying ciphertexts modulo `n^2` adds the underlying plaintexts, so the
    result decrypts to the sum of the encrypted values (modulo `n`).

    Args:
        ct: sequence of integer ciphertexts (list, tuple or 1-D array). May hold
            any number of elements, including one or none.
        pk: public key `(n, g)`.

    Returns:
        The product of all ciphertexts modulo `n^2` as a Python int. An empty
        sequence yields 1, which is the ciphertext-side identity and decrypts to 0.
    """
    n, g = pk
    n_sq = n**2
    total = 1
    for c in ct:
        # int() forces arbitrary-precision arithmetic; fixed-width numpy integers
        # would silently wrap around when two large ciphertexts are multiplied.
        total = (total * int(c)) % n_sq
    return total

In [53]:
class GenderPayGapSurveyParticipant:
    """A survey respondent who sends one encrypted salary record to a server."""

    def submit_salary(self, salary, gender, age, server):
        """Submits an encrypted survey response to the server.

        The response consists of two 2 x 10 tables of ciphertexts: one holding the
        salary and one holding a head-count of 1, each placed in the slot selected
        by gender (row 1 = "Male", row 0 = "Female") and by ten-year age band
        (16-25, 26-35, ..., 106-115). Every other slot is an encryption of 0.
        An age outside 16..115 yields tables that encrypt zeros only.

        Raises:
            KeyError: if `gender` is neither "Male" nor "Female".
        """
        public_key = server.get_public_key()

        # Lower edges of the ten-year bands plus the final upper bound: 16, 26, ..., 116.
        band_edges = list(range(16, 117, 10))
        band_limits = list(zip(band_edges[:-1], band_edges[1:]))

        # One-hot placement of the salary and of the head-count along the age axis.
        salary_row = [salary if age >= low and age < high else 0 for low, high in band_limits]
        count_row = [1 if age >= low and age < high else 0 for low, high in band_limits]

        gender_row = {"Male": 1, "Female": 0}[gender]

        # The row of the other gender is all zeros. All 2 x 10 slots are encrypted,
        # zeros included, so that (as the original author intended) the position of
        # the real value is not visible in the submission.
        blank_row = [0] * len(band_limits)
        plain_salaries = [salary_row if row == gender_row else blank_row for row in (0, 1)]
        plain_counts = [count_row if row == gender_row else blank_row for row in (0, 1)]

        encrypted_salaries = []
        for row_values in plain_salaries:
            encrypted_salaries.append([encrypt(value, public_key) for value in row_values])

        encrypted_counts = []
        for row_values in plain_counts:
            encrypted_counts.append([encrypt(value, public_key) for value in row_values])

        server.submit_salary(encrypted_salaries, encrypted_counts)
        
class GenderPayGapSurveyServer:
    """Collects encrypted survey responses and reports average salaries.

    A response, as produced by `GenderPayGapSurveyParticipant`, is a pair of nested
    lists of ciphertexts indexed as `[gender][age band]` (gender 0 = female,
    1 = male): one holding salaries, one holding head-counts. The server only ever
    decrypts per-band totals, never an individual response.
    """

    # Row labels of the result table, one per age band, in submission column order.
    AGE_BAND_LABELS = ('16-25', '26-35', '36-45', '46-55', '56-65', '66-75',
                       '76-85', '86-95', '96-105', '106-115')

    def __init__(self, key_bits=32):
        """Creates an empty survey with a fresh key pair.

        Args:
            key_bits: forwarded to `keygen`. The default of 32 matches the value
                that used to be hard-coded here. Decrypted totals are reduced
                modulo `n`, so the modulus must stay larger than any salary total
                of a single gender/age-band cell.
        """
        self.salaries = []
        self.band_cts = []
        self.sk, self.pk = keygen(key_bits)

    def get_public_key(self):
        """Returns the public key participants must encrypt with."""
        return self.pk

    def submit_salary(self, ct_salary_vector, ct_gender_vector):
        """Store an entry in the survey.

        `ct_salary_vector` holds the encrypted salaries and `ct_gender_vector` the
        encrypted head-counts of the same response (the parameter name is historic).
        """
        self.salaries.append(ct_salary_vector)
        self.band_cts.append(ct_gender_vector)

    def show_salaries(self):
        """Display the (encrypted) submitted salaries"""
        return self.salaries

    def _decrypt_band_total(self, submissions, gender_idx, band_idx):
        """Decrypts the homomorphic sum of one gender/age-band cell over all submissions."""
        # Plain Python ints keep arbitrary precision; a fixed-width numpy array
        # could overflow silently when ciphertexts are multiplied.
        ciphertexts = [int(entry[gender_idx][band_idx]) for entry in submissions]
        if not ciphertexts:
            return 0
        # 1 is a valid encryption of zero and the identity of ciphertext
        # multiplication. Prepending it lets a survey with a single response be
        # tallied without changing the result.
        combined = e_add_vec([1] + ciphertexts, self.pk)
        return decrypt(combined, self.sk, self.pk)

    @staticmethod
    def _band_averages(totals, counts):
        """Divides salary totals by head-counts, reporting 0 for empty bands."""
        return [round(total / count, 2) if total != 0 and count != 0 else 0
                for total, count in zip(totals, counts)]

    def compute_average_salaries(self):
        """Tally the results, decrypt, and return the average salary per age band.

        Returns:
            A `pandas.DataFrame` indexed by age band ('Age Bands') with the columns
            "Female Salaries" and "Male Salaries". Bands without any response
            report 0. A survey without submissions yields a table of zeros.
        """
        bands = range(len(self.AGE_BAND_LABELS))

        female_totals = [self._decrypt_band_total(self.salaries, 0, band) for band in bands]
        female_counts = [self._decrypt_band_total(self.band_cts, 0, band) for band in bands]
        male_totals = [self._decrypt_band_total(self.salaries, 1, band) for band in bands]
        male_counts = [self._decrypt_band_total(self.band_cts, 1, band) for band in bands]

        sal_info = {"Female Salaries": self._band_averages(female_totals, female_counts),
                    "Male Salaries": self._band_averages(male_totals, male_counts)}
        salary_df = pd.DataFrame(sal_info, index=list(self.AGE_BAND_LABELS))
        salary_df.index.name = 'Age Bands'
        return salary_df

In [54]:
# Smoke test: five hand-written responses as (salary, gender, age), tallied under encryption.
s = GenderPayGapSurveyServer()
demo_responses = [
    (10000, 'Male', 22),
    (30000, 'Female', 16),
    (15000, 'Male', 23),
    (20000, 'Female', 65),
    (20000, 'Female', 70),
]
for demo_salary, demo_gender, demo_age in demo_responses:
    GenderPayGapSurveyParticipant().submit_salary(demo_salary, demo_gender, demo_age, s)
s.compute_average_salaries()

Unnamed: 0_level_0,Female Salaries,Male Salaries
Age Bands,Unnamed: 1_level_1,Unnamed: 2_level_1
16-25,30000.0,12500.0
26-35,0.0,0.0
36-45,0.0,0.0
46-55,0.0,0.0
56-65,20000.0,0.0
66-75,20000.0,0.0
76-85,0.0,0.0
86-95,0.0,0.0
96-105,0.0,0.0
106-115,0.0,0.0


In [62]:
# Load the salary dataset and discard every row that has a missing value.
salary_df = pd.read_csv('Salary_Data.csv').dropna()
salary_df

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0
...,...,...,...,...,...,...
370,35.0,Female,Bachelor's,Senior Marketing Analyst,8.0,85000.0
371,43.0,Male,Master's,Director of Operations,19.0,170000.0
372,29.0,Female,Bachelor's,Junior Project Manager,2.0,40000.0
373,34.0,Male,Bachelor's,Senior Operations Coordinator,7.0,90000.0


In [63]:
g = GenderPayGapSurveyServer()
# Walk the rows themselves instead of guessing index labels: after `dropna()` the
# index has gaps, so label look-ups over `range(len(...) + 2)` only worked for this
# exact file and needed a blanket `except KeyError` that would also hide real errors.
for row_salary, row_gender, row_age in zip(salary_df['Salary'], salary_df['Gender'], salary_df['Age']):
    # The survey only models these two categories; other values are skipped
    # explicitly (previously they were dropped silently via the swallowed KeyError).
    if row_gender not in ('Male', 'Female'):
        continue
    GenderPayGapSurveyParticipant().submit_salary(int(row_salary), row_gender, int(row_age), g)
g.compute_average_salaries()

Unnamed: 0_level_0,Female Salaries,Male Salaries
Age Bands,Unnamed: 1_level_1,Unnamed: 2_level_1
16-25,36666.67,35000.0
26-35,57430.56,64893.96
36-45,114695.12,123253.97
46-55,168863.64,172297.3
56-65,0.0,0.0
66-75,0.0,0.0
76-85,0.0,0.0
86-95,0.0,0.0
96-105,0.0,0.0
106-115,0.0,0.0


In [72]:
def getting_df(df: pd.DataFrame, lower: int, upper: int):
    """Returns the rows of `df` whose 'Age' lies in the closed interval [lower, upper]."""
    ages = df['Age']
    within_band = (ages >= lower) & (ages <= upper)
    return df[within_band]

def get_salary_means(df: pd.DataFrame, band:tuple):
    """Returns `(female mean, male mean)` of the 'Salary' column, rounded to 2 decimals.

    `band` is accepted for call-site symmetry but is not used; `df` is expected to be
    filtered to the desired age band already. A gender without rows yields NaN.
    """
    def rounded_mean(gender_label):
        salaries = df.loc[df['Gender'] == gender_label, 'Salary']
        return round(salaries.mean(), 2)

    return rounded_mean('Female'), rounded_mean('Male')

# Plaintext cross-check: the same per-band averages computed directly with pandas,
# for comparison against the table produced by the encrypted survey.
age = [(band_start, band_start + 9) for band_start in range(16, 107, 10)]

female_sal = []
male_sal = []
for lower, upper in age:
    band_female, band_male = get_salary_means(getting_df(salary_df, lower, upper), (lower, upper))
    female_sal.append(band_female)
    male_sal.append(band_male)

age_bands_ranges = [f'{band_low}-{band_high}' for band_low, band_high in age]
sal_info = {"Female Salaries": female_sal, "Male Salaries": male_sal}
salary_mean_df = pd.DataFrame(sal_info, index=age_bands_ranges)
salary_mean_df.index.name = 'Age Bands'
# Bands without respondents have a NaN mean; show them as 0 like the encrypted tally does.
salary_mean_df = salary_mean_df.fillna(0)
salary_mean_df


Unnamed: 0_level_0,Female Salaries,Male Salaries
Age Bands,Unnamed: 1_level_1,Unnamed: 2_level_1
16-25,36666.67,35000.0
26-35,57430.56,64893.96
36-45,114695.12,123253.97
46-55,168863.64,172297.3
56-65,0.0,0.0
66-75,0.0,0.0
76-85,0.0,0.0
86-95,0.0,0.0
96-105,0.0,0.0
106-115,0.0,0.0
