In [1]:
"""
Module to test for t-closeness in the dataset.
"""

import pandas as pd
import pytest


def read_example_dataset():
    """
    Read example dataset
    """
    example_dataframe = pd.read_csv('IT Salary Survey EU  2020.csv')
    return example_dataframe
    


In [99]:
dataset = pd.read_csv("HW3.csv")
q_identifiers = ['Sex', 'Age']
sensitive_column = 'Drinks/Day'

#Requires s_attribute to be integers. If d_metric is 0, use equal ground distance. Otherwise use euclidean distance.
def get_t_closeness(dataset, q_identifiers, s_attribute, d_metric):
    #1. Need proportion of each answer from full table
    #2. Split data up into each equivalence class
    #2a. For each equivalence class, calculate the proportion of each answer
    #2b. Find the difference of the full and equivalence class proportions
    #2c. Use the formula associated with each distance metric to output t-closeness for that equivalence class
    full_prop = get_proportions(dataset[s_attribute])
    grouped = dataset.groupby(q_identifiers)
    t_list = []
    for name, group in grouped:
        t_list.append([name, get_t_closeness_eqv(group, full_prop, s_attribute, d_metric)])
    t_list = pd.DataFrame(t_list)
    return max(pd.DataFrame(t_list)[1])
    

# Dataset to compare, full proportions, sensitive attribute, distance metric
def get_t_closeness_eqv(dataset, full, s_att, d_metric):
    s = dataset[s_att]
    eqv_prop = get_proportions(s)
    diff_prop = (full - eqv_prop).fillna(full)
    t = 0
    if d_metric == 0:
        t = sum(abs(diff_prop))/(len(diff_prop)-1)
    else:
        ind = diff_prop.index
        r_sum = 0
        r = []
        for i in range(min(ind), max(ind) + 1):
            try:
                r.append(diff_prop.loc[i])
            except KeyError:
                r.append(0)
            r_sum += abs(sum(r))
        t = r_sum/(len(r)-1)
    return t
    
        

def get_proportions(s):
    s_counts = s.value_counts()
    s_counts = s_counts.sort_index()
    return s_counts/sum(s_counts)
        
    
        


In [100]:
eqv_test = dataset[(dataset.Sex == 'M') & (dataset.Age == 31)]
x = get_proportions(dataset[sensitive_column])
prop = get_t_closeness_eqv(eqv_test, x, sensitive_column, 0)
prop_2 = get_t_closeness_eqv(eqv_test, x, sensitive_column, 1)
prop_2

0.11868686868686872

In [101]:
print(get_t_closeness(dataset, q_identifiers, sensitive_column, 0))
print(get_t_closeness(dataset, q_identifiers, sensitive_column, 1))

0.09791666666666667
0.11868686868686872


In [104]:
test = get_t_closeness(dataset, q_identifiers, sensitive_column, 0)
test

0.09791666666666667

In [109]:
df = read_example_dataset()
df['Total years of experience'] = pd.to_numeric(df['Total years of experience'], 
                                                errors = 'coerce', 
                                                downcast = 'integer').fillna(0).astype(int)
df

Unnamed: 0,Timestamp,Age,Gender,City,Position,Total years of experience,Years of experience in Germany,Seniority level,Your main technology / programming language,Other technologies/programming languages you use often,...,Annual bonus+stocks one year ago. Only answer if staying in same country,Number of vacation days,Employment status,Сontract duration,Main language at work,Company size,Company type,Have you lost your job due to the coronavirus outbreak?,"Have you been forced to have a shorter working week (Kurzarbeit)? If yes, how many hours per week","Have you received additional monetary support from your employer due to Work From Home? If yes, how much in 2020 in EUR"
0,24/11/2020 11:14:15,26.0,Male,Munich,Software Engineer,5,3,Senior,TypeScript,"Kotlin, Javascript / Typescript",...,10000,30,Full-time employee,Unlimited contract,English,51-100,Product,No,,
1,24/11/2020 11:14:16,26.0,Male,Berlin,Backend Developer,7,4,Senior,Ruby,,...,5000,28,Full-time employee,Unlimited contract,English,101-1000,Product,No,,
2,24/11/2020 11:14:21,29.0,Male,Berlin,Software Engineer,12,6,Lead,Javascript / Typescript,"Javascript / Typescript, Docker",...,100000,30,Self-employed (freelancer),Temporary contract,English,101-1000,Product,Yes,,
3,24/11/2020 11:15:24,28.0,Male,Berlin,Frontend Developer,4,1,Junior,Javascript,,...,,24,Full-time employee,Unlimited contract,English,51-100,Startup,No,,
4,24/11/2020 11:15:46,37.0,Male,Berlin,Backend Developer,17,6,Senior,C# .NET,".NET, SQL, AWS, Docker",...,,29,Full-time employee,Unlimited contract,English,101-1000,Product,No,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1248,18/01/2021 15:05:50,31.0,Male,Berlin,Backend Developer,9,5,Senior,Java,"Python, Javascript / Typescript, Java / Scala,...",...,72000,26,Full-time employee,Unlimited contract,English,51-100,Product,Yes,,
1249,18/01/2021 17:46:02,33.0,Male,Berlin,Researcher/ Consumer Insights Analyst,10,1.5,Senior,consumer analysis,,...,2500,unlimited,Full-time employee,Unlimited contract,English,1000+,Product,No,,0
1250,18/01/2021 23:20:35,39.0,Male,Munich,IT Operations Manager,15,2,Lead,PHP,"Python, C/C++, Javascript / Typescript, Java /...",...,,28,Full-time employee,Unlimited contract,English,101-1000,eCommerce,No,,
1251,19/01/2021 10:17:58,26.0,Male,Saarbrücken,Frontend Developer,7,7,Middle,JavaScript,"Javascript / Typescript, Docker, HTML, CSS; Ad...",...,36400,27,Full-time employee,Unlimited contract,German,101-1000,Product,No,,0


In [110]:

print(get_t_closeness(df, ['Age', 'City'], 'Total years of experience', 0))
print(get_t_closeness(df, ['Age', 'City'], 'Total years of experience', 1))

0.06055769184260804
0.9767492743264744


In [107]:
df = read_example_dataset()
df

Unnamed: 0,Timestamp,Age,Gender,City,Position,Total years of experience,Years of experience in Germany,Seniority level,Your main technology / programming language,Other technologies/programming languages you use often,...,Annual bonus+stocks one year ago. Only answer if staying in same country,Number of vacation days,Employment status,Сontract duration,Main language at work,Company size,Company type,Have you lost your job due to the coronavirus outbreak?,"Have you been forced to have a shorter working week (Kurzarbeit)? If yes, how many hours per week","Have you received additional monetary support from your employer due to Work From Home? If yes, how much in 2020 in EUR"
0,24/11/2020 11:14:15,26.0,Male,Munich,Software Engineer,5,3,Senior,TypeScript,"Kotlin, Javascript / Typescript",...,10000,30,Full-time employee,Unlimited contract,English,51-100,Product,No,,
1,24/11/2020 11:14:16,26.0,Male,Berlin,Backend Developer,7,4,Senior,Ruby,,...,5000,28,Full-time employee,Unlimited contract,English,101-1000,Product,No,,
2,24/11/2020 11:14:21,29.0,Male,Berlin,Software Engineer,12,6,Lead,Javascript / Typescript,"Javascript / Typescript, Docker",...,100000,30,Self-employed (freelancer),Temporary contract,English,101-1000,Product,Yes,,
3,24/11/2020 11:15:24,28.0,Male,Berlin,Frontend Developer,4,1,Junior,Javascript,,...,,24,Full-time employee,Unlimited contract,English,51-100,Startup,No,,
4,24/11/2020 11:15:46,37.0,Male,Berlin,Backend Developer,17,6,Senior,C# .NET,".NET, SQL, AWS, Docker",...,,29,Full-time employee,Unlimited contract,English,101-1000,Product,No,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1248,18/01/2021 15:05:50,31.0,Male,Berlin,Backend Developer,9,5,Senior,Java,"Python, Javascript / Typescript, Java / Scala,...",...,72000,26,Full-time employee,Unlimited contract,English,51-100,Product,Yes,,
1249,18/01/2021 17:46:02,33.0,Male,Berlin,Researcher/ Consumer Insights Analyst,10,1.5,Senior,consumer analysis,,...,2500,unlimited,Full-time employee,Unlimited contract,English,1000+,Product,No,,0
1250,18/01/2021 23:20:35,39.0,Male,Munich,IT Operations Manager,15,2,Lead,PHP,"Python, C/C++, Javascript / Typescript, Java /...",...,,28,Full-time employee,Unlimited contract,English,101-1000,eCommerce,No,,
1251,19/01/2021 10:17:58,26.0,Male,Saarbrücken,Frontend Developer,7,7,Middle,JavaScript,"Javascript / Typescript, Docker, HTML, CSS; Ad...",...,36400,27,Full-time employee,Unlimited contract,German,101-1000,Product,No,,0


In [108]:
test = get_t_closeness(dataset, q_identifiers, sensitive_column, 1)
testdf = pd.DataFrame(test)
testdf.columns = ['q_identifiers', 't_closeness']
print(testdf)
max(testdf['t_closeness'])

ValueError: DataFrame constructor not properly called!