In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import matplotlib
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score, plot_roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn import naive_bayes
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline

In [None]:
df = pd.read_csv("https://github.com/Anjana-premkumar/K_anonymity/blob/1fe78e94edf002d530bc2138280058bda88054b6/survey.csv?raw=true")
df.drop(columns=['Timestamp', 'Country', 'state', 'comments'], inplace = True)
df.drop(df[df['Age'] < 0].index, inplace = True) 
df.drop(df[df['Age'] > 100].index, inplace = True)

df = df[df['work_interfere'].notna()]


df['self_employed'] = df['self_employed'].fillna('No')
df['work_interfere']=df['work_interfere'].replace(['Sometimes','Never','Rarely','Often'],[1,0,1,1])
print(df['work_interfere'].unique())


df['Gender'].replace(['Male ', 'male', 'M', 'm', 'Male', 'Cis Male',
                     'Man', 'cis male', 'Mail', 'Male-ish', 'Male (CIS)',
                      'Cis Man', 'msle', 'Malr', 'Mal', 'maile', 'Make',], 'Male', inplace = True)

df['Gender'].replace(['Female ', 'female', 'F', 'f', 'Woman', 'Female',
                     'femail', 'Cis Female', 'cis-female/femme', 'Femake', 'Female (cis)',
                     'woman',], 'Female', inplace = True)

df["Gender"].replace(['Female (trans)', 'queer/she/they', 'non-binary',
                     'fluid', 'queer', 'Androgyne', 'Trans-female', 'male leaning androgynous',
                      'Agender', 'A little about you', 'Nah', 'All',
                      'ostensibly male, unsure what that really means',
                      'Genderqueer', 'Enby', 'p', 'Neuter', 'something kinda male?',
                      'Guy (-ish) ^_^', 'Trans woman',], 'Other', inplace = True)

df.rename({'self_employed' : 'Self_Employed', 'family_history' : 'Family_History', 
           'treatment' : 'Treatment', 'work_interfere' : 'Work_Interfere', 
           'no_employees': 'Employee_Numbers', 'remote_work': 'Remote_Work', 'tech_company': 'Tech_Company', 
           'benefits': 'Benefits', 'care_options': 'Care_Options', 'wellness_program': 'Wellness_Program', 
           'seek_help': 'Seek_Help', 'anonymity': 'Anonymity', 'leave': 'Medical_Leave', 
           'mental_health_consequence': 'Mental_Health_Consequence', 
           'phys_health_consequence': 'Physical_Health_Consequence', 'coworkers': 'Coworkers', 
           'supervisor': 'Supervisor', 'mental_health_interview': 'Mental_Health_Interview', 
           'phys_health_interview': 'Physical_Health_Interview', 'mental_vs_physical': 'Mental_VS_Physical', 
           'obs_consequence': 'Observed_Consequence'} , inplace = True , axis = 1)



mode_onehot_pipe = Pipeline([
    ('encoder', SimpleImputer(strategy = 'most_frequent')),
    ('one hot encoder', OneHotEncoder(handle_unknown = 'ignore'))])

transformer = ColumnTransformer([
    ('one hot', OneHotEncoder(handle_unknown = 'ignore'), ['Gender', 'Family_History', 'Employee_Numbers',
                                                           'Remote_Work', 'Tech_Company', 'Benefits', 'Care_Options',
                                                           'Wellness_Program', 'Seek_Help', 'Anonymity',
                                                           'Medical_Leave', 'Mental_Health_Consequence',
                                                           'Physical_Health_Consequence', 'Coworkers', 'Supervisor',
                                                           'Mental_Health_Interview', 'Physical_Health_Interview',
                                                           'Mental_VS_Physical', 'Observed_Consequence','Treatment','Self_Employed']),
    #('mode_onehot_pipe', mode_onehot_pipe, ['Self_Employed', 'Treatment']),
    ('iterative', IterativeImputer(max_iter = 10, random_state = 0), ['Age'])])


X = df.drop('Work_Interfere', axis = 1)
y = df['Work_Interfere']

[1 0]


In [None]:
len(df)

991

In [None]:
df.head()

Unnamed: 0,Age,Gender,Self_Employed,Family_History,Treatment,Work_Interfere,Employee_Numbers,Remote_Work,Tech_Company,Benefits,...,Anonymity,Medical_Leave,Mental_Health_Consequence,Physical_Health_Consequence,Coworkers,Supervisor,Mental_Health_Interview,Physical_Health_Interview,Mental_VS_Physical,Observed_Consequence
0,37,Female,No,No,Yes,1,6-25,No,Yes,Yes,...,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No
1,44,Male,No,No,No,1,More than 1000,No,No,Don't know,...,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No
2,32,Male,No,No,No,1,6-25,No,Yes,No,...,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No
3,31,Male,No,Yes,Yes,1,26-100,No,Yes,No,...,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes
4,31,Male,No,No,No,0,100-500,Yes,Yes,Yes,...,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No


In [None]:

column_names=['Age','Gender','Self_Employed', 'Family_History','Treatment','Work_Interfere', 'Employee_Numbers',
                                                           'Remote_Work', 'Tech_Company', 'Benefits', 'Care_Options',
                                                           'Wellness_Program', 'Seek_Help', 'Anonymity',
                                                           'Medical_Leave', 'Mental_Health_Consequence',
                                                           'Physical_Health_Consequence', 'Coworkers', 'Supervisor',
                                                           'Mental_Health_Interview', 'Physical_Health_Interview',
                                                           'Mental_VS_Physical', 'Observed_Consequence']

categorical=['Gender','Self_Employed', 'Family_History','Treatment', 'Employee_Numbers',
                                                           'Remote_Work', 'Tech_Company', 'Benefits', 'Care_Options',
                                                           'Wellness_Program', 'Seek_Help', 'Anonymity',
                                                           'Medical_Leave', 'Mental_Health_Consequence',
                                                           'Physical_Health_Consequence', 'Coworkers', 'Supervisor',
                                                           'Mental_Health_Interview', 'Physical_Health_Interview',
                                                           'Mental_VS_Physical', 'Observed_Consequence']
numeric=["Age",'Work_Interfere']
#encapsulating columns with indices in a dictionary
value = zip([x for x in range(23)], column_names)
column_dict = dict(value)


categorical_dict = {k: v for k, v in column_dict.items() if v in categorical}
categorical_indices = [k for k, v in categorical_dict.items()]

#Find the number of tuples that have the same attribute values
m = df.groupby(df.columns.tolist(),as_index=False).size()['size'].max() 
print(m)



2


In [None]:
def model_evaluation(model, metric):
    model_cv = cross_val_score(model, X_train, y_train, cv = StratifiedKFold(n_splits = 5), scoring = metric)
    return model_cv

In [None]:
transformer = ColumnTransformer([
    ('one hot', OneHotEncoder(handle_unknown = 'ignore'), ['Gender', 'Family_History', 'Employee_Numbers',
                                                           'Remote_Work', 'Tech_Company', 'Benefits', 'Care_Options',
                                                           'Wellness_Program', 'Seek_Help', 'Anonymity',
                                                           'Medical_Leave', 'Mental_Health_Consequence',
                                                           'Physical_Health_Consequence', 'Coworkers', 'Supervisor',
                                                           'Mental_Health_Interview', 'Physical_Health_Interview',
                                                           'Mental_VS_Physical', 'Observed_Consequence','Treatment','Self_Employed']),
    
    ('iterative', IterativeImputer(max_iter = 10, random_state = 0), ['Age'])])

X = df.drop('Work_Interfere', axis = 1)
y = df['Work_Interfere']

X_train, X_test, y_train, y_test = train_test_split(X,y,stratify = y,test_size = 0.3,random_state = 2222)
tree = DecisionTreeClassifier(random_state = 2222)
nb = naive_bayes.GaussianNB()
knn = KNeighborsClassifier()


knn_pipe = Pipeline([('transformer', transformer), ('knn', knn)])
tree_pipe = Pipeline([('transformer', transformer), ('tree', tree)])
nb_pipe = Pipeline([('transformer', transformer), ('nb', nb)])
tree_pipe_cv = model_evaluation(tree_pipe, 'recall')
nb_cv= model_evaluation(nb_pipe, 'recall')
knn_pipe_cv = model_evaluation(knn_pipe, 'recall')


for model in [ tree_pipe,nb_pipe,knn_pipe]:
    model.fit(X_train, y_train)

score_cv = [ tree_pipe_cv.round(10),nb_cv.round(10),knn_pipe_cv.round(10)]
score_mean = [ tree_pipe_cv.mean(),nb_cv.mean(),knn_pipe_cv.mean()]
score_std = [ tree_pipe_cv.std(),nb_cv.std(),knn_pipe_cv.std()]
score_recall_score = [recall_score(y_test, tree_pipe.predict(X_test),pos_label='positive',average='micro'),recall_score(y_test, nb_pipe.predict(X_test),pos_label='positive',average='micro'),  recall_score(y_test, knn_pipe.predict(X_test))]
method_name = [ 'Decision Tree Classifier', 'Naive Bayes Classifier','KNN Classifier']
cv_summary = pd.DataFrame({
    'method': method_name,
    'cv score': score_cv,
    'mean score': score_mean,
    'std score': score_std,
    'recall score': score_recall_score
})

In [None]:
cv_summary

Unnamed: 0,method,cv score,mean score,std score,recall score
0,Decision Tree Classifier,"[0.7798165138, 0.8256880734, 0.8440366972, 0.8...",0.827302,0.033687,0.785235
1,Naive Bayes Classifier,"[0.5963302752, 0.5596330275, 0.4311926606, 0.4...",0.525773,0.057294,0.597315
2,KNN Classifier,"[0.9266055046, 0.8990825688, 0.9266055046, 0.8...",0.917312,0.015211,0.918803


In [None]:
# here each tuple is unique and can be identifable
df1 = np.array(df)

In [None]:
from tqdm import tqdm

class K_anonymity:


  def __init__(self ,df, column_dict,k_val,categorical_dict,categorical_indices):
    self.column_dict=column_dict
    self.k= k_val
    self.categorical_dict=categorical_dict
    self.df= df
    self.categorical_indices=categorical_indices


  def span(self,dim,partition):

    print("*********************** span function")
    
    print("dimension",dim)
    print("categorical", self.categorical_indices)
    total_span= dict()

    for each_col_in_list in dim:
      df_p=self.df[partition,each_col_in_list]
      print("\ncolumn",each_col_in_list,"\ndfp",df_p)
      if each_col_in_list in  self.categorical_indices:
        column_span= len(np.unique(df_p))
      else:
        column_span=round(np.max(df_p)-np.min(df_p),2)
      total_span[each_col_in_list]= column_span
      print("total spans", total_span)

    return total_span


  def split(self, dim, partition, mode):
    
      print("***************** split function")
      print("\npartition",partition, "\nmode", mode, "\ndim",dim)
      df_partition= self.df[partition, dim]
      print(df_partition)
      unique_col_val= list(np.unique(df_partition))
      print("\nunique values per column",unique_col_val,len(unique_col_val))
      len_unique_col_val= len(unique_col_val)
      
      
      lhs_list=[]
      rhs_list=[]
      lhs=[]
      rhs=[]
      x=[]
      y=[]
      
      if mode=='strict':
        if dim in  self.categorical_indices:
          left_value= unique_col_val[:len_unique_col_val//2]
          right_value= unique_col_val[len_unique_col_val//2:]
          left_side= list(list(np.where(np.isin(df_partition,left_value)))[0])
          right_side= list(list(np.where(np.isin(df_partition,right_value)))[0])
          for i in left_side:
            lhs_list.append(partition[i])
          for j in right_side:
            rhs_list.append(partition[j])
        else: #numeric
            median= np.median(df_partition)
            print("median",median)
            print("lhs_v",list(list(np.where(df_partition < median))[0]))
            print("rhs_v",list(list(np.where(df_partition >= median))[0]))
            lhs_v = list(list(np.where(df_partition < median))[0])
            rhs_v = list(list(np.where(df_partition >= median))[0])
            print("partition",partition)
            lhs = [partition[i] for i in lhs_v]
            rhs = [partition[i] for i in rhs_v]
            
      elif mode=='relaxed':
        if dim in  self.categorical_indices:
          print("categ")
          left_value= unique_col_val[:len_unique_col_val//2]
          right_value= unique_col_val[len_unique_col_val//2:]
          left_side= list(list(np.where(np.isin(df_partition,left_value)))[0])
          right_side= list(list(np.where(np.isin(df_partition,right_value)))[0])

          for i in left_side:
            lhs_list.append(partition[i])
          for j in right_side:
            rhs_list.append(partition[j])
          difference= len(lhs_list)-len(rhs_list)
          if difference==0:
            pass
          elif difference<0: #rhs to lhs , rhs has more
            lhs1= rhs_list[:(np.abs(difference)//2)]
            rhs_list=rhs_list[(np.abs(difference)//2):] 
            lhs_list=np.concatenate((lhs_list,lhs1))

          else: #lhs to rhs, lhs is more
            rhs1= lhs_list[-(difference//2):]
            lhs= lhs_list[:-(difference//2)]
            rhs= np.concatenate((rhs_list,rhs1))
            
        else: #numeric
            median= np.median(df_partition)
            print("numeric")
            print("median",median)
            print("first list lehs_v",list(np.where(df_partition < median)))
            print("lhs_v",list(list(np.where(df_partition < median))[0]))
            print("rhs_v",list(list(np.where(df_partition >= median))[0]))
            print("median_v",list(list(np.where(df_partition == median))[0]))
             
            lhs_v = list(list(np.where(df_partition < median))[0])
            rhs_v = list(list(np.where(df_partition > median))[0])
            median_v = list(list(np.where(df_partition == median))[0])
            print("partition",partition)
            left_partition = [partition[i] for i in lhs_v]
            print("left_partition",left_partition)
            right_partition = [partition[i] for i in rhs_v]
            print("right_partition",right_partition)
            median_p = [partition[i] for i in median_v]
            print("median_p",median_p)
            difference = len(left_partition)-len(right_partition)
            print("diff",difference)
            if difference<0:
              lhs_m = np.random.choice(median_p, size=np.abs(difference), replace=False)
              print("left median",lhs_m)
              median_obt = [i for i in median_p if i not in lhs_m]
              print("median obtained",median_obt)
              left_partition = np.concatenate((left_partition,lhs_m))
              print("left_partition",left_partition)
            else: 
              rhs_m = np.random.choice(median_p, size=np.abs(difference), replace=False)
              print("right median",rhs_m)
              median_obt = [i for i in median_p if i not in rhs_m]
              print("median obtained",median_obt)
              right_partition = np.concatenate((right_partition,rhs_m))
              print("right_partition",right_partition)
            
            left_m = np.random.choice(median_obt, size=(len(median_obt)//2), replace=False) 
            print("lhs median",left_m)
            right_m = [i for i in median_obt if i not in left_m]
            print("rhs median",right_m)
            lhs = np.concatenate((left_partition,left_m))
            print("lhs",lhs)
            rhs = np.concatenate((right_partition,right_m))
            print("rhs",rhs)
            
            for i in lhs:
              x.append(int(i))

            for j in rhs:
              y.append(int(j))

           

         
      return x,y        

  
  def partitioning(self,dimensions, k, mode):

        print("*********************** part function")
        parts = []
        
        current_part = [[x for x in range(len(self.df))]] 
        print("working part", current_part)
        while len(current_part) > 0: 
          
          part = current_part[0] 
          print("remove first element", current_part[0])
          current_part = current_part[1:] 
          print("wp after removing first part", current_part)

          if len(part) < 2*k:
            parts.append(part) 
            print("<2k", parts)
          
          else:
            print(">2k",2*k)
            print("going to span func")
            print("dimensions",dimensions)
            range_span = self.span(dimensions, part) 
            ordered_span_cols = sorted(range_span.items(), key=lambda x:x[1], reverse=True) 
            print("\nordered_span_cols",ordered_span_cols)
            
            for col, _ in ordered_span_cols: 
              print("len before split",len(part))
              print("going to split function")
              lhs, rhs = self.split( col, part, mode) 
              print("lhs",lhs)
              print("len lhs",len(lhs))
              print("rhs",rhs)
              print("len rhs",len(rhs))
              if len(lhs) >= k and len(rhs) >= k: 
                  current_part.append(lhs) 
                  current_part.append(rhs) 
                  print("current_part",current_part)
                  break 
            
            else: 

              parts.append(part) 

        return parts 



  def transform(self,dim,partition,target,mode='range'):
    transformed_df=[]
    
    
    print("partition",partition,"dim",dim,"target",target)
    for i,j in tqdm(enumerate(partition)): #i is the timestamp for partition
        agg_partition=[]
        print("i",i,"j",j)
        partition=self.df[j]
        print("self.df[j]",self.df[j])
        
        for col in dim:
            print("col",col)
            if col in self.categorical_indices:
                val= list(np.unique(partition[:,col]))
                agg_partition.append(','.join(val))
                print("agg_partition",agg_partition)
            else:
            
                if mode=='range':
                    col_min=np.min(partition[:,col])
                    col_max=np.max(partition[:,col])
                    print("col_min",col_min,"col_max",col_max)
                    if col_min== col_max:
                        agg_partition.append(col_min)
                        print("agg_partition1",agg_partition)
                    else:
                        agg_partition.append('{}-{}'.format(col_min,col_max))
                        print("agg_partition2",agg_partition)
                elif mode=='mean':
                        agg_partition.append(np.mean(partition[:,col]))
                        print("agg_partition3",agg_partition)
                        
        for k in range(len(j)): # j length of each cluster
            print("k",k)
            print("target",target)
            print(" self.df[j[k],target]", self.df[j[k],target])
            target_val = self.df[j[k],target][0]
            print("target_val",target_val)
            transformed_df.append([int(j[k])]+agg_partition+[target_val])
            print("transformed_df",transformed_df)

    
    trans = pd.DataFrame(transformed_df)
    print("trans\n",trans)
    df_mod = trans.sort_values(trans.columns[0]) 
    print("df_mod",df_mod)
    df_mod = df_mod.iloc[:,1:] 
    print("df_mod",df_mod)
    return np.array(df_mod) 
      
#Discernability Metric
#based on how many tuples are indistinguishable from it
  def discernability(self,partition,k):
    total_sum=0.0
    partition_len= len(partition)
    print("discernability")
    #print(partition_len)
    for i in range(partition_len):
      #print("total_sum", total_sum)
      #print("len(partition[i])",len(partition[i]))
      #print("len(partition[i])**20",len(partition[i])**2)
      total_sum = total_sum+ len(partition[i])**2
      #print(total_sum)
      
      optimal_d = [k*len(self.df)] 
      print("optimal",optimal_d)

      worst_d = [2*k*len(self.df)]
      print("worst",worst_d)
    return total_sum
    
#Average equivalence class metric
  def avg_equi_class(self,n_part,total_tuples,k):
    print("Average equivalence class metric")
    val= (total_tuples/n_part)/(k*1.0)

    return val

In [None]:
QI_col = ['Age','Gender', 'Family_History', 'Employee_Numbers',
                                                           'Remote_Work', 'Tech_Company', 'Benefits', 'Care_Options',
                                                           'Wellness_Program', 'Seek_Help', 'Anonymity',
                                                           'Medical_Leave', 'Mental_Health_Consequence',
                                                           'Physical_Health_Consequence', 'Coworkers', 'Supervisor',
                                                           'Mental_Health_Interview', 'Physical_Health_Interview',
                                                           'Mental_VS_Physical', 'Observed_Consequence','Treatment','Self_Employed']
QI = {k: v for k, v in column_dict.items() if v in QI_col} 
QI_index = [k for k, v in column_dict.items() if v in QI_col] 

target_col = ['Work_Interfere']
target_attr = {k: v for k, v in column_dict.items() if v in target_col} 
target_attr_idx = [k for k, v in column_dict.items() if v in target_col] 

In [None]:
len_df= len(df1)
k_list=[5]
df_list=list()
cluster_len_list= list()
new_df_list=list()
discernability_list=list()
avg_equi_class_list= list()
df2=pd.DataFrame()
for k in k_list:
  print("*********************************************************************************************************")
  clusters= ""
  k_val= K_anonymity(df1,column_dict,k,categorical_dict,categorical_indices)
  clusters=k_val.partitioning(QI_index,k, 'relaxed')
  df_list.append(k_val.transform(QI_index,clusters, target_attr_idx))
  cluster_len_list.append(len(clusters))
  discernability_list.append(k_val.discernability(clusters,k))
  avg_equi_class_list.append(k_val.avg_equi_class(len_df,len(clusters),k))

for data in df_list:
  
  new_df_list.append(pd.DataFrame(data))
  df2= pd.DataFrame(data)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
len lhs 0
rhs []
len rhs 0
len before split 15
going to split function
***************** split function

partition [111, 182, 362, 446, 583, 716, 752, 400, 607, 830, 596, 680, 419, 727, 627] 
mode relaxed 
dim 19
['No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'Maybe' 'No' 'No'
 'No']

unique values per column ['Maybe', 'No'] 2
categ
lhs []
len lhs 0
rhs []
len rhs 0
len before split 15
going to split function
***************** split function

partition [111, 182, 362, 446, 583, 716, 752, 400, 607, 830, 596, 680, 419, 727, 627] 
mode relaxed 
dim 20
['No' 'Maybe' 'Maybe' 'No' 'No' 'Maybe' 'Maybe' 'No' 'Maybe' 'Maybe' 'No'
 'Maybe' 'No' 'No' 'No']

unique values per column ['Maybe', 'No'] 2
categ
lhs []
len lhs 0
rhs []
len rhs 0
len before split 15
going to split function
***************** split function

partition [111, 182, 362, 446, 583, 716, 752, 400, 607, 830, 596, 680, 419, 727, 627] 
mode relaxed 
dim 22
['

0it [00:00, ?it/s]

i 0 j [38, 95, 219, 223, 377, 764, 849]
self.df[j] [[18 'Male' 'No' 'No' 'No' 1 '6-25' 'No' 'Yes' 'No' 'Not sure' 'No' 'No'
  "Don't know" 'Somewhat difficult' 'Yes' 'Maybe' 'No' 'Some of them'
  'No' 'No' 'No' 'No']
 [18 'Male' 'No' 'No' 'Yes' 1 '1-5' 'Yes' 'Yes' 'No' 'No' 'No' 'No' 'Yes'
  'Very easy' 'No' 'No' 'Some of them' 'No' 'No' 'No' "Don't know" 'No']
 [18 'Male' 'No' 'No' 'Yes' 1 '6-25' 'No' 'Yes' "Don't know" 'No' 'No'
  "Don't know" "Don't know" "Don't know" 'Yes' 'No' 'No' 'No' 'No'
  'Maybe' "Don't know" 'No']
 [18 'Male' 'No' 'No' 'No' 0 '26-100' 'No' 'Yes' "Don't know" 'Not sure'
  "Don't know" "Don't know" "Don't know" 'Very easy' 'No' 'No'
  'Some of them' 'Some of them' 'No' 'Maybe' 'Yes' 'No']
 [18 'Female' 'No' 'Yes' 'Yes' 1 '1-5' 'Yes' 'Yes' "Don't know"
  'Not sure' "Don't know" "Don't know" "Don't know" "Don't know" 'Maybe'
  'Maybe' 'Some of them' 'Some of them' 'Maybe' 'Maybe' "Don't know" 'No']
 [8 'Other' 'Yes' 'Yes' 'Yes' 1 '1-5' 'Yes' 'Yes' 'Yes' 'Yes' 'Y

4it [00:00, 12.53it/s]

[5]
 self.df[j[k],target] [0]
target_val 0
transformed_df [[38, '8-18', 'Female,Male,Other', 'No,Yes', 'No,Yes', 'No,Yes', '1-5,26-100,6-25', 'No,Yes', 'Yes', "Don't know,No,Yes", 'No,Not sure,Yes', "Don't know,No,Yes", "Don't know,No,Yes", "Don't know,Yes", "Don't know,Somewhat difficult,Very easy", 'Maybe,No,Yes', 'Maybe,No,Yes', 'No,Some of them,Yes', 'No,Some of them,Yes', 'Maybe,No,Yes', 'Maybe,No,Yes', "Don't know,No,Yes", 'No,Yes', 1], [95, '8-18', 'Female,Male,Other', 'No,Yes', 'No,Yes', 'No,Yes', '1-5,26-100,6-25', 'No,Yes', 'Yes', "Don't know,No,Yes", 'No,Not sure,Yes', "Don't know,No,Yes", "Don't know,No,Yes", "Don't know,Yes", "Don't know,Somewhat difficult,Very easy", 'Maybe,No,Yes', 'Maybe,No,Yes', 'No,Some of them,Yes', 'No,Some of them,Yes', 'Maybe,No,Yes', 'Maybe,No,Yes', "Don't know,No,Yes", 'No,Yes', 1], [219, '8-18', 'Female,Male,Other', 'No,Yes', 'No,Yes', 'No,Yes', '1-5,26-100,6-25', 'No,Yes', 'Yes', "Don't know,No,Yes", 'No,Not sure,Yes', "Don't know,No,Yes", "Do

6it [00:00, 13.99it/s]

 19
agg_partition ['21-22', 'Female,Male', 'No,Yes', 'No,Yes', 'No,Yes', '6-25,More than 1000', 'No,Yes', 'No,Yes', "Don't know,No", 'No,Not sure', "Don't know,No,Yes", "Don't know,No,Yes", "Don't know,No", "Don't know,Somewhat easy,Very difficult,Very easy", 'Maybe,No,Yes', 'Maybe,No,Yes', 'No,Some of them', 'No,Some of them,Yes', 'No']
col 20
agg_partition ['21-22', 'Female,Male', 'No,Yes', 'No,Yes', 'No,Yes', '6-25,More than 1000', 'No,Yes', 'No,Yes', "Don't know,No", 'No,Not sure', "Don't know,No,Yes", "Don't know,No,Yes", "Don't know,No", "Don't know,Somewhat easy,Very difficult,Very easy", 'Maybe,No,Yes', 'Maybe,No,Yes', 'No,Some of them', 'No,Some of them,Yes', 'No', 'Maybe,No,Yes']
col 21
agg_partition ['21-22', 'Female,Male', 'No,Yes', 'No,Yes', 'No,Yes', '6-25,More than 1000', 'No,Yes', 'No,Yes', "Don't know,No", 'No,Not sure', "Don't know,No,Yes", "Don't know,No,Yes", "Don't know,No", "Don't know,Somewhat easy,Very difficult,Very easy", 'Maybe,No,Yes', 'Maybe,No,Yes', 'No,So

8it [00:00, 12.17it/s]

col 21
agg_partition [23, 'Female,Male', 'No,Yes', 'No,Yes', 'No,Yes', '1-5,100-500,26-100,6-25,More than 1000', 'No', 'No,Yes', "Don't know,No,Yes", 'No,Not sure,Yes', 'No,Yes', "Don't know,No,Yes", "Don't know", "Don't know,Somewhat easy,Very difficult,Very easy", 'Maybe,No,Yes', 'Maybe,No', 'No,Some of them,Yes', 'No,Some of them,Yes', 'Maybe,No', 'Maybe,No', "Don't know,No,Yes"]
col 22
agg_partition [23, 'Female,Male', 'No,Yes', 'No,Yes', 'No,Yes', '1-5,100-500,26-100,6-25,More than 1000', 'No', 'No,Yes', "Don't know,No,Yes", 'No,Not sure,Yes', 'No,Yes', "Don't know,No,Yes", "Don't know", "Don't know,Somewhat easy,Very difficult,Very easy", 'Maybe,No,Yes', 'Maybe,No', 'No,Some of them,Yes', 'No,Some of them,Yes', 'Maybe,No', 'Maybe,No', "Don't know,No,Yes", 'No,Yes']
k 0
target [5]
 self.df[j[k],target] [0]
target_val 0
transformed_df [[38, '8-18', 'Female,Male,Other', 'No,Yes', 'No,Yes', 'No,Yes', '1-5,26-100,6-25', 'No,Yes', 'Yes', "Don't know,No,Yes", 'No,Not sure,Yes', "Don't k

11it [00:00, 12.81it/s]


col 6
agg_partition [23, 'Female,Male', 'No', 'No,Yes', 'No,Yes', '100-500,26-100,6-25,More than 1000']
col 7
agg_partition [23, 'Female,Male', 'No', 'No,Yes', 'No,Yes', '100-500,26-100,6-25,More than 1000', 'No']
col 8
agg_partition [23, 'Female,Male', 'No', 'No,Yes', 'No,Yes', '100-500,26-100,6-25,More than 1000', 'No', 'No,Yes']
col 9
agg_partition [23, 'Female,Male', 'No', 'No,Yes', 'No,Yes', '100-500,26-100,6-25,More than 1000', 'No', 'No,Yes', "Don't know,No,Yes"]
col 10
agg_partition [23, 'Female,Male', 'No', 'No,Yes', 'No,Yes', '100-500,26-100,6-25,More than 1000', 'No', 'No,Yes', "Don't know,No,Yes", 'No,Not sure,Yes']
col 11
agg_partition [23, 'Female,Male', 'No', 'No,Yes', 'No,Yes', '100-500,26-100,6-25,More than 1000', 'No', 'No,Yes', "Don't know,No,Yes", 'No,Not sure,Yes', "Don't know,No,Yes"]
col 12
agg_partition [23, 'Female,Male', 'No', 'No,Yes', 'No,Yes', '100-500,26-100,6-25,More than 1000', 'No', 'No,Yes', "Don't know,No,Yes", 'No,Not sure,Yes', "Don't know,No,Yes",

14it [00:01, 11.35it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

32it [00:02, 10.94it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

34it [00:03,  8.98it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 

 [1]
target_val 1
transformed_df [[38, '8-18', 'Female,Male,Other', 'No,Yes', 'No,Yes', 'No,Yes', '1-5,26-100,6-25', 'No,Yes', 'Yes', "Don't know,No,Yes", 'No,Not sure,Yes', "Don't know,No,Yes", "Don't know,No,Yes", "Don't know,Yes", "Don't know,Somewhat difficult,Very easy", 'Maybe,No,Yes', 'Maybe,No,Yes', 'No,Some of them,Yes', 'No,Some of them,Yes', 'Maybe,No,Yes', 'Maybe,No,Yes', "Don't know,No,Yes", 'No,Yes', 1], [95, '8-18', 'Female,Male,Other', 'No,Yes', 'No,Yes', 'No,Yes', '1-5,26-100,6-25', 'No,Yes', 'Yes', "Don't know,No,Yes", 'No,Not sure,Yes', "Don't know,No,Yes", "Don't know,No,Yes", "Don't know,Yes", "Don't know,Somewhat difficult,Very easy", 'Maybe,No,Yes', 'Maybe,No,Yes', 'No,Some of them,Yes', 'No,Some of them,Yes', 'Maybe,No,Yes', 'Maybe,No,Yes', "Don't know,No,Yes", 'No,Yes', 1], [219, '8-18', 'Female,Male,Other', 'No,Yes', 'No,Yes', 'No,Yes', '1-5,26-100,6-25', 'No,Yes', 'Yes', "Don't know,No,Yes", 'No,Not sure,Yes', "Don't know,No,Yes", "Don't know,No,Yes", "Don't 

51it [00:04,  8.82it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



agg_partition [32, 'Female,Male,Other', 'No', 'No,Yes', 'No,Yes', '100-500,26-100,6-25,More than 1000']
col 7
agg_partition [32, 'Female,Male,Other', 'No', 'No,Yes', 'No,Yes', '100-500,26-100,6-25,More than 1000', 'No,Yes']
col 8
agg_partition [32, 'Female,Male,Other', 'No', 'No,Yes', 'No,Yes', '100-500,26-100,6-25,More than 1000', 'No,Yes', 'Yes']
col 9
agg_partition [32, 'Female,Male,Other', 'No', 'No,Yes', 'No,Yes', '100-500,26-100,6-25,More than 1000', 'No,Yes', 'Yes', 'No,Yes']
col 10
agg_partition [32, 'Female,Male,Other', 'No', 'No,Yes', 'No,Yes', '100-500,26-100,6-25,More than 1000', 'No,Yes', 'Yes', 'No,Yes', 'No,Not sure,Yes']
col 11
agg_partition [32, 'Female,Male,Other', 'No', 'No,Yes', 'No,Yes', '100-500,26-100,6-25,More than 1000', 'No,Yes', 'Yes', 'No,Yes', 'No,Not sure,Yes', "Don't know,No"]
col 12
agg_partition [32, 'Female,Male,Other', 'No', 'No,Yes', 'No,Yes', '100-500,26-100,6-25,More than 1000', 'No,Yes', 'Yes', 'No,Yes', 'No,Not sure,Yes', "Don't know,No", "Don't 

74it [00:07,  7.53it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

78it [00:08,  8.64it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

79it [00:08,  6.17it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 


k 3
target [5]
 self.df[j[k],target] [1]
target_val 1
transformed_df [[38, '8-18', 'Female,Male,Other', 'No,Yes', 'No,Yes', 'No,Yes', '1-5,26-100,6-25', 'No,Yes', 'Yes', "Don't know,No,Yes", 'No,Not sure,Yes', "Don't know,No,Yes", "Don't know,No,Yes", "Don't know,Yes", "Don't know,Somewhat difficult,Very easy", 'Maybe,No,Yes', 'Maybe,No,Yes', 'No,Some of them,Yes', 'No,Some of them,Yes', 'Maybe,No,Yes', 'Maybe,No,Yes', "Don't know,No,Yes", 'No,Yes', 1], [95, '8-18', 'Female,Male,Other', 'No,Yes', 'No,Yes', 'No,Yes', '1-5,26-100,6-25', 'No,Yes', 'Yes', "Don't know,No,Yes", 'No,Not sure,Yes', "Don't know,No,Yes", "Don't know,No,Yes", "Don't know,Yes", "Don't know,Somewhat difficult,Very easy", 'Maybe,No,Yes', 'Maybe,No,Yes', 'No,Some of them,Yes', 'No,Some of them,Yes', 'Maybe,No,Yes', 'Maybe,No,Yes', "Don't know,No,Yes", 'No,Yes', 1], [219, '8-18', 'Female,Male,Other', 'No,Yes', 'No,Yes', 'No,Yes', '1-5,26-100,6-25', 'No,Yes', 'Yes', "Don't know,No,Yes", 'No,Not sure,Yes', "Don't know,

117it [00:15,  5.55it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

118it [00:15,  4.18it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

120it [00:16,  5.79it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.

worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal [4955]
worst [9910]
optimal

In [None]:
for i in new_df_list:

  i.columns=['Age','Gender','Self_Employed', 'Family_History','Treatment', 'Employee_Numbers',
                                                           'Remote_Work', 'Tech_Company', 'Benefits', 'Care_Options',
                                                           'Wellness_Program', 'Seek_Help', 'Anonymity',
                                                           'Medical_Leave', 'Mental_Health_Consequence',
                                                           'Physical_Health_Consequence', 'Coworkers', 'Supervisor',
                                                           'Mental_Health_Interview', 'Physical_Health_Interview',
                                                           'Mental_VS_Physical', 'Observed_Consequence','Work_Interfere']

In [None]:
new_df_list

[       Age             Gender Self_Employed Family_History Treatment  \
 0       37        Female,Male        No,Yes         No,Yes    No,Yes   
 1    44-45               Male        No,Yes         No,Yes    No,Yes   
 2       32        Female,Male        No,Yes         No,Yes    No,Yes   
 3       31        Female,Male        No,Yes         No,Yes    No,Yes   
 4    30-31         Male,Other        No,Yes         No,Yes    No,Yes   
 ..     ...                ...           ...            ...       ...   
 986     29        Female,Male            No         No,Yes    No,Yes   
 987  35-36               Male        No,Yes         No,Yes    No,Yes   
 988     32        Female,Male        No,Yes         No,Yes    No,Yes   
 989  33-34        Female,Male        No,Yes         No,Yes    No,Yes   
 990     25  Female,Male,Other        No,Yes         No,Yes    No,Yes   
 
                                     Employee_Numbers Remote_Work Tech_Company  \
 0    1-5,100-500,26-100,500-1000,6-25,M

In [None]:
cluster_len_list

[128]

In [None]:
discernability_list

[7697.0]

In [None]:
avg_equi_class_list

[0.025832492431886983]

In [None]:
def model_evaluation1(model, metric):
    model_cv = cross_val_score(model, X_train1, y_train1, cv = StratifiedKFold(n_splits = 5), scoring = metric)
    return model_cv

In [None]:
df2.columns=['Age','Gender','Self_Employed', 'Family_History','Treatment', 'Employee_Numbers',
                                                           'Remote_Work', 'Tech_Company', 'Benefits', 'Care_Options',
                                                           'Wellness_Program', 'Seek_Help', 'Anonymity',
                                                           'Medical_Leave', 'Mental_Health_Consequence',
                                                           'Physical_Health_Consequence', 'Coworkers', 'Supervisor',
                                                           'Mental_Health_Interview', 'Physical_Health_Interview',
                                                           'Mental_VS_Physical', 'Observed_Consequence','Work_Interfere']

In [None]:
df2.head()

Unnamed: 0,Age,Gender,Self_Employed,Family_History,Treatment,Employee_Numbers,Remote_Work,Tech_Company,Benefits,Care_Options,...,Medical_Leave,Mental_Health_Consequence,Physical_Health_Consequence,Coworkers,Supervisor,Mental_Health_Interview,Physical_Health_Interview,Mental_VS_Physical,Observed_Consequence,Work_Interfere
0,37,"Female,Male","No,Yes","No,Yes","No,Yes","1-5,100-500,26-100,500-1000,6-25,More than 1000","No,Yes","No,Yes","Don't know,No,Yes","No,Not sure,Yes",...,"Don't know,Somewhat easy,Very easy","Maybe,No,Yes","Maybe,No,Yes","No,Some of them,Yes","No,Some of them,Yes",No,"Maybe,No,Yes","Don't know,No,Yes","No,Yes",1
1,44-45,Male,"No,Yes","No,Yes","No,Yes","1-5,100-500,26-100,6-25,More than 1000","No,Yes","No,Yes","Don't know,No,Yes","No,Not sure,Yes",...,"Don't know,Very difficult","Maybe,No,Yes","Maybe,No","No,Some of them,Yes","No,Some of them",No,"Maybe,No","Don't know,No","No,Yes",1
2,32,"Female,Male","No,Yes","No,Yes","No,Yes","1-5,100-500,26-100,500-1000,6-25","No,Yes",Yes,"Don't know,No,Yes","No,Not sure,Yes",...,"Don't know,Somewhat difficult,Very difficult,V...","Maybe,No,Yes","Maybe,No","No,Some of them,Yes","No,Some of them,Yes","Maybe,No,Yes","Maybe,No,Yes","Don't know,No,Yes",No,1
3,31,"Female,Male","No,Yes","No,Yes","No,Yes","1-5,100-500,26-100,6-25,More than 1000","No,Yes","No,Yes","No,Yes","No,Not sure,Yes",...,"Don't know,Somewhat difficult,Somewhat easy,Ve...","Maybe,No,Yes","Maybe,No,Yes","No,Some of them,Yes","No,Some of them,Yes","Maybe,No,Yes","Maybe,No,Yes","Don't know,No,Yes","No,Yes",1
4,30-31,"Male,Other","No,Yes","No,Yes","No,Yes","100-500,26-100,6-25,More than 1000","No,Yes",Yes,"Don't know,No,Yes","No,Not sure,Yes",...,"Don't know,Somewhat difficult,Somewhat easy,Ve...","Maybe,No,Yes","Maybe,No","No,Some of them,Yes","No,Some of them,Yes","Maybe,No,Yes","Maybe,No,Yes","Don't know,No,Yes",No,0


In [None]:
transformer1 = ColumnTransformer([
    ('one hot', OneHotEncoder(handle_unknown = 'ignore'), ['Age','Gender', 'Family_History', 'Employee_Numbers',
                                                           'Remote_Work', 'Tech_Company', 'Benefits', 'Care_Options',
                                                           'Wellness_Program', 'Seek_Help', 'Anonymity',
                                                           'Medical_Leave', 'Mental_Health_Consequence',
                                                           'Physical_Health_Consequence', 'Coworkers', 'Supervisor',
                                                           'Mental_Health_Interview', 'Physical_Health_Interview',
                                                           'Mental_VS_Physical', 'Observed_Consequence','Treatment','Self_Employed'])],  remainder = 'passthrough',sparse_threshold=0)



In [None]:
df2.dtypes

Age                            object
Gender                         object
Self_Employed                  object
Family_History                 object
Treatment                      object
Employee_Numbers               object
Remote_Work                    object
Tech_Company                   object
Benefits                       object
Care_Options                   object
Wellness_Program               object
Seek_Help                      object
Anonymity                      object
Medical_Leave                  object
Mental_Health_Consequence      object
Physical_Health_Consequence    object
Coworkers                      object
Supervisor                     object
Mental_Health_Interview        object
Physical_Health_Interview      object
Mental_VS_Physical             object
Observed_Consequence           object
Work_Interfere                 object
dtype: object

In [None]:
for e in df2.columns:
    df2[e]=df2[e].astype(str)

df2["Work_Interfere"] = pd.to_numeric(df2["Work_Interfere"])

In [None]:
X1 = df2.drop('Work_Interfere', axis = 1)
y1 = df2['Work_Interfere']

In [None]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1,y1,stratify = y1,test_size = 0.3,random_state = 2222)
tree1 = DecisionTreeClassifier(random_state = 2222)
nb1 = naive_bayes.GaussianNB()
knn1 = KNeighborsClassifier()


knn_pipe1 = Pipeline([('transformer', transformer1), ('knn', knn1)])
tree_pipe1 = Pipeline([('transformer', transformer1), ('tree', tree1)])
nb_pipe1 = Pipeline([('transformer', transformer1), ('nb', nb1)])



nb_cv1= model_evaluation1(nb_pipe1, 'recall')
knn_pipe_cv1 = model_evaluation1(knn_pipe1, 'recall')
tree_pipe_cv1 = model_evaluation1(tree_pipe1, 'recall')

for model in [ tree_pipe1,nb_pipe1,knn_pipe1]:
    model.fit(X_train1, y_train1)



score_cv1 = [ tree_pipe_cv1.round(10),nb_cv1.round(10),knn_pipe_cv1.round(10)]
score_mean1 = [ tree_pipe_cv1.mean(),nb_cv1.mean(),knn_pipe_cv1.mean()]
score_std1 = [ tree_pipe_cv1.std(),nb_cv1.std(),knn_pipe_cv1.std()]
score_recall_score1 = [recall_score(y_test1, tree_pipe1.predict(X_test1),pos_label='positive',average='micro'),recall_score(y_test1, nb_pipe1.predict(X_test1),pos_label='positive',average='micro'),  recall_score(y_test1, knn_pipe1.predict(X_test1))]
method_name1 = [ 'Decision Tree Classifier', 'Naive Bayes Classifier','KNN Classifier']
cv_summary1 = pd.DataFrame({
    'method': method_name1,
    'cv score': score_cv1,
    'mean score': score_mean1,
    'std score': score_std1,
    'recall score': score_recall_score1
})


cv_summary1

Unnamed: 0,method,cv score,mean score,std score,recall score
0,Decision Tree Classifier,"[0.7981651376, 0.8348623853, 0.7889908257, 0.7...",0.814407,0.024471,0.711409
1,Naive Bayes Classifier,"[0.1834862385, 0.119266055, 0.1009174312, 0.14...",0.130462,0.031289,0.278523
2,KNN Classifier,"[0.8899082569, 0.9266055046, 0.9357798165, 0.8...",0.915477,0.021128,0.970085


In [None]:
y1.unique()

array([1, 0])