In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import matplotlib
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score, plot_roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn import naive_bayes
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline

In [None]:
col_names = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship',
         'race','gender','capital-gain','capital-loss','hours-per-week','native-country','income']
df1 = pd.read_csv("https://github.com/Anjana-premkumar/K_anonymity/blob/1fe78e94edf002d530bc2138280058bda88054b6/adult.all%20(1).txt?raw=true", sep=",", header=None, names=col_names, index_col=False, engine='python')
df1['income']=df1['income'].replace(['<=50k','>50k'],[0,1])
df1.drop(columns=['fnlwgt', 'education-num', 'capital-gain', 'capital-loss'], inplace = True)
df1=df1.head(2000)
df1 = df1[df1['native-country'].notna()]
df1 = df1[df1['workclass'] != '-1']
df1 = df1[df1['occupation'] != '-1']
df1 = df1[df1['native-country'] != '-1']


In [None]:
df1.dtypes

age                int64
workclass         object
education         object
marital-status    object
occupation        object
relationship      object
race              object
gender            object
hours-per-week     int64
native-country    object
income             int64
dtype: object

In [None]:

column_names=['age','workclass','education','marital-status','occupation','relationship',
         'race','gender','hours-per-week','native-country','income']
categorical=['workclass','education','marital-status','occupation','relationship',
         'race','gender','native-country','income']
numeric=["age","hours-per-week",'income']
#encapsulating columns with indices in a dictionary
value = zip([x for x in range(11)], column_names)
column_dict = dict(value)


categorical_dict = {k: v for k, v in column_dict.items() if v in categorical}
categorical_indices = [k for k, v in categorical_dict.items()]

#Find the number of tuples that have the same attribute values
m = df1.groupby(df1.columns.tolist(),as_index=False).size()['size'].max() 
print(m)



3


In [None]:
def model_evaluation(model, metric):
    model_cv = cross_val_score(model, X_train, y_train, cv = StratifiedKFold(n_splits = 5), scoring = metric)
    return model_cv

In [None]:


transformer = ColumnTransformer([
    ('one hot', OneHotEncoder(handle_unknown = 'ignore'), ['workclass', 'education', 'marital-status',
                                                           'occupation', 'relationship', 'race', 'gender','native-country','age','hours-per-week'])
            ],  remainder = 'passthrough',sparse_threshold=0)



X = df1.drop('income', axis = 1)
y = df1['income']


X_train, X_test, y_train, y_test = train_test_split(X,y,stratify = y,test_size = 0.3,random_state = 2222)
tree = DecisionTreeClassifier(random_state = 2222)
nb = naive_bayes.GaussianNB()
knn = KNeighborsClassifier()


knn_pipe = Pipeline([('transformer', transformer), ('knn', knn)])
tree_pipe = Pipeline([('transformer', transformer), ('tree', tree)])
nb_pipe = Pipeline([('transformer', transformer), ('nb', nb)])
tree_pipe_cv = model_evaluation(tree_pipe, 'recall')
nb_cv= model_evaluation(nb_pipe, 'recall')
knn_pipe_cv = model_evaluation(knn_pipe, 'recall')


for model in [ tree_pipe,nb_pipe,knn_pipe]:
    model.fit(X_train, y_train)

score_cv = [ tree_pipe_cv.round(10),nb_cv.round(10),knn_pipe_cv.round(10)]
score_mean = [ tree_pipe_cv.mean(),nb_cv.mean(),knn_pipe_cv.mean()]
score_std = [ tree_pipe_cv.std(),nb_cv.std(),knn_pipe_cv.std()]
score_recall_score = [recall_score(y_test, tree_pipe.predict(X_test),pos_label='positive',average='micro'),recall_score(y_test, nb_pipe.predict(X_test),pos_label='positive',average='micro'),  recall_score(y_test, knn_pipe.predict(X_test))]
method_name = [ 'Decision Tree Classifier', 'Naive Bayes Classifier','KNN Classifier']
cv_summary = pd.DataFrame({
    'method': method_name,
    'cv score': score_cv,
    'mean score': score_mean,
    'std score': score_std,
    'recall score': score_recall_score
})

In [None]:
len(X_test)

553

In [None]:
y_test

1666    1
768     0
467     0
191     0
1292    0
       ..
1327    1
1014    0
440     0
163     0
155     0
Name: income, Length: 553, dtype: int64

In [None]:
cv_summary

Unnamed: 0,method,cv score,mean score,std score,recall score
0,Decision Tree Classifier,"[0.5303030303, 0.4242424242, 0.5223880597, 0.5...",0.484758,0.044104,0.766727
1,Naive Bayes Classifier,"[0.8636363636, 0.9393939394, 0.9104477612, 0.9...",0.921574,0.035113,0.464738
2,KNN Classifier,"[0.5151515152, 0.4545454545, 0.5373134328, 0.5...",0.50588,0.028342,0.524476


In [None]:
# here each tuple is unique and can be identifable
df1 = np.array(df1)

In [None]:
from tqdm import tqdm

class K_anonymity:


  def __init__(self ,df, column_dict,k_val,categorical_dict,categorical_indices):
    self.column_dict=column_dict
    self.k= k_val
    self.categorical_dict=categorical_dict
    self.df= df
    self.categorical_indices=categorical_indices


  def span(self,dim,partition):

    print("*********************** span function")
    
    print("dimension",dim)
    print("categorical", self.categorical_indices)
    total_span= dict()

    for each_col_in_list in dim:
      df_p=self.df[partition,each_col_in_list]
      print("\ncolumn",each_col_in_list,"\ndfp",df_p)
      if each_col_in_list in  self.categorical_indices:
        column_span= len(np.unique(df_p))
      else:
        column_span=round(np.max(df_p)-np.min(df_p),2)
      total_span[each_col_in_list]= column_span
      print("total spans", total_span)

    return total_span


  def split(self, dim, partition, mode):
    
      print("***************** split function")
      print("\npartition",partition, "\nmode", mode, "\ndim",dim)
      df_partition= self.df[partition, dim]
      print(df_partition)
      unique_col_val= list(np.unique(df_partition))
      print("\nunique values per column",unique_col_val,len(unique_col_val))
      len_unique_col_val= len(unique_col_val)
      
      
      lhs_list=[]
      rhs_list=[]
      lhs=[]
      rhs=[]
      x=[]
      y=[]
      
      if mode=='strict':
        if dim in  self.categorical_indices:
          left_value= unique_col_val[:len_unique_col_val//2]
          right_value= unique_col_val[len_unique_col_val//2:]
          left_side= list(list(np.where(np.isin(df_partition,left_value)))[0])
          right_side= list(list(np.where(np.isin(df_partition,right_value)))[0])
          for i in left_side:
            lhs_list.append(partition[i])
          for j in right_side:
            rhs_list.append(partition[j])
        else: #numeric
            median= np.median(df_partition)
            print("median",median)
            print("lhs_v",list(list(np.where(df_partition < median))[0]))
            print("rhs_v",list(list(np.where(df_partition >= median))[0]))
            lhs_v = list(list(np.where(df_partition < median))[0])
            rhs_v = list(list(np.where(df_partition >= median))[0])
            print("partition",partition)
            lhs = [partition[i] for i in lhs_v]
            rhs = [partition[i] for i in rhs_v]
            
      elif mode=='relaxed':
        if dim in  self.categorical_indices:
          print("categ")
          left_value= unique_col_val[:len_unique_col_val//2]
          right_value= unique_col_val[len_unique_col_val//2:]
          left_side= list(list(np.where(np.isin(df_partition,left_value)))[0])
          right_side= list(list(np.where(np.isin(df_partition,right_value)))[0])

          for i in left_side:
            lhs_list.append(partition[i])
          for j in right_side:
            rhs_list.append(partition[j])
          difference= len(lhs_list)-len(rhs_list)
          if difference==0:
            pass
          elif difference<0: #rhs to lhs , rhs has more
            lhs1= rhs_list[:(np.abs(difference)//2)]
            rhs_list=rhs_list[(np.abs(difference)//2):] 
            lhs_list=np.concatenate((lhs_list,lhs1))

          else: #lhs to rhs, lhs is more
            rhs1= lhs_list[-(difference//2):]
            lhs= lhs_list[:-(difference//2)]
            rhs= np.concatenate((rhs_list,rhs1))
            
        else: #numeric
            median= np.median(df_partition)
            print("numeric")
            print("median",median)
            print("first list lehs_v",list(np.where(df_partition < median)))
            print("lhs_v",list(list(np.where(df_partition < median))[0]))
            print("rhs_v",list(list(np.where(df_partition >= median))[0]))
            print("median_v",list(list(np.where(df_partition == median))[0]))
             
            lhs_v = list(list(np.where(df_partition < median))[0])
            rhs_v = list(list(np.where(df_partition > median))[0])
            median_v = list(list(np.where(df_partition == median))[0])
            print("partition",partition)
            left_partition = [partition[i] for i in lhs_v]
            print("left_partition",left_partition)
            right_partition = [partition[i] for i in rhs_v]
            print("right_partition",right_partition)
            median_p = [partition[i] for i in median_v]
            print("median_p",median_p)
            difference = len(left_partition)-len(right_partition)
            print("diff",difference)
            if difference<0:
              lhs_m = np.random.choice(median_p, size=np.abs(difference), replace=False)
              print("left median",lhs_m)
              median_obt = [i for i in median_p if i not in lhs_m]
              print("median obtained",median_obt)
              left_partition = np.concatenate((left_partition,lhs_m))
              print("left_partition",left_partition)
            else: 
              rhs_m = np.random.choice(median_p, size=np.abs(difference), replace=False)
              print("right median",rhs_m)
              median_obt = [i for i in median_p if i not in rhs_m]
              print("median obtained",median_obt)
              right_partition = np.concatenate((right_partition,rhs_m))
              print("right_partition",right_partition)
            
            left_m = np.random.choice(median_obt, size=(len(median_obt)//2), replace=False) 
            print("lhs median",left_m)
            right_m = [i for i in median_obt if i not in left_m]
            print("rhs median",right_m)
            lhs = np.concatenate((left_partition,left_m))
            print("lhs",lhs)
            rhs = np.concatenate((right_partition,right_m))
            print("rhs",rhs)
            
            for i in lhs:
              x.append(int(i))

            for j in rhs:
              y.append(int(j))

           

         
      return x,y        

  
  def partitioning(self,dimensions, k, mode):

        print("*********************** part function")
        parts = []
        
        current_part = [[x for x in range(len(self.df))]] 
        print("working part", current_part)
        while len(current_part) > 0: 
          
          part = current_part[0] 
          print("remove first element", current_part[0])
          current_part = current_part[1:] 
          print("wp after removing first part", current_part)

          if len(part) < 2*k:
            parts.append(part) 
            print("<2k", parts)
          
          else:
            print(">2k",2*k)
            print("going to span func")
            print("dimensions",dimensions)
            range_span = self.span(dimensions, part) 
            ordered_span_cols = sorted(range_span.items(), key=lambda x:x[1], reverse=True) 
            print("\nordered_span_cols",ordered_span_cols)
            
            for col, _ in ordered_span_cols: 
              print("len before split",len(part))
              print("going to split function")
              lhs, rhs = self.split( col, part, mode) 
              print("lhs",lhs)
              print("len lhs",len(lhs))
              print("rhs",rhs)
              print("len rhs",len(rhs))
              if len(lhs) >= k and len(rhs) >= k: 
                  current_part.append(lhs) 
                  current_part.append(rhs) 
                  print("current_part",current_part)
                  break 
            
            else: 

              parts.append(part) 

        return parts 



  def transform(self,dim,partition,target,mode='range'):
    transformed_df=[]
    
    
    print("partition",partition,"dim",dim,"target",target)
    for i,j in tqdm(enumerate(partition)): #i is the timestamp for partition
        agg_partition=[]
        print("i",i,"j",j)
        partition=self.df[j]
        print("self.df[j]",self.df[j])
        
        for col in dim:
            print("col",col)
            if col in self.categorical_indices:
                val= list(np.unique(partition[:,col]))
                agg_partition.append(','.join(val))
                print("agg_partition",agg_partition)
            else:
            
                if mode=='range':
                    col_min=np.min(partition[:,col])
                    col_max=np.max(partition[:,col])
                    print("col_min",col_min,"col_max",col_max)
                    if col_min== col_max:
                        agg_partition.append(col_min)
                        print("agg_partition1",agg_partition)
                    else:
                        agg_partition.append('{}-{}'.format(col_min,col_max))
                        print("agg_partition2",agg_partition)
                elif mode=='mean':
                        agg_partition.append(np.mean(partition[:,col]))
                        print("agg_partition3",agg_partition)
                        
        for k in range(len(j)): # j length of each cluster
            print("k",k)
            print("target",target)
            print(" self.df[j[k],target]", self.df[j[k],target])
            target_val = self.df[j[k],target][0]
            print("target_val",target_val)
            transformed_df.append([int(j[k])]+agg_partition+[target_val])
            print("transformed_df",transformed_df)

    
    trans = pd.DataFrame(transformed_df)
    print("trans\n",trans)
    df_mod = trans.sort_values(trans.columns[0]) 
    print("df_mod",df_mod)
    df_mod = df_mod.iloc[:,1:] 
    print("df_mod",df_mod)
    return np.array(df_mod) 
      
#Discernability Metric
#based on how many tuples are indistinguishable from it
  def discernability(self,partition,k):
    total_sum=0.0
    partition_len= len(partition)
    print("discernability")
    #print(partition_len)
    for i in range(partition_len):
      #print("total_sum", total_sum)
      #print("len(partition[i])",len(partition[i]))
      #print("len(partition[i])**20",len(partition[i])**2)
      total_sum = total_sum+ len(partition[i])**2
      #print(total_sum)
      
      optimal_d = [k*len(self.df)] 
      print("optimal",optimal_d)

      worst_d = [2*k*len(self.df)]
      print("worst",worst_d)
    return total_sum
    
#Average equivalence class metric
  def avg_equi_class(self,n_part,total_tuples,k):
    print("Average equivalence class metric")
    val= (total_tuples/n_part)/(k*1.0)

    return val

In [None]:
QI_col = ["age","workclass","education","marital-status",'occupation','relationship',"race","gender",'hours-per-week','native-country']
QI = {k: v for k, v in column_dict.items() if v in QI_col} 
QI_index = [k for k, v in column_dict.items() if v in QI_col] 

target_col = ['income']
target_attr = {k: v for k, v in column_dict.items() if v in target_col} 
target_attr_idx = [k for k, v in column_dict.items() if v in target_col] 

In [None]:
len_df= len(df1)
k_list=[5]
df_list=list()
cluster_len_list= list()
new_df_list=list()
discernability_list=list()
avg_equi_class_list= list()
df2=pd.DataFrame()
for k in k_list:
  print("*********************************************************************************************************")
  clusters= ""
  k_val= K_anonymity(df1,column_dict,k,categorical_dict,categorical_indices)
  clusters=k_val.partitioning(QI_index,k, 'relaxed')
  df_list.append(k_val.transform(QI_index,clusters, target_attr_idx))
  cluster_len_list.append(len(clusters))
  discernability_list.append(k_val.discernability(clusters,k))
  avg_equi_class_list.append(k_val.avg_equi_class(len_df,len(clusters),k))

for data in df_list:
  
  new_df_list.append(pd.DataFrame(data))
  df2= pd.DataFrame(data)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 'Prof-specialty' 'Sales' 'Exec-managerial' 'Craft-repair' 'Craft-repair'
 'Adm-clerical' 'Sales' 'Prof-specialty' 'Transport-moving'
 'Exec-managerial']
total spans {0: 14, 1: 5, 2: 6, 3: 3, 4: 7}

column 5 
dfp ['Husband' 'Husband' 'Husband' 'Not-in-family' 'Not-in-family' 'Husband'
 'Husband' 'Husband' 'Husband' 'Not-in-family' 'Not-in-family' 'Husband'
 'Not-in-family' 'Husband']
total spans {0: 14, 1: 5, 2: 6, 3: 3, 4: 7, 5: 2}

column 6 
dfp ['White' 'White' 'White' 'White' 'White' 'White' 'White' 'White' 'White'
 'White' 'White' 'White' 'White' 'White']
total spans {0: 14, 1: 5, 2: 6, 3: 3, 4: 7, 5: 2, 6: 1}

column 7 
dfp ['Male' 'Male' 'Male' 'Female' 'Female' 'Male' 'Male' 'Male' 'Male'
 'Female' 'Female' 'Male' 'Male' 'Male']
total spans {0: 14, 1: 5, 2: 6, 3: 3, 4: 7, 5: 2, 6: 1, 7: 2}

column 8 
dfp [70 65 70 64 70 64 70 70 70 65 65 65 65 65]
total spans {0: 14, 1: 5, 2: 6, 3: 3, 4: 7, 5: 2, 6: 1, 7: 2, 8: 6}

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



wp after removing first part [[464, 449, 341, 532, 1360, 243, 1052, 264], [262, 905, 535, 788, 1632, 249, 1108], [138, 1011, 1527, 1724, 751, 130, 781], [844, 895, 1734, 1742, 1738, 1799, 928], [92, 206, 293, 765, 956, 1344, 1780, 1026], [54, 1758, 1639, 1203, 799, 547, 717], [1617, 1385, 1637, 921, 147, 1751, 1425], [720, 777, 1291, 490, 1016, 768, 220], [1003, 637, 682, 303, 298, 1029, 941], [487, 974, 1272, 1725, 1521, 1776, 1556], [1520, 1538, 1814, 1330, 708, 82, 1057], [1672, 1534, 1837, 843, 1006, 649, 677], [94, 915, 1182, 517, 783, 1620, 1260, 506], [143, 988, 1808, 726, 1587, 793, 588], [541, 1083, 266, 1159, 478, 187, 1177], [1822, 785, 949, 1453, 752, 1646, 63], [1062, 1627, 370, 620, 1149, 1406, 1628, 1333], [930, 1282, 1504, 106, 1824, 946, 103], [1155, 1468, 1500, 1476, 750, 107, 1567], [76, 543, 1828, 625, 227, 724, 1237], [210, 491, 774, 1391, 1748, 1839, 302, 1712], [183, 722, 1022, 1244, 287, 1603, 1762], [1096, 229, 664, 265, 112, 806, 1726], [632, 418, 1681, 947, 2

0it [00:00, ?it/s]

i 0 j [440, 467, 630, 1275, 1397, 1435, 1707]
self.df[j] [[17 'Private' '9th' 'Never-married' 'Other-service' 'Not-in-family'
  'White' 'Male' 6 'United-States' 0]
 [19 'Private' 'Some-college' 'Never-married' 'Protective-serv'
  'Own-child' 'White' 'Male' 8 'United-States' 0]
 [18 'State-gov' '11th' 'Never-married' 'Adm-clerical' 'Own-child'
  'White' 'Female' 5 'United-States' 0]
 [17 'Private' '11th' 'Never-married' 'Other-service' 'Own-child' 'White'
  'Male' 5 'United-States' 0]
 [17 'Local-gov' '9th' 'Never-married' 'Other-service' 'Own-child'
  'Black' 'Male' 9 'United-States' 0]
 [18 'Private' '12th' 'Never-married' 'Handlers-cleaners' 'Own-child'
  'White' 'Male' 6 'United-States' 0]
 [18 'Private' 'HS-grad' 'Never-married' 'Other-service' 'Own-child'
  'White' 'Female' 8 'United-States' 0]]
col 0
col_min 17 col_max 19
agg_partition2 ['17-19']
col 1
agg_partition ['17-19', 'Local-gov,Private,State-gov']
col 2
agg_partition ['17-19', 'Local-gov,Private,State-gov', '11th,12th,9t

4it [00:00, 37.10it/s]

1
agg_partition ['17-20', 'Private']
col 2
agg_partition ['17-20', 'Private', '11th,HS-grad,Some-college']
col 3
agg_partition ['17-20', 'Private', '11th,HS-grad,Some-college', 'Never-married']
col 4
agg_partition ['17-20', 'Private', '11th,HS-grad,Some-college', 'Never-married', 'Adm-clerical,Handlers-cleaners,Other-service']
col 5
agg_partition ['17-20', 'Private', '11th,HS-grad,Some-college', 'Never-married', 'Adm-clerical,Handlers-cleaners,Other-service', 'Not-in-family,Own-child']
col 6
agg_partition ['17-20', 'Private', '11th,HS-grad,Some-college', 'Never-married', 'Adm-clerical,Handlers-cleaners,Other-service', 'Not-in-family,Own-child', 'Black,White']
col 7
agg_partition ['17-20', 'Private', '11th,HS-grad,Some-college', 'Never-married', 'Adm-clerical,Handlers-cleaners,Other-service', 'Not-in-family,Own-child', 'Black,White', 'Female,Male']
col 8
col_min 14 col_max 16
agg_partition2 ['17-20', 'Private', '11th,HS-grad,Some-college', 'Never-married', 'Adm-clerical,Handlers-cleaner

8it [00:00, 13.81it/s]

 ['24-28', 'Federal-gov,Private,State-gov']
col 2
agg_partition ['24-28', 'Federal-gov,Private,State-gov', 'Assoc-voc,Bachelors,Some-college']
col 3
agg_partition ['24-28', 'Federal-gov,Private,State-gov', 'Assoc-voc,Bachelors,Some-college', 'Married-civ-spouse,Never-married']
col 4
agg_partition ['24-28', 'Federal-gov,Private,State-gov', 'Assoc-voc,Bachelors,Some-college', 'Married-civ-spouse,Never-married', 'Adm-clerical,Exec-managerial,Other-service,Prof-specialty,Sales,Tech-support']
col 5
agg_partition ['24-28', 'Federal-gov,Private,State-gov', 'Assoc-voc,Bachelors,Some-college', 'Married-civ-spouse,Never-married', 'Adm-clerical,Exec-managerial,Other-service,Prof-specialty,Sales,Tech-support', 'Not-in-family,Own-child,Unmarried,Wife']
col 6
agg_partition ['24-28', 'Federal-gov,Private,State-gov', 'Assoc-voc,Bachelors,Some-college', 'Married-civ-spouse,Never-married', 'Adm-clerical,Exec-managerial,Other-service,Prof-specialty,Sales,Tech-support', 'Not-in-family,Own-child,Unmarried,

15it [00:00, 17.90it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

18it [00:01, 15.84it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

22it [00:01, 15.89it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (

transformed_df [[440, '17-19', 'Local-gov,Private,State-gov', '11th,12th,9th,HS-grad,Some-college', 'Never-married', 'Adm-clerical,Handlers-cleaners,Other-service,Protective-serv', 'Not-in-family,Own-child', 'Black,White', 'Female,Male', '5-9', 'United-States', 0], [467, '17-19', 'Local-gov,Private,State-gov', '11th,12th,9th,HS-grad,Some-college', 'Never-married', 'Adm-clerical,Handlers-cleaners,Other-service,Protective-serv', 'Not-in-family,Own-child', 'Black,White', 'Female,Male', '5-9', 'United-States', 0], [630, '17-19', 'Local-gov,Private,State-gov', '11th,12th,9th,HS-grad,Some-college', 'Never-married', 'Adm-clerical,Handlers-cleaners,Other-service,Protective-serv', 'Not-in-family,Own-child', 'Black,White', 'Female,Male', '5-9', 'United-States', 0], [1275, '17-19', 'Local-gov,Private,State-gov', '11th,12th,9th,HS-grad,Some-college', 'Never-married', 'Adm-clerical,Handlers-cleaners,Other-service,Protective-serv', 'Not-in-family,Own-child', 'Black,White', 'Female,Male', '5-9', 'Uni

38it [00:02, 14.86it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

42it [00:02, 13.37it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

49it [00:03, 15.58it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (

7
target [10]
 self.df[j[k],target] [1]
target_val 1
transformed_df [[440, '17-19', 'Local-gov,Private,State-gov', '11th,12th,9th,HS-grad,Some-college', 'Never-married', 'Adm-clerical,Handlers-cleaners,Other-service,Protective-serv', 'Not-in-family,Own-child', 'Black,White', 'Female,Male', '5-9', 'United-States', 0], [467, '17-19', 'Local-gov,Private,State-gov', '11th,12th,9th,HS-grad,Some-college', 'Never-married', 'Adm-clerical,Handlers-cleaners,Other-service,Protective-serv', 'Not-in-family,Own-child', 'Black,White', 'Female,Male', '5-9', 'United-States', 0], [630, '17-19', 'Local-gov,Private,State-gov', '11th,12th,9th,HS-grad,Some-college', 'Never-married', 'Adm-clerical,Handlers-cleaners,Other-service,Protective-serv', 'Not-in-family,Own-child', 'Black,White', 'Female,Male', '5-9', 'United-States', 0], [1275, '17-19', 'Local-gov,Private,State-gov', '11th,12th,9th,HS-grad,Some-college', 'Never-married', 'Adm-clerical,Handlers-cleaners,Other-service,Protective-serv', 'Not-in-family,

96it [00:07, 11.47it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100it [00:07, 11.27it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

102it [00:07,  9.39it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0


k 7
target [10]
 self.df[j[k],target] [0]
target_val 0
transformed_df [[440, '17-19', 'Local-gov,Private,State-gov', '11th,12th,9th,HS-grad,Some-college', 'Never-married', 'Adm-clerical,Handlers-cleaners,Other-service,Protective-serv', 'Not-in-family,Own-child', 'Black,White', 'Female,Male', '5-9', 'United-States', 0], [467, '17-19', 'Local-gov,Private,State-gov', '11th,12th,9th,HS-grad,Some-college', 'Never-married', 'Adm-clerical,Handlers-cleaners,Other-service,Protective-serv', 'Not-in-family,Own-child', 'Black,White', 'Female,Male', '5-9', 'United-States', 0], [630, '17-19', 'Local-gov,Private,State-gov', '11th,12th,9th,HS-grad,Some-college', 'Never-married', 'Adm-clerical,Handlers-cleaners,Other-service,Protective-serv', 'Not-in-family,Own-child', 'Black,White', 'Female,Male', '5-9', 'United-States', 0], [1275, '17-19', 'Local-gov,Private,State-gov', '11th,12th,9th,HS-grad,Some-college', 'Never-married', 'Adm-clerical,Handlers-cleaners,Other-service,Protective-serv', 'Not-in-fami

In [None]:
for i in new_df_list:

  i.columns=['age','workclass','education','marital-status','occupation','relationship',
         'race','gender','hours-per-week','native-country','income']

In [None]:
new_df_list

[        age                                          workclass  \
 0     38-39                        Local-gov,Private,State-gov   
 1     49-55                           Private,Self-emp-not-inc   
 2     37-38                                            Private   
 3     51-53                           Private,Self-emp-not-inc   
 4     28-29                                            Private   
 ...     ...                                                ...   
 1837  43-44                 Private,Self-emp-not-inc,State-gov   
 1838  47-52    Local-gov,Private,Self-emp-inc,Self-emp-not-inc   
 1839  61-75     Local-gov,Private,Self-emp-not-inc,Without-pay   
 1840  35-38  Federal-gov,Private,Self-emp-inc,Self-emp-not-inc   
 1841  50-52                     Private,Self-emp-inc,State-gov   
 
                                               education  \
 0                   10th,Bachelors,HS-grad,Some-college   
 1                     9th,Bachelors,HS-grad,Prof-school   
 2            

In [None]:
cluster_len_list

[256]

In [None]:
discernability_list

[13294.0]

In [None]:
avg_equi_class_list

[0.027795874049945712]

In [None]:
def model_evaluation1(model, metric):
    model_cv = cross_val_score(model, X_train1, y_train1, cv = StratifiedKFold(n_splits = 5), scoring = metric)
    return model_cv

In [None]:
df2.columns=['age','workclass','education','marital-status','occupation','relationship','race','gender','hours-per-week','native-country','income']

In [None]:
df2['income']

0       0
1       0
2       0
3       0
4       0
       ..
1837    0
1838    1
1839    0
1840    1
1841    0
Name: income, Length: 1842, dtype: object

In [None]:


transformer1 = ColumnTransformer([
    ('one hot', OneHotEncoder(handle_unknown = 'ignore'), ['workclass', 'education', 'marital-status',
                                                           'occupation', 'relationship', 'race', 'gender','native-country','age','hours-per-week'])
            ],  remainder = 'passthrough',sparse_threshold=0)


In [None]:
for e in df2.columns:
    df2[e]=df2[e].astype(str)

df2["income"] = pd.to_numeric(df2["income"])

In [None]:
df2.dtypes

age               object
workclass         object
education         object
marital-status    object
occupation        object
relationship      object
race              object
gender            object
hours-per-week    object
native-country    object
income             int64
dtype: object

In [None]:
X1 = df2.drop('income', axis = 1)
y1 = df2['income']

In [None]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1,y1,stratify = y1,test_size = 0.3,random_state = 2222)

In [None]:
(X_test1)

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,gender,hours-per-week,native-country
1529,47-52,"Local-gov,Private,Self-emp-inc,Self-emp-not-inc","Assoc-acdm,Doctorate,HS-grad,Masters,Prof-scho...","Divorced,Married-civ-spouse,Never-married","Exec-managerial,Prof-specialty,Sales","Husband,Not-in-family,Wife",White,"Female,Male",60,"Honduras,United-States"
708,43-47,"Federal-gov,Private,Self-emp-inc,Self-emp-not-inc","Bachelors,Doctorate,HS-grad,Masters,Some-college","Divorced,Married-civ-spouse,Never-married","Exec-managerial,Machine-op-inspct,Prof-special...","Husband,Not-in-family,Unmarried","Asian-Pac-Islander,White","Female,Male",50,"Italy,South,United-States"
432,38-39,"Local-gov,Private,State-gov","10th,Bachelors,HS-grad,Some-college","Divorced,Married-civ-spouse,Never-married","Adm-clerical,Craft-repair,Handlers-cleaners,Ot...","Husband,Not-in-family,Own-child,Wife","Black,White","Female,Male",40,United-States
177,23-24,"Private,Self-emp-inc","Bachelors,HS-grad,Some-college","Married-civ-spouse,Never-married","Adm-clerical,Craft-repair,Machine-op-inspct,Ot...","Husband,Not-in-family,Own-child,Wife","Asian-Pac-Islander,White","Female,Male",40,"Laos,United-States"
1187,39-43,"Private,Self-emp-inc,Self-emp-not-inc","Assoc-voc,Doctorate,HS-grad,Prof-school,Some-c...","Married-civ-spouse,Never-married","Craft-repair,Exec-managerial,Farming-fishing,P...","Husband,Not-in-family",White,"Female,Male",60-65,United-States
...,...,...,...,...,...,...,...,...,...,...
1218,51-52,"Private,Self-emp-inc","1st-4th,Assoc-voc,HS-grad,Prof-school,Some-col...","Married-civ-spouse,Widowed","Adm-clerical,Exec-managerial,Machine-op-inspct...","Husband,Unmarried","Asian-Pac-Islander,White","Female,Male",40,"India,Mexico,United-States"
936,37-43,Private,"5th-6th,HS-grad,Masters,Some-college","Divorced,Married-civ-spouse,Never-married,Sepa...","Adm-clerical,Craft-repair,Farming-fishing,Mach...","Husband,Not-in-family,Own-child","Black,White","Female,Male",25-32,"Guatemala,Poland,United-States"
408,28-29,Private,"Assoc-voc,Bachelors,HS-grad","Married-civ-spouse,Married-spouse-absent,Never...","Adm-clerical,Craft-repair,Prof-specialty,Prote...","Husband,Not-in-family,Wife","Asian-Pac-Islander,Black,White","Female,Male",40,"Cuba,India,United-States"
150,28-29,Private,"10th,Assoc-voc,Bachelors,HS-grad,Some-college","Divorced,Married-civ-spouse,Never-married","Adm-clerical,Craft-repair,Exec-managerial,Mach...","Husband,Not-in-family,Own-child,Unmarried","Asian-Pac-Islander,Black,White","Female,Male",40-45,"England,United-States"


In [None]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1,y1,stratify = y1,test_size = 0.3,random_state = 2222)
tree1 = DecisionTreeClassifier(random_state = 2222)
nb1 = naive_bayes.GaussianNB()
knn1 = KNeighborsClassifier()


knn_pipe1 = Pipeline([('transformer', transformer1), ('knn', knn1)])
tree_pipe1 = Pipeline([('transformer', transformer1), ('tree', tree1)])
nb_pipe1 = Pipeline([('transformer', transformer1), ('nb', nb1)])



nb_cv1= model_evaluation1(nb_pipe1, 'recall')
knn_pipe_cv1 = model_evaluation1(knn_pipe1, 'recall')
tree_pipe_cv1 = model_evaluation1(tree_pipe1, 'recall')

for model in [ tree_pipe1,nb_pipe1,knn_pipe1]:
    model.fit(X_train1, y_train1)



score_cv1 = [ tree_pipe_cv1.round(10),nb_cv1.round(10),knn_pipe_cv1.round(10)]
score_mean1 = [ tree_pipe_cv1.mean(),nb_cv1.mean(),knn_pipe_cv1.mean()]
score_std1 = [ tree_pipe_cv1.std(),nb_cv1.std(),knn_pipe_cv1.std()]
score_recall_score1 = [recall_score(y_test1, tree_pipe1.predict(X_test1),pos_label='positive',average='micro'),recall_score(y_test1, nb_pipe1.predict(X_test1),pos_label='positive',average='micro'),  recall_score(y_test1, knn_pipe1.predict(X_test1))]
method_name1 = [ 'Decision Tree Classifier', 'Naive Bayes Classifier','KNN Classifier']
cv_summary1 = pd.DataFrame({
    'method': method_name1,
    'cv score': score_cv1,
    'mean score': score_mean1,
    'std score': score_std1,
    'recall score': score_recall_score1,
    'recall score without test change': score_recall_score
})


cv_summary1

Unnamed: 0,method,cv score,mean score,std score,recall score,recall score without test change
0,Decision Tree Classifier,"[0.2424242424, 0.2424242424, 0.1940298507, 0.2...",0.225961,0.019587,0.710669,0.766727
1,Naive Bayes Classifier,"[0.7575757576, 0.8787878788, 0.7313432836, 0.7...",0.759159,0.065169,0.513562,0.464738
2,KNN Classifier,"[0.2878787879, 0.2727272727, 0.2537313433, 0.2...",0.268114,0.012084,0.314685,0.524476


In [None]:
y1.unique()

array([0, 1])