# **Import Python libraries**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
sns.set_theme(style="whitegrid")
from scipy.stats import boxcox

# Input data files are available in the read-only "../input/" directory
# (the stock Kaggle snippet that lists all files under the input directory has been removed from this cell)


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Import raw dataset**

In [2]:
# Raw Adult census income table, read from the read-only Kaggle input directory.
RAW_CSV_PATH = "../input/adult-census-income/adult.csv"
df = pd.read_csv(RAW_CSV_PATH)

# **Impute missing values with mode**

In [None]:
# Missing entries are written as the literal string '?'; turn them into NaN so
# that pandas recognises them as missing.
df[df == '?'] = np.nan

# Fill every column that has gaps with its most frequent value.
# - The column list is derived from the data rather than hard-coded: the
#   original list covered only 'workclass' and 'occupation', so a '?' in any
#   other column (e.g. 'native.country' -- verify against the data) would
#   survive as NaN all the way into the saved arrays.
# - The result is assigned back to the frame instead of using
#   `df[col].fillna(..., inplace=True)`, which operates on a temporary column
#   object and does not update `df` under pandas Copy-on-Write.
for col in df.columns[df.isna().any()]:
    df[col] = df[col].fillna(df[col].mode()[0])

# **Encode categorical variables**

In [3]:
# Collapse the education labels into six ordinal tiers, coded 0-5 in the order
# listed below (tier 0 is "Preschool" alone).
first = ["1st-4th", "5th-6th"]
second = ["7th-8th", "9th", "10th", "11th", "12th"]
third = ["HS-grad", "Some-college"]
fourth = ["Assoc-voc", "Assoc-acdm", "Bachelors"]
fifth = ["Masters", "Doctorate", "Prof-school"]
education_tiers = [["Preschool"], first, second, third, fourth, fifth]
for level, labels in enumerate(education_tiers):
    # Rewrite every row whose label belongs to this tier with the tier's code.
    df.loc[df["education"].isin(labels), "education"] = level

In [4]:
# Binarise race: "White" -> 0, every other recorded value -> 1.
# Both masks are computed before any assignment so they describe the original
# labels; rows with a missing value match neither mask and are left untouched.
race_labels = df["race"]
is_white = race_labels == "White"
is_not_white = race_labels.notna() & ~is_white
df.loc[is_white, "race"] = 0
df.loc[is_not_white, "race"] = 1

In [5]:
# Binarise native country: "United-States" -> 0, every other recorded value -> 1.
# Both masks are computed before any assignment so they describe the original
# labels; rows with a missing value match neither mask and are left untouched.
country_labels = df["native.country"]
is_us = country_labels == "United-States"
is_not_us = country_labels.notna() & ~is_us
df.loc[is_us, "native.country"] = 0
df.loc[is_not_us, "native.country"] = 1

In [8]:
# Integer codes for the two remaining binary columns. The target `income`
# uses -1 / +1 rather than 0 / 1.
binary_codes = [
    ("sex", "Male", 0),
    ("sex", "Female", 1),
    ("income", "<=50K", -1),
    ("income", ">50K", 1),
]
for column, label, code in binary_codes:
    df.loc[df[column] == label, column] = code

In [9]:
# One-hot encode the multi-valued categorical columns; each is replaced by one
# indicator column per category.
one_hot_columns = ["workclass", "occupation", "relationship", "marital.status"]
df = pd.get_dummies(df, columns=one_hot_columns)

In [10]:
# The hand-encoded columns had integer codes written into them cell by cell;
# convert each one to a proper numeric dtype.
for column in ("education", "race", "sex", "native.country", "income"):
    df[column] = pd.to_numeric(df[column])

# **Split data into separate training and test sets**

In [12]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is `income`.
X = df.drop(columns="income").copy()
y = df["income"].copy()

# Hold out 33% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33
)

# **Feature Scaling**

In [None]:
# Columns to standardise: everything except the target and the three binary
# sensitive attributes, which must keep their raw 0/1 coding.
unscaled_columns = {"race", "sex", "income", "native.country"}
t = [column for column in df.columns if column not in unscaled_columns]

In [None]:
# Work on copies so the unscaled splits stay available while scaling is applied.
X_train_temp, X_test_temp = X_train.copy(), X_test.copy()

In [14]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same transform to
# both splits so no test-set statistics leak into the scaling.
scaler = StandardScaler()
scaler.fit(X_train[t])
for source, target in ((X_train, X_train_temp), (X_test, X_test_temp)):
    scaled_values = scaler.transform(source[t])
    # Rebuild a frame with the source index so the assignment aligns row by row.
    target[t] = pd.DataFrame(scaled_values, columns=t, index=source.index)


In [15]:
# From here on the working splits are the scaled versions.
X_train, X_test = X_train_temp.copy(), X_test_temp.copy()

# **Convert dataframe into numpy array**

In [17]:
# Extract the sensitive attributes as stand-alone arrays:
# z1 = sex, z2 = race, z3 = native.country.
sensitive_columns = ("sex", "race", "native.country")
z1_train, z2_train, z3_train = (
    X_train[name].to_numpy().copy() for name in sensitive_columns
)
z1_test, z2_test, z3_test = (
    X_test[name].to_numpy().copy() for name in sensitive_columns
)

In [18]:
# Replace the pandas objects with independent numpy arrays for the steps below.
X_train, X_test = X_train.to_numpy().copy(), X_test.to_numpy().copy()
y_train, y_test = y_train.to_numpy().copy(), y_test.to_numpy().copy()

# **Group-target poisoning with a given rate**

In [19]:
# Poisoning budget: `rate` is the fraction of the training labels to flip and
# `nums` the corresponding absolute count (truncated to an integer).
rate = 0.1
nums = int(rate * len(y_train))

In [20]:
import random
def mess_up(z, y, n_flip=None, rng=None):
    """Negate `n_flip` labels chosen at random among the samples where ``z == 1``.

    Parameters
    ----------
    z : sequence
        Group indicator aligned with `y`; entries equal to 1 mark the
        targeted group.
    y : array-like with ``.copy()``
        Labels. The input is not modified; a poisoned copy is returned.
    n_flip : int, optional
        Number of labels to flip. Defaults to the notebook-level `nums`.
    rng : random.Random, optional
        Random source used for sampling. Defaults to the global `random`
        module state.

    Returns
    -------
    Copy of `y` in which the selected labels have been negated.

    Raises
    ------
    ValueError
        If the targeted group has fewer than `n_flip` members.
    """
    if n_flip is None:
        n_flip = nums
    sampler = random if rng is None else rng

    candidates = [i for i, flag in enumerate(z) if flag == 1]
    # Report the flip budget next to the size of the group it is drawn from.
    print(n_flip, len(candidates))
    if n_flip > len(candidates):
        raise ValueError(
            f"cannot flip {n_flip} labels: the targeted group has only "
            f"{len(candidates)} samples"
        )

    poisoned = y.copy()
    for idx in sampler.sample(candidates, n_flip):
        poisoned[idx] = -poisoned[idx]
    return poisoned
def mess_up2(z1, z2, y, n_flip=None, rng=None):
    """Negate `n_flip` labels split across two groups.

    ``n_flip // 2`` indices are sampled from the samples where ``z1 == 1``;
    the remainder is sampled from the samples where ``z2 == 1``, excluding
    indices already picked, so no label is flipped twice.

    Parameters
    ----------
    z1, z2 : sequence
        Group indicators aligned with `y`; entries equal to 1 mark membership.
    y : array-like with ``.copy()``
        Labels. The input is not modified; a poisoned copy is returned.
    n_flip : int, optional
        Total number of labels to flip. Defaults to the notebook-level `nums`.
    rng : random.Random, optional
        Random source used for sampling. Defaults to the global `random`
        module state.

    Returns
    -------
    Copy of `y` in which the selected labels have been negated.

    Raises
    ------
    ValueError
        If a group does not have enough not-yet-picked members for its share.
    """
    if n_flip is None:
        n_flip = nums
    sampler = random if rng is None else rng

    first_share = n_flip // 2
    shares = (first_share, n_flip - first_share)

    chosen = []
    taken = set()  # indices already selected; O(1) exclusion instead of list.remove
    for position, (z, share) in enumerate(zip((z1, z2), shares), start=1):
        pool = [i for i, flag in enumerate(z) if flag == 1 and i not in taken]
        if share > len(pool):
            raise ValueError(
                f"cannot flip {share} labels in group {position}: only "
                f"{len(pool)} eligible samples"
            )
        picks = sampler.sample(pool, share)
        chosen.extend(picks)
        taken.update(picks)

    poisoned = y.copy()
    for idx in chosen:
        poisoned[idx] = -poisoned[idx]
    return poisoned
def mess_up3(z1, z2, z3, y, n_flip=None, rng=None):
    """Negate `n_flip` labels split across three groups.

    ``n_flip // 3`` indices are sampled from each of the first two groups
    (samples where ``z1 == 1``, then ``z2 == 1``) and the remainder from the
    third (``z3 == 1``). Each draw excludes indices already picked by an
    earlier group, so no label is flipped twice.

    Parameters
    ----------
    z1, z2, z3 : sequence
        Group indicators aligned with `y`; entries equal to 1 mark membership.
    y : array-like with ``.copy()``
        Labels. The input is not modified; a poisoned copy is returned.
    n_flip : int, optional
        Total number of labels to flip. Defaults to the notebook-level `nums`.
    rng : random.Random, optional
        Random source used for sampling. Defaults to the global `random`
        module state.

    Returns
    -------
    Copy of `y` in which the selected labels have been negated.

    Raises
    ------
    ValueError
        If a group does not have enough not-yet-picked members for its share.
    """
    if n_flip is None:
        n_flip = nums
    sampler = random if rng is None else rng

    third = n_flip // 3
    shares = (third, third, n_flip - 2 * third)

    chosen = []
    taken = set()  # indices already selected; O(1) exclusion instead of list.remove
    for position, (z, share) in enumerate(zip((z1, z2, z3), shares), start=1):
        pool = [i for i, flag in enumerate(z) if flag == 1 and i not in taken]
        if share > len(pool):
            raise ValueError(
                f"cannot flip {share} labels in group {position}: only "
                f"{len(pool)} eligible samples"
            )
        picks = sampler.sample(pool, share)
        chosen.extend(picks)
        taken.update(picks)

    poisoned = y.copy()
    for idx in chosen:
        poisoned[idx] = -poisoned[idx]
    return poisoned

In [21]:
# Seed Python's RNG so the poisoned label sets are the same on every
# Restart & Run All; the sampling in mess_up / mess_up2 / mess_up3 draws from
# this global stream. 42 matches the random_state used for the train/test split.
random.seed(42)

# One poisoned copy of the training labels per sensitive attribute
# (z1 = sex, z2 = race, z3 = native.country); flips target the samples coded 1.
y_noisy_z1 = mess_up(z1_train, y_train)
y_noisy_z2 = mess_up(z2_train, y_train)
y_noisy_z3 = mess_up(z3_train, y_train)

In [22]:
# Poison the training labels with the flip budget split between the sex (z1)
# and race (z2) groups.
y_noisy_z12 = mess_up2(z1=z1_train, z2=z2_train, y=y_train)

In [23]:
# Poison the training labels with the flip budget split across the sex (z1),
# race (z2) and native.country (z3) groups.
y_noisy_z123 = mess_up3(z1=z1_train, z2=z2_train, z3=z3_train, y=y_train)

# **Save engineered data**

In [24]:
# Sanity check before saving: print the type and shape of every array that is
# about to be written. Each line reports the type of the same array whose shape
# it shows (the earlier version printed type(y_noisy_z1) next to the shapes of
# y_noisy_z2 and y_noisy_z3).
arrays_to_check = [
    y_noisy_z1, y_noisy_z2, y_noisy_z3, y_noisy_z12, y_noisy_z123,
    X_train, X_test, y_train, y_test,
    z1_train, z2_train, z3_train,
    z1_test, z2_test, z3_test,
]
for arr in arrays_to_check:
    print(type(arr), arr.shape)

In [25]:
# Write every engineered array to the working directory, one .npy file each.
outputs = {
    "X_train.npy": X_train,
    "X_test.npy": X_test,
    "y_train.npy": y_train,
    "y_test.npy": y_test,
    "z1_train.npy": z1_train,
    "z2_train.npy": z2_train,
    "z3_train.npy": z3_train,
    "z1_test.npy": z1_test,
    "z2_test.npy": z2_test,
    "z3_test.npy": z3_test,
    "y_noisy_z1.npy": y_noisy_z1,
    "y_noisy_z2.npy": y_noisy_z2,
    "y_noisy_z3.npy": y_noisy_z3,
    "y_noisy_z12.npy": y_noisy_z12,
    "y_noisy_z123.npy": y_noisy_z123,
}
for filename, array in outputs.items():
    np.save(filename, array)