__This notebook recreates the preprocessed datasets from the raw data files. Please keep in mind that:__

> This only applies when the provided original raw datasets (german, compas and drug) are used.

> Run this at most once (or not at all), since the recreated datasets are already included. The notebook mainly gives insight into how the datasets were created.

In [249]:
import os 
import pandas as pd
import numpy as np
import glob
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

########################### GERMAN DATASET ###########################
def recreate_german_dataset():
    """Rebuild the preprocessed German credit dataset from the raw data file.

    Reads ``./resources/german.data`` (whitespace separated, no header),
    binarises the targets and the sex attribute, moves the sex attribute to
    feature index 0, one-hot encodes the string-valued columns, standardises
    every feature column, shuffles the rows and splits them 80/20.

    Side effects:
        Writes ``data.npz`` (arrays ``X_train``, ``Y_train``, ``X_test``,
        ``Y_test``) and ``german_group_label.npz`` (array ``group_label``,
        train rows followed by test rows) to the current working directory,
        overwriting existing files.

    Raises:
        ValueError: if the personal-status column holds a code other than
            A91..A95.
    """
    file_path = os.path.join(".", "resources", "german.data")
    # sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True.
    data = pd.read_csv(file_path, sep=r"\s+", header=None)

    # The last column holds the targets; they are encoded as [1, 2] in the raw
    # file, so recode them to [0, 1]. Popping removes them before rescaling.
    targets = data.pop(data.columns[-1]).replace({1: 0, 2: 1}).rename("targets")

    # Attribute 9 of the raw data (column 8 here, since we count from 0):
    # personal status and sex
    #   A91 : male   : divorced/separated
    #   A92 : female : divorced/separated/married
    #   A93 : male   : single
    #   A94 : male   : married/widowed
    #   A95 : female : single
    # Collapse it into a binary sex attribute: male -> 0, female -> 1.
    sex_codes = {"A91": 0, "A92": 1, "A93": 0, "A94": 0, "A95": 1}
    sex = data.pop(8).map(sex_codes)
    if sex.isna().any():
        raise ValueError("unexpected personal-status code in column 8 of german.data")
    sex = sex.astype("int64")
    group_labels = sex.rename("group_labels")

    # The sensitive feature (sex) becomes feature index 0; afterwards relabel
    # all columns by position so the labels are contiguous again.
    data.insert(0, 8, sex)
    data.columns = range(data.shape[1])

    # One-hot encode all categorical (string-valued) columns; the column kind
    # is decided from the value in the first row.
    is_str = {col: isinstance(data[col].iloc[0], str) for col in data.columns}
    str_columns = [col for col, flag in is_str.items() if flag]
    not_str = [col for col, flag in is_str.items() if not flag]
    dummies = pd.get_dummies(data[str_columns])
    data = pd.concat([data[not_str], dummies], axis=1, join="inner")

    # Rescale every feature to mean 0 / std 1 before the targets and group
    # labels are attached, so that those are not rescaled as well.
    for col in data.columns:
        data[col] = preprocessing.scale(data[col])

    dataset = pd.concat([data, targets, group_labels], axis=1, join="inner")

    # Reshuffle the whole dataframe, then split it 80-20%.
    dataset = dataset.sample(frac=1, random_state=2).reset_index(drop=True)
    train, test = train_test_split(dataset, test_size=0.2, random_state=42)

    # Group labels are stored in the same order as the saved rows:
    # all train rows first, then all test rows.
    group_label = np.concatenate(
        (train["group_labels"].to_numpy(), test["group_labels"].to_numpy())
    )

    # Features are everything except the targets and the group labels.
    non_feature_cols = ["targets", "group_labels"]
    X_train = train.drop(columns=non_feature_cols).to_numpy()
    X_test = test.drop(columns=non_feature_cols).to_numpy()
    y_train = train["targets"].to_numpy()
    y_test = test["targets"].to_numpy()

    np.savez("data.npz", X_train=X_train, Y_train=y_train, X_test=X_test, Y_test=y_test)
    np.savez("german_group_label.npz", group_label=group_label)
######################################################################


########################### COMPAS DATASET ###########################
def recreate_compas_dataset():
    """Rebuild the preprocessed COMPAS dataset from the raw CSV file.

    Reads ``./resources/compas-scores-two-years.csv``, keeps the feature
    columns listed in ``used_cols``, binarises ``sex``, one-hot encodes the
    remaining string-valued columns, standardises every feature column,
    shuffles the rows and splits them 80/20.

    Side effects:
        Writes ``compas_data.npz`` (arrays ``X_train``, ``Y_train``,
        ``X_test``, ``Y_test``) and ``compas_group_label.npz`` (array
        ``group_label``, train rows followed by test rows) to the current
        working directory, overwriting existing files.

    Raises:
        ValueError: if the ``sex`` column holds a value other than
            ``"Male"`` or ``"Female"``.
    """
    raw = pd.read_csv(os.path.join(".", "resources", "compas-scores-two-years.csv"))

    # The targets are taken from the last column of the CSV.
    # NOTE(review): this is positional; confirm the file's last column is the
    # intended label before using a differently ordered export.
    targets = raw.iloc[:, -1].rename("targets")

    # Used columns as specified in the paper
    used_cols = ["sex", "juv_fel_count", "priors_count", "race", "age_cat",
                 "juv_misd_count", "c_charge_degree", "juv_other_count"]

    # Work on an explicit copy so the column assignment below does not write
    # into a slice of the raw frame.
    data = raw[used_cols].copy()

    # Encode sex as male -> 0, female -> 1; it doubles as the group label.
    sex = data["sex"].map({"Male": 0, "Female": 1})
    if sex.isna().any():
        raise ValueError("unexpected value in 'sex' column; expected 'Male' or 'Female'")
    data["sex"] = sex.astype("int64")
    group_labels = data["sex"].rename("group_labels")

    # One-hot encode the string-valued columns; the column kind is decided
    # from the value in the first row. Numeric columns keep their order, so
    # "sex" stays the first feature.
    is_str = {col: isinstance(data[col].iloc[0], str) for col in data.columns}
    str_columns = [col for col, flag in is_str.items() if flag]
    not_str = [col for col, flag in is_str.items() if not flag]
    dummies = pd.get_dummies(data[str_columns])
    data = pd.concat([data[not_str], dummies], axis=1, join="inner")

    # Rescale every feature to mean 0 / std 1 before the targets and group
    # labels are attached, so that those are not rescaled as well.
    for col in data.columns:
        data[col] = preprocessing.scale(data[col])

    dataset = pd.concat([data, targets, group_labels], axis=1, join="inner")

    # Reshuffle the whole dataframe, then split it 80-20%.
    dataset = dataset.sample(frac=1, random_state=2).reset_index(drop=True)
    train, test = train_test_split(dataset, test_size=0.2, random_state=42)

    # Group labels are stored in the same order as the saved rows:
    # all train rows first, then all test rows.
    group_label = np.concatenate(
        (train["group_labels"].to_numpy(), test["group_labels"].to_numpy())
    )

    # Features are everything except the targets and the group labels.
    non_feature_cols = ["targets", "group_labels"]
    X_train = train.drop(columns=non_feature_cols).to_numpy()
    X_test = test.drop(columns=non_feature_cols).to_numpy()
    y_train = train["targets"].to_numpy()
    y_test = test["targets"].to_numpy()

    np.savez("compas_data.npz", X_train=X_train, Y_train=y_train, X_test=X_test, Y_test=y_test)
    np.savez("compas_group_label.npz", group_label=group_label)
######################################################################


########################### DRUG DATASET ###########################
def recreate_drug_dataset():
    """Rebuild the preprocessed drug-consumption dataset from the raw file.

    Reads ``./resources/drug_consumption.data`` (comma separated, no header),
    binarises the usage class in column 20 into the targets, keeps the first
    13 columns as features with gender moved to feature index 0, derives the
    group labels from the sign of the gender value, standardises every
    feature column, shuffles the rows and splits them 80/20.

    Side effects:
        Writes ``drug2_data.npz`` (arrays ``X_train``, ``Y_train``,
        ``X_test``, ``Y_test``) and ``drug2_group_label.npz`` (array
        ``group_label``, train rows followed by test rows) to the current
        working directory, overwriting existing files.

    Raises:
        ValueError: if the target column holds a value other than CL0..CL6.
    """
    file_path = os.path.join(".", "resources", "drug_consumption.data")
    raw = pd.read_csv(file_path, delimiter=",", header=None)

    # Targets. In the real dataset it is attribute 21 (python counts from 0,
    # thus column 20 here). The raw problem is a seven-class classification
    # per drug:
    #   CL0 Never Used
    #   CL1 Used over a Decade Ago
    #   CL2 Used in Last Decade
    #   CL3 Used in Last Year
    #   CL4 Used in Last Month
    #   CL5 Used in Last Week
    #   CL6 Used in Last Day
    # It is turned into a binary problem here: "Never Used" -> 0, every other
    # class -> 1.
    usage_to_binary = {"CL0": 0, "CL1": 1, "CL2": 1, "CL3": 1, "CL4": 1, "CL5": 1, "CL6": 1}
    targets = raw.iloc[:, 20].map(usage_to_binary)
    if targets.isna().any():
        raise ValueError("unexpected usage class in column 20 of drug_consumption.data")
    targets = targets.astype("int64").rename("targets")

    # Only the first 13 attributes are used as features. Copy so that the
    # in-place pop/insert below does not operate on a slice of the raw frame.
    data = raw.iloc[:, :13].copy()

    # Sensitive feature is gender - attribute 3 (column 2); make it index 0,
    # then relabel all columns by position.
    gender = data.pop(2)
    data.insert(0, 2, gender)
    data.columns = range(data.shape[1])

    # Resulting column specification:
    # 0 = Gender, 1 = ID, 2 = Age, 3 = Education, 4 = Country, 5 = Ethnicity,
    # 6 = NScore, 7 = EScore, 8 = OScore, 9 = AScore, 10 = CScore,
    # 11 = Impulsiveness, 12 = SS (raw column 20 = TARGET)

    # Group label is 1 where the (not yet rescaled) gender value is positive,
    # 0 otherwise. Built as a named Series instead of passing a set as the
    # `columns` argument, which is unordered and rejected by newer pandas.
    group_labels = (data[0] > 0).astype("int64").rename("group_labels")

    # Rescale every feature to mean 0 / std 1 before the targets and group
    # labels are attached, so that those are not rescaled as well.
    for col in data.columns:
        data[col] = preprocessing.scale(data[col])

    dataset = pd.concat([data, targets, group_labels], axis=1, join="inner")

    # Reshuffle the whole dataframe, then split it 80-20%.
    dataset = dataset.sample(frac=1, random_state=2).reset_index(drop=True)
    train, test = train_test_split(dataset, test_size=0.2, random_state=42)

    # Group labels are stored in the same order as the saved rows:
    # all train rows first, then all test rows.
    group_label = np.concatenate(
        (train["group_labels"].to_numpy(), test["group_labels"].to_numpy())
    )

    # Features are everything except the targets and the group labels.
    non_feature_cols = ["targets", "group_labels"]
    X_train = train.drop(columns=non_feature_cols).to_numpy()
    X_test = test.drop(columns=non_feature_cols).to_numpy()
    y_train = train["targets"].to_numpy()
    y_test = test["targets"].to_numpy()

    np.savez("drug2_data.npz", X_train=X_train, Y_train=y_train, X_test=X_test, Y_test=y_test)
    np.savez("drug2_group_label.npz", group_label=group_label)
######################################################################

In [250]:
# Do not run this, since this will overwrite our datasets 

# recreate_german_dataset()
# recreate_compas_dataset()
# recreate_drug_dataset()