In [1]:
# Root of the project checkout; the helper-module path and the data path
# used by this notebook are both derived from it.
ROOT_DIR = '/gpfs/commons/groups/gursoy_lab/aelhussein/ot_cost/otcost_fl_rebase'

# Directory that holds creditcard.csv and receives any saved splits.
DATA_DIR = f'{ROOT_DIR}/data/Credit'

In [329]:
import pandas as pd
import numpy as np
import copy
import sys
sys.path.append(f'{ROOT_DIR}/code/helper/')
import OTCost as ot
import importlib
importlib.reload(ot)
import random
# Single seed shared by every stochastic step in this notebook.
SEED = 1234
# Seed both the stdlib and the NumPy global generators; the un-seeded
# DataFrame.sample(frac=1) shuffles and np.random calls below draw from
# NumPy's global state.
random.seed(SEED)
np.random.seed(SEED)

In [455]:
## create overall test set taking random sample of dataset
def fracData(data, share, share_pos = 0.35):
    """Draw a class-rebalanced random subset of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Class`` column with groups ``0`` and ``1``.
    share : float
        Fraction of ``data``'s row count used as the target subset size.
    share_pos : float
        Fraction of the target size taken from the ``Class == 1`` group;
        the rest is taken from ``Class == 0``.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of both classes, concatenated and shuffled.

    Notes
    -----
    Both per-class counts are truncated with ``int``, so the result can hold
    slightly fewer rows than ``int(len(data) * share)``. The per-class draws
    are seeded with ``SEED``; the final shuffle is not given a seed.
    """
    target_rows = int(data.shape[0] * share)
    n_pos = int(share_pos * target_rows)
    n_neg = int((1-share_pos) * target_rows)

    by_class = data.groupby('Class')
    negatives = by_class.get_group(0).sample(n = n_neg, random_state = SEED)
    positives = by_class.get_group(1).sample(n = n_pos, random_state = SEED)

    combined = pd.concat([negatives, positives])
    return combined.sample(frac = 1)

In [331]:
def splitDataCredit(data, frac_pos, frac_neg):
    """Partition ``data`` into two disjoint frames with chosen class shares.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Class`` column with groups ``0`` and ``1``.
    frac_pos : float
        Fraction of the ``Class == 1`` rows assigned to the first frame.
    frac_neg : float
        Fraction of the ``Class == 0`` rows assigned to the first frame.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The first frame holds the sampled rows; the second holds every row
        whose index label is not in the first. Both are shuffled.
    """
    by_class = data.groupby('Class')
    negatives = by_class.get_group(0).sample(frac = frac_neg, random_state = SEED)
    positives = by_class.get_group(1).sample(frac = frac_pos, random_state = SEED)
    df_1 = pd.concat([negatives, positives])

    # The complement is selected by index label, not by position.
    in_first = data.index.isin(df_1.index)
    df_2 = data.loc[~in_first]

    return df_1.sample(frac = 1), df_2.sample(frac = 1)


In [332]:
def splitLabel(df):
    """Split a frame into a feature matrix and a flat label vector.

    Features are the columns at positions 1 through 28 (position 0 is
    skipped); labels are taken from the last column.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The 2-D feature values and the 1-D label values.
    """
    features = df.iloc[:, 1:29].values
    labels = df.iloc[:, -1].values.reshape(-1)
    return features, labels

In [333]:
def dictionaryCreater(d1, d2):
    """Pack two frames into the per-site dictionaries used for the OT cost.

    Each frame is passed through ``splitLabel``; the features and labels of
    ``d1`` are stored under key ``"1"`` and those of ``d2`` under ``"2"``.

    Returns
    -------
    (dict, dict)
        ``data`` maps site key to feature array and ``label`` maps site key
        to a 1-D label array.
    """
    data, label = {}, {}
    for site, frame in (("1", d1), ("2", d2)):
        features, targets = splitLabel(frame)
        data[site] = features
        # Collapse to one row and take it, giving a flat label vector.
        label[site] = targets.reshape(1,-1)[0]
    return data, label

In [466]:
def sampler(data, label, num = 2000):
    """Draw the same random subset of rows from each site's data and labels.

    Parameters
    ----------
    data : dict
        Maps a site key to an array indexed by row along axis 0.
    label : dict
        Maps the same keys to arrays indexable by the same row positions.
    num : int
        Maximum number of rows to draw per site, without replacement.

    Returns
    -------
    (dict, dict)
        New dictionaries with the sub-sampled data and labels under the same
        keys. The input dictionaries are not modified.
    """
    data_, label_  = {}, {}
    for i in data:
        n_rows = data[i].shape[0]
        # Sampling without replacement raises ValueError when more rows are
        # requested than exist. Cap the request so a small site returns all
        # of its rows (in random order) instead of aborting the run.
        n_draw = min(num, n_rows)
        idx = np.random.choice(n_rows, n_draw, replace=False)
        # The same positions index both arrays, keeping rows and labels aligned.
        data_[i] = data[i][idx]
        label_[i] = label[i][idx]
    return data_, label_

In [335]:
def addNoise(data, mean = 0, sigma = 1):
    """Return a copy of ``data`` with Gaussian noise added to columns 1..29.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to perturb; it is deep-copied and left untouched.
    mean, sigma : float
        Mean and standard deviation of the normal noise.

    Returns
    -------
    pandas.DataFrame
        The perturbed copy. Column 0 and any column at position 30 or later
        are not changed.
    """
    n_rows, n_cols = data.shape
    # Noise is drawn for the full frame shape; only the matching slice is used.
    noise = np.random.normal(mean, sigma, size = (n_rows, n_cols))
    noisy = copy.deepcopy(data)
    noisy.iloc[:, 1:30] = noisy.iloc[:, 1:30] + noise[:, 1:30]
    return noisy


In [336]:
def saveDataset(X,y, name):
    """Write features and labels as one table to ``{DATA_DIR}/{name}.csv``.

    ``y`` is reshaped to a single column and appended to the right of ``X``.
    The file is written with ``np.savetxt`` using its default formatting.
    """
    label_column = y.reshape(-1,1)
    table = np.concatenate((X, label_column), axis=1)
    np.savetxt(f'{DATA_DIR}/{name}.csv', table)

## Load data

In [337]:
## Load the full dataset from creditcard.csv in DATA_DIR.
## The helpers above rely on a 'Class' column holding groups 0 and 1.
df = pd.read_csv(f'{DATA_DIR}/creditcard.csv')

In [456]:
## Working subset: target size is 0.45% of the rows in df, with fracData's
## default share_pos deciding how many rows come from Class == 1.
df_ = fracData(df, 0.0045)

## OT cost

In [533]:
## Run configuration for the OT-cost cells below.
# NOTE(review): `private` is not read anywhere in the visible cells - confirm whether it is still needed.
private = False
# Dataset name passed as the first argument to ot.OTCost.
DATASET = 'Credit'
# When True, each cell writes both splits to disk through saveDataset.
SAVE = False

In [563]:
## Even split: half of each class goes to the first site, the rest to the second.
importlib.reload(ot)
frac_pos = frac_neg = 0.5
d1, d2 = splitDataCredit(df_, frac_pos, frac_neg)
data, label = dictionaryCreater(d1, d2)
# The cost is computed on a random sub-sample of each site.
data_, label_ = sampler(data, label)

Credit_OTCost_label = ot.OTCost(DATASET, data_, label_)
# Keep the cost as a two-decimal string so it can be used in file names.
cost = "{:.2f}".format(float(Credit_OTCost_label.calculate_ot_cost()))

if SAVE:
    # The full (un-sampled) splits are saved, tagged with the cost.
    saveDataset(data['1'], label['1'], f'data_1_{cost}')
    saveDataset(data['2'], label['2'], f'data_2_{cost}')

cost: 0.13


In [549]:
## Label skew only: `bias` moves positives toward the first site and
## negatives toward the second.
importlib.reload(ot)
bias = 0.3
frac_pos = 0.5*(1+bias)
frac_neg = 0.5*(1-bias)
d1, d2 = splitDataCredit(df_, frac_pos, frac_neg)
data, label = dictionaryCreater(d1, d2)
# The cost is computed on a random sub-sample of each site.
data_, label_ = sampler(data, label)

Credit_OTCost_label = ot.OTCost(DATASET, data_, label_)
# Keep the cost as a two-decimal string so it can be used in file names.
cost = "{:.2f}".format(float(Credit_OTCost_label.calculate_ot_cost()))

if SAVE:
    # The full (un-sampled) splits are saved, tagged with the cost.
    saveDataset(data['1'], label['1'], f'data_1_{cost}')
    saveDataset(data['2'], label['2'], f'data_2_{cost}')

cost: 0.20


In [560]:
## Label skew plus feature noise: the second site is additionally perturbed
## with zero-mean Gaussian noise of standard deviation 1.
importlib.reload(ot)
bias = 0.55
frac_pos = 0.5*(1+bias)
frac_neg = 0.5*(1-bias)
d1, d2 = splitDataCredit(df_, frac_pos, frac_neg)
d2 = addNoise(d2, mean = 0, sigma = 1)
data, label = dictionaryCreater(d1, d2)
# The cost is computed on a random sub-sample of each site.
data_, label_ = sampler(data, label)

Credit_OTCost_label = ot.OTCost(DATASET, data_, label_)
# Keep the cost as a two-decimal string so it can be used in file names.
cost = "{:.2f}".format(float(Credit_OTCost_label.calculate_ot_cost()))

if SAVE:
    # The full (un-sampled) splits are saved, tagged with the cost.
    saveDataset(data['1'], label['1'], f'data_1_{cost}')
    saveDataset(data['2'], label['2'], f'data_2_{cost}')

cost: 0.30


In [554]:
## Strongest setting: larger label skew, and the second site is perturbed
## with zero-mean Gaussian noise of standard deviation 2.
importlib.reload(ot)
bias = 0.89
frac_pos = 0.5*(1+bias)
frac_neg = 0.5*(1-bias)
d1, d2 = splitDataCredit(df_, frac_pos, frac_neg)
d2 = addNoise(d2, mean = 0, sigma = 2)
data, label = dictionaryCreater(d1, d2)
# The cost is computed on a random sub-sample of each site.
data_, label_ = sampler(data, label)

Credit_OTCost_label = ot.OTCost(DATASET, data_, label_)
# Keep the cost as a two-decimal string so it can be used in file names.
cost = "{:.2f}".format(float(Credit_OTCost_label.calculate_ot_cost()))

if SAVE:
    # The full (un-sampled) splits are saved, tagged with the cost.
    saveDataset(data['1'], label['1'], f'data_1_{cost}')
    saveDataset(data['2'], label['2'], f'data_2_{cost}')

cost: 0.40
