In [19]:
global ROOT_DIR
ROOT_DIR = '/gpfs/commons/groups/gursoy_lab/aelhussein/ot_cost/otcost_fl_rebase'
DATA_DIR = f'{ROOT_DIR}/data/Weather'

In [20]:
import pandas as pd
import numpy as np
import copy
import sys
sys.path.append(f'{ROOT_DIR}/code/helper/')
import OTCost as ot
import importlib
importlib.reload(ot)
from sklearn.preprocessing import StandardScaler
SEED = 1
np.random.seed(SEED)

In [3]:
def extractData(df, climates, n=5000):
    df = df[df['climate'].isin(climates)]
    ind = np.random.choice(df.shape[0], n)
    X = df.iloc[ind, 6:]
    y = df.iloc[ind, 3:6:2]
    return X.values, y.values

In [4]:
def dictionaryCreator(df, climates, n = 5000):
    ##wrangle to dictionary for OT cost calculation
    X1, y1 = extractData(df, climates[0],n = n)
    scaler = StandardScaler()
    X1_normalized = scaler.fit_transform(X1)   
    X2, y2 = extractData(df, climates[1],n = n)
    X2_normalized = scaler.transform(X2)  
   
    data, label = {"1": X1_normalized, "2": X2_normalized}, {"1": y1, "2": y2}
    return data, label

In [5]:
def addNoise(data, mean = 0, sigma = 1):
    n,k = data.shape[0], data.shape[1]
    noise = np.random.normal(mean, sigma, size = n*k).reshape(n,k)
    data_ = copy.deepcopy(data)
    data_ += noise
    return data_

In [6]:
def sampler(data, label,num = 2000):
    data_, label_  = {}, {}
    for i in data:
        idx = np.random.choice(np.arange(data[i].shape[0]), num, replace=False)
        data_[i] = data[i][idx]
        label_[i] = label[i][idx, 1]
    return data_, label_

In [7]:
def saveDataset(X,y, name):
    d1= np.concatenate((X, y[:,0].reshape(-1,1)), axis=1)
    np.savetxt(f'{DATA_DIR}/{name}.csv',d1)
    return

## load data

In [82]:
##load dataset
df = pd.read_csv(f'{DATA_DIR}/shifts_canonical_train.csv', nrows = 20000)
df[((df['climate'] == 'tropical') & (df['fact_temperature'] > 25)) | 
    ((df['climate'] == 'mild temperate') & ((df['fact_temperature'] > 10) & (df['fact_temperature'] < 25))) |
    (df['climate'] == 'dry') & ((df['fact_temperature'] > 5) & (df['fact_temperature'] < 25))]
df_snow = pd.read_csv(f'{DATA_DIR}/shifts_canonical_eval_out.csv', nrows = 20000)
df_snow = df_snow[df_snow['fact_temperature'] < 10]
df = pd.concat([df, df_snow])
df.dropna(inplace = True)

## OT cost (no labels as regression)

In [9]:
DATASET = 'Weather'

In [83]:
SAVE = False

In [93]:
importlib.reload(ot)
climate1 = ['tropical', 'mild temperate']
climate2 = ['tropical', 'mild temperate']
data, label = dictionaryCreator(df, [climate1, climate2])
data_, label_ = sampler(data,label)

Weather_OTCost_label = ot.OTCost(DATASET, data_, label_)
cost = Weather_OTCost_label.calculate_ot_cost()
cost = "{:.2f}".format(float(cost))

cost: 0.11


In [95]:
saveDataset(data['1'],label['1'], f'data_1_{cost}')
saveDataset(data['2'],label['2'], f'data_2_{cost}')

In [122]:
importlib.reload(ot)
climate1 = ['tropical', 'mild temperate']
climate2 = ['tropical', 'mild temperate']
data, label = dictionaryCreator(df, [climate1, climate2])
data_, label_ = sampler(data,label)

Weather_OTCost_label = ot.OTCost(DATASET, data_, label_)
cost = Weather_OTCost_label.calculate_ot_cost()
cost = "{:.2f}".format(float(cost))

if SAVE:
    saveDataset(data['1'],label['1'], f'data_1_{cost}')
    saveDataset(data['2'],label['2'], f'data_2_{cost}')

cost: 0.10


In [97]:
importlib.reload(ot)
importlib.reload(ot)
climate1 = ['tropical', 'mild temperate']
climate2 = ['dry', 'mild temperate']
data, label = dictionaryCreator(df, [climate1, climate2])
data_, label_ = sampler(data, label)

Weather_OTCost_label = ot.OTCost(DATASET, data_, label_)
cost = Weather_OTCost_label.calculate_ot_cost()
cost = "{:.2f}".format(float(cost))

cost: 0.19


In [98]:
saveDataset(data['1'],label['1'], f'data_1_{cost}')
saveDataset(data['2'],label['2'], f'data_2_{cost}')

In [121]:
importlib.reload(ot)
importlib.reload(ot)
climate1 = ['tropical', 'mild temperate']
climate2 = ['dry', 'mild temperate']
data, label = dictionaryCreator(df, [climate1, climate2])
data_, label_ = sampler(data, label)

Weather_OTCost_label = ot.OTCost(DATASET, data_, label_)
cost = Weather_OTCost_label.calculate_ot_cost()
cost = "{:.2f}".format(float(cost))

if SAVE:
    saveDataset(data['1'],label['1'], f'data_1_{cost}')
    saveDataset(data['2'],label['2'], f'data_2_{cost}')

cost: 0.19


In [100]:
importlib.reload(ot)
importlib.reload(ot)
climate1 = ['tropical', 'mild temperate']
climate2 = ['dry']
data, label = dictionaryCreator(df, [climate1, climate2])
data_, label_ = sampler(data, label)

Weather_OTCost_label = ot.OTCost(DATASET, data_, label_)
cost = Weather_OTCost_label.calculate_ot_cost()
cost = "{:.2f}".format(float(cost))

cost: 0.30


In [101]:
saveDataset(data['1'],label['1'], f'data_1_{cost}')
saveDataset(data['2'],label['2'], f'data_2_{cost}')

In [120]:
importlib.reload(ot)
importlib.reload(ot)
climate1 = ['tropical', 'mild temperate']
climate2 = ['dry']
data, label = dictionaryCreator(df, [climate1, climate2])
data_, label_ = sampler(data, label)

Weather_OTCost_label = ot.OTCost(DATASET, data_, label_)
cost = Weather_OTCost_label.calculate_ot_cost()
cost = "{:.2f}".format(float(cost))

if SAVE:
    saveDataset(data['1'],label['1'], f'data_1_{cost}')
    saveDataset(data['2'],label['2'], f'data_2_{cost}')

cost: 0.28


In [102]:
importlib.reload(ot)
importlib.reload(ot)
climate1 = ['tropical', 'mild temperate']
climate2 = ['dry', 'snow']
data, label = dictionaryCreator(df, [climate1, climate2])
data_, label_ = sampler(data, label)

Weather_OTCost_label = ot.OTCost(DATASET, data_, label_)
cost = Weather_OTCost_label.calculate_ot_cost()
cost = "{:.2f}".format(float(cost))

cost: 0.40


In [103]:
saveDataset(data['1'],label['1'], f'data_1_{cost}')
saveDataset(data['2'],label['2'], f'data_2_{cost}')

In [119]:
importlib.reload(ot)
importlib.reload(ot)
climate1 = ['tropical', 'mild temperate']
climate2 = ['dry', 'snow']
data, label = dictionaryCreator(df, [climate1, climate2])
data_, label_ = sampler(data, label)


Weather_OTCost_label = ot.OTCost(DATASET, data_, label_)
cost = Weather_OTCost_label.calculate_ot_cost()
cost = "{:.2f}".format(float(cost))

if SAVE:
    saveDataset(data['1'],label['1'], f'data_1_{cost}')
    saveDataset(data['2'],label['2'], f'data_2_{cost}')

cost: 0.38


In [108]:
importlib.reload(ot)
importlib.reload(ot)
climate1 = ['tropical', 'mild temperate']
climate2 = ['snow']
data, label = dictionaryCreator(df, [climate1, climate2])
data_, label_ = sampler(data, label)

Weather_OTCost_label = ot.OTCost(DATASET, data_, label_)
cost = Weather_OTCost_label.calculate_ot_cost()
cost = "{:.2f}".format(float(cost))


cost: 0.48


In [109]:
saveDataset(data['1'],label['1'], f'data_1_{cost}')
saveDataset(data['2'],label['2'], f'data_2_{cost}')

In [46]:
importlib.reload(ot)
importlib.reload(ot)
climate1 = ['tropical', 'mild temperate']
climate2 = ['snow']
data, label = dictionaryCreator(df, [climate1, climate2])
data_, label_ = sampler(data, label)


Weather_OTCost_label = ot.OTCost(DATASET, data_, label_)
cost = Weather_OTCost_label.calculate_ot_cost()
cost = "{:.2f}".format(float(cost))

if SAVE:
    saveDataset(data['1'],label['1'], f'data_1_{cost}')
    saveDataset(data['2'],label['2'], f'data_2_{cost}')

cost: 0.49
