In [1]:
import numpy as np
import time
import os
import gendata
import torch
from torch.utils.data import TensorDataset
from scada.models.wdgrl import WDGRL
from scada.si_scada import run_scada, kmeans_withDA


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Requested sizes for the training sample; the generation call divides both
# by 3 (integer division) before passing them on.
ns = 2000  # source-domain size
nt = 200   # target-domain size
# Feature dimensionality and the `delta` argument handed to the data generator.
d, delta = 10, 6

Generate training data

In [3]:
# Draw the training sample for the source (Xs) and target (Xt) domains.
# `dim` follows the configured dimensionality `d` instead of a repeated
# literal, so the data always matches the `input_dim=d` given to the model.
# `return_Sigma=True` is passed explicitly (as in the test-data call) because
# the 7-value unpack below relies on Sigma being returned.
Xs, Xt, Ys, Yt, mus, mut, Sigma = gendata.random_3_clusters(
    dim=d,
    delta=delta,
    ns=ns // 3,  # requested size is split by 3 — presumably one share per cluster
    nt=nt // 3,
    cluster_std=np.array([0.25, 0.5, 1]),
    return_Sigma=True,
)


In [4]:
# Overwrite the requested sizes with the number of rows actually generated
# (the generator was called with ns // 3 and nt // 3, so these may differ).
ns, nt = len(Xs), len(Xt)

In [5]:
# Convert both NumPy samples to float64 tensors.
xs, xt = (torch.from_numpy(array).double() for array in (Xs, Xt))

# Wrap each domain in its own dataset; only features are stored, no labels.
source_dataset, target_dataset = TensorDataset(xs), TensorDataset(xt)

Train the representation-learning-based domain adaptation model (WDGRL)

In [6]:
# Constructor arguments for the WDGRL model, collected in one place so they
# are easy to inspect and tweak.
wdgrl_config = {
    "input_dim": d,                      # must match the generated data's feature count
    "encoder_hidden_dims": [300, 100],
    "critic_hidden_dims": [100],
    "alpha1": 1e-4,
    "alpha2": 1e-5,
}
final_model = WDGRL(**wdgrl_config)

In [7]:

# Directory that receives the model checkpoints. The same string is handed to
# `train`, `save_model`, and later `run_scada`, so it is kept verbatim.
log_dir = "saved_models/ex1/"
# Create the directory (and any missing parents) up front; no-op if it exists.
os.makedirs(log_dir, exist_ok=True)

In [8]:
# Options for the training run, gathered so the call itself stays short.
# With early stopping enabled, the recorded output shows intermediate
# checkpoints being written under `model_path`.
train_options = dict(
    num_epochs=3000,
    gamma=5.0,
    dc_iter=6,
    batch_size=32,
    early_stopping=True,
    model_path=log_dir,
)

log_loss = final_model.train(source_dataset, target_dataset, **train_options)

# Persist the final model to the same directory.
final_model.save_model(log_dir)

# Keep the "loss" entry of the training log for later inspection.
total_loss = log_loss["loss"]


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
Epoch:  54%|█████▎    | 1606/3000 [00:38<02:47,  8.31it/s]

Encoder and Critic saved to saved_models/ex1//early_model


Epoch:  94%|█████████▎| 2805/3000 [01:13<00:05, 34.18it/s]

Encoder and Critic saved to saved_models/ex1//early_model


Epoch:  97%|█████████▋| 2905/3000 [01:15<00:02, 34.26it/s]

Encoder and Critic saved to saved_models/ex1//early_model


Epoch: 100%|██████████| 3000/3000 [01:17<00:00, 38.54it/s]

Encoder and Critic saved to saved_models/ex1/





In [9]:
# Requested sizes and dimensionality for the test sample.
# NOTE: this rebinds the `ns` / `nt` / `d` names used for the training stage.
ns, nt, d = 100, 50, 10

Generate test data

In [45]:
# Draw the test sample for both domains; `seed=None` is passed, and Sigma is
# requested explicitly because the inference step needs it.
# NOTE: Ys, Yt, mus, mut and Sigma from the training stage are rebound here.
Xs_test, Xt_test, Ys, Yt, mus, mut, Sigma = gendata.random_3_clusters(
    dim=d,
    delta=delta,
    ns=ns // 3,
    nt=nt // 3,
    cluster_std=np.array([0.25, 0.5, 1]),
    seed=None,
    return_Sigma=True,
)

# Replace the requested sizes with the shapes of what was actually generated.
ns, nt = len(Xs_test), len(Xt_test)
d = Xs_test.shape[1]
n = ns + nt  # total number of test points across both domains

k-means clustering with k = 3 

In [46]:
# Torch device used for feature extraction, and the number of k-means clusters.
device, K = "cpu", 3

In [47]:
from scada.utils.kmeans import kmeans

In [48]:
# Move the test samples onto the chosen device as float64 tensors.
Xs_torch, Xt_torch = (
    torch.from_numpy(array).double().to(device) for array in (Xs_test, Xt_test)
)

# Encode both domains with the trained model; gradients are not needed here.
with torch.no_grad():
    xs_hat, xt_hat = (
        final_model.extract_feature(batch).cpu().numpy()
        for batch in (Xs_torch, Xt_torch)
    )

# Stack source rows first, then target rows, and cluster the joint
# representation into K groups.
X_transformed = np.vstack([xs_hat, xt_hat])
kmeans_result = kmeans(X_transformed, K)
initial_centroids_obs, labels_all_obs, members_all_obs = kmeans_result

In [49]:
# Echo the checkpoint directory that the inference step below passes as `model_path`.
log_dir

'saved_models/ex1/'

Statistical Inference for k-means Clustering after Domain Adaptation (SCaDA)

In [50]:
# Pick two distinct cluster indices out of range(K) to compare.
# The global NumPy RNG is used unseeded here, so the pair varies between runs.
cluster_pair = np.random.choice(K, size=2, replace=False)
c1, c2 = cluster_pair

In [51]:
# Report which pair of clusters was drawn for the test.
print(
    "Statistical inference with SCADA for 2 clusters: ",
    c1,
    " and ",
    c2,
)

Statistical inference with SCADA for 2 clusters:  2  and  0


In [52]:
# Run SCaDA on the test sample for the chosen pair of clusters.
# The recorded output shows the encoder and critic being reloaded from
# `model_path`, so the checkpoint saved during training must exist there.
pvalue = run_scada(
    Xs=Xs_test,
    Xt=Xt_test,
    Sigma=Sigma,
    n_clusters=K,
    labels_all_obs=labels_all_obs,  # cluster labels observed on the encoded data
    model_path=log_dir,
    hypothesis=(c1, c2),
)

Encoder and Critic loaded from saved_models/ex1/


Progress:   0%|          | 0/2913542 [00:00<?, ?it/s]

Progress: : 2919564it [00:15, 182902.50it/s]                           


In [53]:
# Show the resulting p-value next to the pair of clusters it refers to.
print(
    "p-value for clusters ",
    c1,
    " and ",
    c2,
    ": ",
    pvalue,
)

p-value for clusters  2  and  0 :  0.014932843195892609
