In [None]:
import numpy as np
from csPCR_functions import *
from Benchmark_functions import *
import warnings
import matplotlib.pyplot as plt
import time
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [2]:
'''
First, we generate a dataset, we can change the Alpha_s, Alpha_t and effect parameter to change the distribution of
the generated dataset.
'''

import numpy as np

def generate_binary(ns, nt, p, q, s, t, u, Alpha_s=1, Alpha_t=0, effect=1, x_effect=0, z_diff=0.1, threshold_X=0.5, threshold_Y=0.5):
    # Generate normal distributions for Z
    Zs_null = np.random.normal(0, 1, (ns, q))
    Zt_null = np.random.normal(0, 1, (nt, q))
    
    # Generate Z variables with and without shift
    Z_source = np.hstack((np.random.normal(0, 1, (ns, p)), Zs_null))
    Z_target = np.hstack((np.random.normal(z_diff, 1, (nt, p)), Zt_null))
    
    # Generate X variables
    X_source = Z_source[:, :p] @ u + np.random.normal(0, 1, ns)
    X_target = Z_target[:, :p] @ u + np.random.normal(0, 1, nt)
    
    # Convert X to binary
    X_source = (np.random.rand(ns) < 1 / (1 + np.exp(-X_source))).astype(int)
    X_target = (np.random.rand(nt) < 1 / (1 + np.exp(-X_target))).astype(int)
    
    # Generate V variables
    V_source = Z_source[:, :p] @ s + Alpha_s * X_source + np.random.normal(0, 5, ns)
    V_target = Z_target[:, :p] @ t + Alpha_t * X_target + np.random.normal(0, 5, nt)
    
    # Generate Y variables
    Y_source = (Z_source[:, :p].sum(axis=1))**2 + effect * V_source + np.random.normal(0, 1, ns) + x_effect * X_source
    Y_target = (Z_target[:, :p].sum(axis=1))**2 + effect * V_target + np.random.normal(0, 1, nt) + x_effect * X_target
    
    # Convert Y to binary
    Y_source = (np.random.rand(ns) < 1 / (1 + np.exp(-Y_source))).astype(int)
    Y_target = (np.random.rand(nt) < 1 / (1 + np.exp(-Y_target))).astype(int)
    
    return Y_source.reshape(-1, 1), X_source.reshape(-1, 1), V_source.reshape(-1, 1), Z_source,\
           Y_target.reshape(-1, 1), X_target.reshape(-1, 1), V_target.reshape(-1, 1), Z_target



def generate(ns, nt, p,q, s, t, u, Alpha_s=0, Alpha_t = 2,effect=1,x_effect = 0, non_lin = 0, z_diff = 0.1):
    Zs_null = np.random.normal(0,0.1, (ns, q))
    Zt_null = np.random.normal(0,0.1, (nt, q))
    
    Z_source = np.hstack((np.random.normal(0, 1, (ns, p)) , Zs_null))
    Z_target = np.hstack((np.random.normal(z_diff, 1, (nt, p)) , Zt_null))
    
    X_source = Z_source[:, :p] @ u + np.random.normal(0, 1, ns)
    X_target = Z_target[:, :p] @ u + np.random.normal(0, 1, nt)

    V_source = Z_source[:, :p] @ s + (1-non_lin)*Alpha_s * X_source + non_lin * Alpha_s * np.sin(X_source) + np.random.normal(0, 5, ns)
    V_target = Z_target[:, :p] @ t + (1-non_lin)*Alpha_t * X_target + non_lin * Alpha_t * np.sin(X_target) + np.random.normal(0, 5, nt)
    
    # V_source = Z_source[:, :p] @ s + 2*X_source 
    # V_target = Z_target[:, :p] @ t - 2*X_target
    
    Y_source = (Z_source[:, :p].sum(axis=1))**2 + effect*V_source + x_effect*X_source + np.random.normal(0, 1, ns) 
    Y_target = (Z_target[:, :p].sum(axis=1))**2 + effect*V_target + x_effect*X_target +np.random.normal(0, 1, nt) 
    
    
    return Y_source.reshape(-1, 1), X_source.reshape(-1, 1), V_source.reshape(-1, 1), Z_source,\
           Y_target.reshape(-1, 1), X_target.reshape(-1, 1), V_target.reshape(-1, 1), Z_target

In [3]:
#Set parameter for the generation of data
ns, p,q = 1000, 5, 50
nt = 2000

s = np.array([-1, -0.5, 0, 1, 1.5])
t = np.array([ 1, -1, 0.5 , -0.5, -1])
u = np.array([ 0, -1, 0.5, -0.5, 1])

Y_source, X_source, V_source, Z_source,Y_target, X_target, V_target, Z_target = \
generate_binary(ns,nt, p,q, s, t, u, Alpha_s=1, Alpha_t = 0,effect=1, z_diff = 0.1)
            

In [4]:
# Data Separation:
# # The data arrays X_e, Z_e, V_e are designated for density ratio estimation.
# # The arrays Z_source, X_source, V_source, Y_source are used for testing.
# # Here, we split the source data based on a specified proportion.

# proportion = 0.5
# num = int(proportion * X_source.shape[0])
# Z_e = Z_source[:num]
# X_e = X_source[:num]
# V_e = V_source[:num]
# Z_source = Z_source[num+1:]
# X_source = X_source[num+1:]
# V_source = V_source[num+1:]
# Y_source = Y_source[num+1:]
# Y_shuffle = Y_source


# Real data experiments to run

In [5]:
# Data Separation:
# The data arrays X_e, Z_e, V_e are designated for density ratio estimation.
# The arrays Z_source, X_source, V_source, Y_source are used for testing.
# Here, we split the source data based on a specified proportion.

proportion = 0.5
num = int(proportion * X_source.shape[0])
Z_e = Z_source[:num]
X_e = X_source[:num]
V_e = V_source[:num]
Z_source = Z_source[num+1:]
X_source = X_source[num+1:]
V_source = V_source[num+1:]
Y_source = Y_source[num+1:]
Y_shuffle = Y_source

## Check the Type-I error control with random shuffle Y

In [6]:
count = 0
for _ in tqdm(range(100), desc='Shuffling Y'):
    np.random.shuffle(Y_shuffle)
    p_value = Test(X_e, Z_e, V_e, X_source, Z_source, V_source, Y_source, \
     X_target, Z_target, V_target, L=3, K=20, datatype='binary')
    print(p_value)
    if p_value < 0.05:
        count += 1
print(f'Simulated Type-I error rate for csPCR is {count/100}')
    
    

Shuffling Y:   1%|          | 1/100 [00:05<09:03,  5.49s/it]

weight distribution:[147.82083076 198.88683355 152.29233569], test statistic:9.616790034645762
Max density ratio:7.910513579154344
X model coef:[[-0.0514032  -0.9353809   0.42996786 -0.45622697  0.94892472  0.04966199
  -0.04342069  0.07328284 -0.00300043 -0.0500641   0.00990582  0.03391019
   0.00820105  0.00119926 -0.02915687  0.00715326  0.0615861   0.06306997
  -0.08851392 -0.04639393 -0.00621893 -0.02607277 -0.08627543  0.00433713
   0.02898531  0.0405801   0.13655379  0.0149951  -0.00817237 -0.03681633
   0.03633029  0.01305593 -0.11428264 -0.00804348 -0.04357021 -0.04462247
   0.05084868 -0.05723329 -0.07386105  0.01113687  0.05813658  0.02751246
  -0.03783302 -0.07357355 -0.00945742  0.04549016 -0.06542091  0.03985588
  -0.05113283 -0.07067943  0.03416644 -0.03603247 -0.07138034 -0.09013238
  -0.07004714]]
covariance matrix[[ 0.95011504 -0.33333333 -0.33333333]
 [-0.33333333  1.7684016  -0.33333333]
 [-0.33333333 -0.33333333  0.98719433]]
0.06360285479644634


Shuffling Y:   2%|▏         | 2/100 [00:10<08:57,  5.48s/it]

weight distribution:[186.57783569 147.04687721 165.37528709], test statistic:4.705762231965645
Max density ratio:8.270649598110472
X model coef:[[-0.0514032  -0.9353809   0.42996786 -0.45622697  0.94892472  0.04966199
  -0.04342069  0.07328284 -0.00300043 -0.0500641   0.00990582  0.03391019
   0.00820105  0.00119926 -0.02915687  0.00715326  0.0615861   0.06306997
  -0.08851392 -0.04639393 -0.00621893 -0.02607277 -0.08627543  0.00433713
   0.02898531  0.0405801   0.13655379  0.0149951  -0.00817237 -0.03681633
   0.03633029  0.01305593 -0.11428264 -0.00804348 -0.04357021 -0.04462247
   0.05084868 -0.05723329 -0.07386105  0.01113687  0.05813658  0.02751246
  -0.03783302 -0.07357355 -0.00945742  0.04549016 -0.06542091  0.03985588
  -0.05113283 -0.07067943  0.03416644 -0.03603247 -0.07138034 -0.09013238
  -0.07004714]]
covariance matrix[[ 0.9489675  -0.33333333 -0.33333333]
 [-0.33333333  1.85701984 -0.33333333]
 [-0.33333333 -0.33333333  0.92260727]]
0.27392587083312137


Shuffling Y:   3%|▎         | 3/100 [00:16<08:50,  5.46s/it]

weight distribution:[169.26030092 171.74731551 157.99238357], test statistic:0.6459906921285017
Max density ratio:7.86282992659381
X model coef:[[-0.0514032  -0.9353809   0.42996786 -0.45622697  0.94892472  0.04966199
  -0.04342069  0.07328284 -0.00300043 -0.0500641   0.00990582  0.03391019
   0.00820105  0.00119926 -0.02915687  0.00715326  0.0615861   0.06306997
  -0.08851392 -0.04639393 -0.00621893 -0.02607277 -0.08627543  0.00433713
   0.02898531  0.0405801   0.13655379  0.0149951  -0.00817237 -0.03681633
   0.03633029  0.01305593 -0.11428264 -0.00804348 -0.04357021 -0.04462247
   0.05084868 -0.05723329 -0.07386105  0.01113687  0.05813658  0.02751246
  -0.03783302 -0.07357355 -0.00945742  0.04549016 -0.06542091  0.03985588
  -0.05113283 -0.07067943  0.03416644 -0.03603247 -0.07138034 -0.09013238
  -0.07004714]]
covariance matrix[[ 1.50670753 -0.33333333 -0.33333333]
 [-0.33333333  1.35760772 -0.33333333]
 [-0.33333333 -0.33333333  0.82228383]]
0.9074999119509516


Shuffling Y:   4%|▍         | 4/100 [00:21<08:43,  5.46s/it]

weight distribution:[179.51000586 175.35083062 144.13916353], test statistic:4.494115006485091
Max density ratio:7.625102944616324
X model coef:[[-0.0514032  -0.9353809   0.42996786 -0.45622697  0.94892472  0.04966199
  -0.04342069  0.07328284 -0.00300043 -0.0500641   0.00990582  0.03391019
   0.00820105  0.00119926 -0.02915687  0.00715326  0.0615861   0.06306997
  -0.08851392 -0.04639393 -0.00621893 -0.02607277 -0.08627543  0.00433713
   0.02898531  0.0405801   0.13655379  0.0149951  -0.00817237 -0.03681633
   0.03633029  0.01305593 -0.11428264 -0.00804348 -0.04357021 -0.04462247
   0.05084868 -0.05723329 -0.07386105  0.01113687  0.05813658  0.02751246
  -0.03783302 -0.07357355 -0.00945742  0.04549016 -0.06542091  0.03985588
  -0.05113283 -0.07067943  0.03416644 -0.03603247 -0.07138034 -0.09013238
  -0.07004714]]
covariance matrix[[ 1.07545818 -0.33333333 -0.33333333]
 [-0.33333333  1.36569073 -0.33333333]
 [-0.33333333 -0.33333333  1.23493772]]
0.29210203811418256


Shuffling Y:   4%|▍         | 4/100 [00:27<10:53,  6.81s/it]


KeyboardInterrupt: 

In [7]:
count = 0
for _ in tqdm(range(100), desc='Shuffling Y'):
    np.random.shuffle(Y_shuffle)
    p_value = Test_pe(X_e, Z_e, V_e, X_source, Z_source, V_source, Y_shuffle, \
     X_target, Z_target, V_target, L=3, K=20, datatype='continuous')
    print(p_value)
    if p_value < 0.05:
        count += 1
print(f'Simulated Type-I error rate for csPCR(pe) is {count/100}')
    

Shuffling Y:   0%|          | 0/100 [00:00<?, ?it/s]


ValueError: Unknown label type: 'continuous'

In [10]:
count = 0
for _ in tqdm(range(100), desc='Shuffling Y'):
    np.random.shuffle(Y_shuffle)
    p_value = IS_test(X_e, Z_e, V_e, X_source, Z_source, V_source, Y_shuffle,\
        X_target, Z_target, V_target, L=3, K=20, datatype='binary')
    if p_value < 0.05:
        count += 1
print(f'Simulated Type-I error rate for IS is {count/100}')
    

Shuffling Y: 100%|██████████| 100/100 [01:12<00:00,  1.39it/s]

Simulated Type-I error rate for IS is 0.04





## Our method

In [15]:
#1. Test without power enhancement
p_value = Test(X_e, Z_e, V_e, X_source, Z_source, V_source, Y_source, \
     X_target, Z_target, V_target, L=3, K=20, datatype='binary')
print(f'P-value for csPCR is {p_value}')
    

P-value for csPCR is 0.15300633499865168


In [10]:
#2. Test with power enhancement

p_value = Test_pe(X_e, Z_e, V_e, X_source, Z_source, V_source, Y_source, \
        X_target, Z_target, V_target, L=3, K=20, datatype='continuous')
print(f'P-value for csPCR(pe) is {p_value}')

[0.08218237 0.69528804 0.14130273]
P-value for csPCR(pe) is 1.0


### Use a different scoring function for testing V for the Power enhancement

In [17]:
# Originally, we use v*x for scoring, here we use (-v)*x to detect 
# the negative correlation between Y and V
p_value = Test_pe(X_e, Z_e, V_e, X_source, Z_source, V_source, Y_source, \
        X_target, Z_target, V_target, L=3, K=20, datatype='binary', score = 'neg')
print(f'P-value for csPCR(pe) with negative scoring function is {p_value}')

P-value for csPCR(pe) with negative scoring function is 0.7262152709691574


## Benchmark
there are 3 benchmarks:1. Use source only data, 2. Use target only data, 3. Importance sampling method (the benchmark from others)

In [18]:
#1. Use source data only
#2. Test with power enhancement

p_value = PCR_test(X_source,Z_source,V_source,Y_source)
print(f'P-value for PCR with source data is {p_value}')

P-value for PCR with source data is 0.6250775742972228


In [19]:
#2. Use target data only

p_value = PCR_test(X_target,Z_target,V_target,Y_target)
print(f'P-value for PCR with target data is {p_value}')

P-value for PCR with target data is 0.39537210168587544


In [21]:
#3. Use Importance Sampling benchmark method

p_value = IS_test(X_e, Z_e, V_e, X_source, Z_source, V_source, Y_source,\
        X_target, Z_target, V_target, L=3, K=20, datatype='binary')
print(f'P-value for IS is {p_value}')

P-value for IS is 0.7824830494337698


## Tune hyperparameter L

In [9]:
l_lst = [2, 5, 8, 10]
result_lst = []
for l in l_lst:
    # Use any test function above
    pvalue = Test_pe(X_e, Z_e, V_e, X_source, Z_source, V_source, Y_source, X_target, Z_target, V_target, L=3, K=20, datatype='binary')
    result_lst.append(pvalue)
    print(f'L is {l}, pvalue: {pvalue}')

Training accuracy for X|Z: 0.717
[0.23433064 0.19167032 0.28155873]
[167.95443516 165.08707733 174.75729811]
L is 2, pvalue: 0.933563916771966
Training accuracy for X|Z: 0.717
[0.2952632  0.20613484 0.31138347]
[140.64749537 187.91719237 174.08845475]
L is 5, pvalue: 0.06342484042790109
Training accuracy for X|Z: 0.717
[0.2715509  0.20274784 0.29688373]
[148.2607481  172.03479922 184.74858996]
L is 8, pvalue: 0.21636110362760874
Training accuracy for X|Z: 0.717
[0.29121139 0.13614264 0.34778643]
[146.78635223 166.57257311 184.67772408]
L is 10, pvalue: 0.195118444312963
