In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm, chi2, truncnorm
from tqdm import trange
from sklearn import linear_model
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from densratio import densratio

import matplotlib.pyplot as plt

# Functions

In [77]:
import numpy as np
from sklearn.linear_model import LogisticRegression

def density_ratio_estimate_prob(D_nu, D_de):
    l_nu = np.ones(len(D_nu))
    l_de = np.zeros(len(D_de))
    
    l = np.concatenate((l_nu, l_de))
    D = np.concatenate((D_nu, D_de))
    
    #fit losgistic model
    C = 0.1
    model = LogisticRegression(penalty= 'l2', C= C)
    model.fit(D, l)
    
    # get density ratios for all samples
    density_ratios = (model.predict_proba(D_de)[:, 1] / model.predict_proba(D_nu)[:, 0])*(len(D_de)/len(D_nu))
    
    return density_ratios


# def Covariate_Shift_Weight(x, z, v = 0):
#     return np.exp(((x - z @ s)**2 - (x - z @ t)**2)/2)

def Model_X(z, v):
    return z @ u + np.random.normal(0, 1, 1)

def T_statistic(y, x, z, v, reg):
    d_y = reg.predict(z.reshape(1,20))
    d_x = z @ u
    return np.abs((y-d_y)*(x-d_x))
    # return (y - z.sum()- x)**2

def Conterfeits(y, x, z, v, L, K, reg):
    M = L * K - 1
    cnt = 0
    t_stat = T_statistic(y, x, z, v, reg)
    
    for i in range(M):
        x_ = Model_X(z, v)
        if t_stat > T_statistic(y, x_, z, v, reg):
            cnt=cnt+1
            
    return cnt // K

def PCRtest(Y, X, Z, V, L, K, covariate_shift, density_ratio, reg):
    n = Y.size
    W = np.array([0.0]*L)

    for j in range(n):
        y, x, z, v = Y[j], X[j], Z[j], V[j]
        if covariate_shift == True:
            ind = Conterfeits(y, x, z, v, L, K, reg)
            W[ind] += density_ratio[j]
        if covariate_shift == False:
            W[Conterfeits(y, x, z, v, L, K, reg)] += 1     
    return W, L/n * np.dot(W - n/L, W - n/L)

def generate_cov_matrix(Y, X, Z, V, L, K, density_ratio, reg):
    """
    Generate a covariance matrix for quadratic form normal rv.

    Parameters:
    - L (int): The size of the covariance matrix.

    Returns:
    - covariance_matrix (ndarray): The generated covariance matrix.
    """
    n = Y.size
    diag = np.array([0.0]*L)
    
    for j in range(n):
        y, x, z, v = Y[j], X[j], Z[j], V[j]
        diag[Conterfeits(y, x, z, v, L, K, reg)] += (density_ratio[j]**2)
    diag = L*(diag/n)- 1/L
    covariance_matrix = np.full((L, L), -1/L)  # Fill all entries with 1/L
    np.fill_diagonal(covariance_matrix, diag)  # Set diagonal entries to 1 - 1/L^2
    return covariance_matrix
    

In [3]:
import scipy.stats as stats

def chi_squared_p_value(chi_squared_statistic, df):
    """
    Calculate the p-value from a chi-squared distribution.

    Parameters:
    - chi_squared_statistic (float): The observed chi-squared test statistic.
    - df (int): The degrees of freedom.

    Returns:
    - p_value (float): The calculated p-value.
    """
    p_value = 1 - stats.chi2.cdf(chi_squared_statistic, df)
    return p_value

def monte_carlo_p_value(n_samples, covariance_matrix, L, quantile):
    """
    Calculate the probability corresponding to a given quantile using the Monte Carlo method.

    Parameters:
    - n_samples (int): The number of Monte Carlo samples to generate.
    - covariance_matrix (ndarray): The covariance matrix of the random vector X.
    - L (int): The number of components to sum.
    - quantile (float): The quantile value.

    Returns:
    - probability (float): The estimated probability corresponding to the quantile.
    """
    count = 0
    for _ in range(n_samples):
        sample = np.random.multivariate_normal(np.zeros(L), covariance_matrix)
        squared_sum = np.sum(sample**2)
        if squared_sum <= quantile:
            count += 1

    probability = count / n_samples
    return 1-probability


# 生成数据

In [16]:
n, p = 100, 20
s = np.random.normal(0, 1, p)
t = s + np.random.normal(0, 0.1, p)
u = np.random.normal(0, 1, p)

In [161]:
# Generate Data

def generate(n, p, s, t, u, Alpha = 0):
    Z_source, Z_target = np.zeros((n, p)), np.zeros((n, p))
    V_source, V_target = 0, 0
    for i in range(n):
        Z_source[i] = np.random.normal(0, 1, p)
        Z_target[i] = np.random.normal(2, 1, p)
        
    X_source = Z_source @ u + np.random.normal(0, 1, n)
    X_target = Z_target @ u + np.random.normal(0, 1, n)

    V_source = Z_source @ s - X_source + np.random.normal(0, 1, n)
    V_target = Z_target @ t + X_target + np.random.normal(0, 1, n)
    
    Y_source, Y_target = np.zeros(n), np.zeros(n)
    for i in range(n):
        Y_source[i] = Z_source[i].sum() + X_source[i] + V_source[i] + np.random.normal(0, 1, 1) + Alpha * X_source[i]
        Y_target[i] = Z_target[i].sum() + X_target[i] + V_target[i] + np.random.normal(0, 1, 1) + Alpha * X_target[i]
    return Y_source.reshape(-1,1), X_source.reshape(-1,1), V_source.reshape(-1,1), Z_source,\
           Y_target.reshape(-1,1), X_target.reshape(-1,1), V_target.reshape(-1,1), Z_target

# Test procedure

In [162]:
K = 30
L = 10
n, p = 1000, 20
s = np.random.normal(0, 1, p)
t = s + np.random.normal(0, 0.1, p)
u = np.random.normal(0, 1, p)

In [163]:
#generate data
Y_source, X_source, V_source, Z_source, Y_target, X_target, V_target, Z_target = generate(n, p, s, t, u, 0)

# calculate density ratio
D_s = np.concatenate((X_source, Z_source, V_source), axis = 1)
D_t = np.concatenate((X_target, Z_target, V_target), axis = 1)
densratio_obj = densratio(D_t, D_s)

#calculate density ratio for each sample
sample_density_ratio = densratio_obj.compute_density_ratio(D_s)

# distill information of Z on Y
reg = LassoCV().fit(Z_source,Y_source)

RuLSIF starting...
Searching for the optimal sigma and lambda...
sigma = 0.00100, lambda = 0.00100, score = 0.00000
sigma = 0.00100, lambda = 0.01000, score = 0.00000
sigma = 0.00100, lambda = 0.10000, score = 0.00000
sigma = 0.00100, lambda = 1.00000, score = -0.00000
sigma = 0.00100, lambda = 10.00000, score = 0.00000
sigma = 0.00100, lambda = 100.00000, score = 0.00000
sigma = 0.00100, lambda = 1000.00000, score = 0.00000
sigma = 0.00100, lambda = 10000.00000, score = -0.00000
sigma = 0.00100, lambda = 100000.00000, score = 0.00000
sigma = 0.00100, lambda = 1000000.00000, score = 0.00000
sigma = 0.00100, lambda = 10000000.00000, score = 0.00000
sigma = 0.00100, lambda = 100000000.00000, score = 0.00000
sigma = 0.00100, lambda = 1000000000.00000, score = 0.00000
sigma = 0.01000, lambda = 0.00100, score = 0.00000
sigma = 0.01000, lambda = 0.01000, score = 0.00000
sigma = 0.01000, lambda = 0.10000, score = 0.00000
sigma = 0.01000, lambda = 1.00000, score = -0.00000
sigma = 0.01000, lam

  y = column_or_1d(y, warn=True)


In [164]:
print(max(sample_density_ratio))
print(min(sample_density_ratio))
mean = np.mean(sample_density_ratio)
print(mean)
sample_density_ratio = sample_density_ratio/mean

2.3694437717232137
0.30300551623361993
1.5061475866848595


In [165]:
PCRtest(Y_target, X_target, Z_target,V_target, L = 5, K = 30, covariate_shift = False, density_ratio = sample_density_ratio, reg = reg)

(array([178., 200., 182., 228., 212.]), 8.68)

In [166]:
PCRtest(Y_source, X_source, Z_source,V_source, L = 5, K = 30, covariate_shift = False, density_ratio = sample_density_ratio, reg = reg)

(array([215., 219., 188., 195., 183.]), 5.22)

In [62]:
chi_squared_p_value(65, 4)

2.573496971081113e-13

In [None]:
# verification by the p value
count = 0
for j in trange(100):
    
    cov1 = generate_cov_matrix(Y_source, X_source, Z_source,V_source, L = 20, K =30, density_ratio = sample_density_ratio, reg = reg)
    w, statistic = PCRtest(Y_source, X_source, Z_source,V_source, L = 20, K = 30, covariate_shift = True, density_ratio = sample_density_ratio, reg = reg)
    print(statistic)
    p_value = monte_carlo_p_value(100000, cov1, 20, statistic)
    print(p_value)
    if p_value < 0.1:
        count += 1
probability = count/100


  0%|          | 0/100 [00:00<?, ?it/s]

In [79]:
w

array([42.97221836, 41.13751691, 62.58925636, 58.42852195, 49.19229459,
       58.71867837, 57.38112057, 51.79035496, 51.23925774, 63.06646426,
       57.83586488, 39.78639711, 50.70822744, 35.20637234, 47.84120816,
       52.98122953, 55.47235177, 73.71218198, 49.14257705, 62.86486911])

In [531]:
PCRtest(Y_source, X_source, Z_source, V_source, L = 10, K = 20, covariate_shift = True)

(array([210.42659433, 194.17036367, 205.77078838, 199.36556849,
        226.40790323, 202.13616933, 211.62012152, 213.57050006,
        199.14345193, 194.62128313]),
 6.135967845352278)

In [None]:
*write the function

In [None]:
def multiple_test():

In [17]:
# previous code

# def int_(x):
#     if x >= 80: return 79
#     return int(x)

# def Many_Tests(m, Alpha):
#     X = []
#     n, p=500, 20
#     for i in trange(m):
#         s = np.random.normal(0, 1, p)
#         t = s + np.random.normal(0, 0.1, p)
#         u = np.random.normal(0, 1, p)
#         Y_source, X_source, V_source, Z_source, Y_target, X_target, V_target, Z_target = generate(n, p, s, t, u, Alpha)
#         u, v = PCRtest(Y_source, X_source, Z_source, L = 5, K = 20, covariate_shift = False)
#         X.append(v)
#     return X
# def Density_Variance(n):
#     a, b=0, 0
#     for i in range(n):
#         z = np.random.normal(0, 1, p)
#         x = z @ s + np.random.normal(0, 1, 1)
#         a += Covariate_Shift_Weight(x, z)**2
#         b += np.exp((z @ t - z @ s)**2)
#     return a/n, b/n

# Power simulation

1.First, we set the rejection threshold to be 0.05. We will run 1000 simulations to estimate the power and average the results over 1000 trials. Plot the power w.r.t L with fixed K = 50.

In [None]:
# simulation code
K = 20
L = 10
n, p = 1000, 20
s = np.random.normal(0, 1, p)
t = s + np.random.normal(0, 0.1, p)
u = np.random.normal(0, 1, p)

result = []
for l in range(4, 30):
    for i in range
    cov1 = generate_cov_matrix(Y_source, X_source, Z_source, L = l, K = 20)
    result.append(monte_carlo_p_value(100000, cov1, l, PCRtest(Y_source, X_source, Z_source, L = l, K = 20, covariate_shift = True)[1]))
    
    