In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm, chi2, truncnorm
from tqdm import trange
from sklearn import linear_model
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso

import matplotlib.pyplot as plt

## 生成数据

In [5]:
n, p=500, 20
s = np.random.normal(0, 1, p)
t = s + np.random.normal(0, 0.1, p)
u = np.random.normal(0, 1, p)

In [6]:
def generate(n, p, s, t, u, Alpha = 0):
    Z_source, Z_target = np.zeros((n, p)), np.zeros((n, p))
    V_source, V_target = 0, 0
    for i in range(n):
        Z_source[i] = np.random.normal(0, 1, p)
        Z_target[i] = np.random.normal(0.1, 1, p)
        
    X_source = Z_source @ u + np.random.normal(0, 1, n)
    X_target = Z_target @ u + np.random.normal(0, 1, n)
    
    V_source = Z_source @ s + X_source + np.random.normal(0, 1, n)
    V_target = Z_target @ t - X_target + np.random.normal(0, 1, n)
    
    Y_source, Y_target = np.zeros(n), np.zeros(n)
    for i in range(n):
        Y_source[i] = np.sin(Z_source[i].sum() + X_source[i] + V_source[i]) + np.random.normal(0, 1, 1) + Alpha * X_source[i]
        Y_target[i] = np.sin(Z_target[i].sum() + X_target[i] + V_target[i]) + np.random.normal(0, 1, 1) + Alpha * X_target[i]
    return Y_source, X_source, V_source, Z_source, Y_target, X_target, V_target, Z_target

In [7]:
Y_source, X_source, V_source, Z_source, Y_target, X_target, V_target, Z_target = generate(n, p, s, t, u, 0)

In [8]:
reg = LassoCV().fit(Z_source,Y_source)
Y_source[3]

1.8846445338158873

In [9]:
X_source.shape

(500,)

In [10]:
def Covariate_Shift_Weight(x, z, v = 0):
    return np.exp(((x - z @ s)**2 - (x - z @ t)**2)/2)

def Model_X(z, v):
    return z @ u + np.random.normal(0, 1, 1)

def T_statistic(y, x, z, v = 0):
    d_y = reg.predict(z.reshape(1,20))
    d_x = z @ u
    return np.abs((y-d_y)*(x-d_x))

In [11]:
T_statistic(Y_source[1], X_source[1], Z_source[1])

array([1.00151025])

In [12]:
for i in range(10):
    x=X_source[i]
    z=Z_source[i]
    t=T_statistic(Y_source[i], X_source[i], Z_source[i])
    print(t)

[0.62883561]
[1.00151025]
[4.64956768]
[2.11225433]
[1.38612382]
[1.46191983]
[1.03372469]
[0.08014713]
[3.10903179]
[1.37132904]


In [13]:
def Conterfeits(y, x, z, v = 0, L = 5, K = 10):
    M = L * K - 1
    cnt = 0 
    
    for i in range(M):
        x_ = Model_X(z, v)
        if T_statistic(y, x, z, v) > T_statistic(y, x_, z, v):
            cnt=cnt+1
            
    return cnt // K

def PCRtest(Y, X, Z, V = 0, L = 5, K = 20, covariate_shift = True):
    n = Y.size
    W = np.array([0.0]*L)
    
    for j in range(n):
        y, x, z, v = Y[j], X[j], Z[j], 0
        if covariate_shift == True:
            W[Conterfeits(y, x, z, v, L, K)] += Covariate_Shift_Weight(x, z, v)
        if covariate_shift == False:
            W[Conterfeits(y, x, z, v, L, K)] += 1
            
    return W, L/n * np.dot(W - n/L, W - n/L)

In [16]:
PCRtest(Y_target, X_target, Z_target, L = 5, K = 20, covariate_shift = False)

(array([125.,  92.,  94.,  88., 101.]), 8.700000000000001)

In [15]:
PCRtest(Y_source, X_source, Z_source, L = 5, K = 20, covariate_shift = False)

(array([ 95., 113.,  89., 100., 103.]), 3.24)

In [88]:
def int_(x):
    if x >= 80: return 79
    return int(x)

def Mutiple_Tests(m, Alpha):
    X = []
    n, p=500, 20
    for i in trange(m):
        s = np.random.normal(0, 1, p)
        t = s + np.random.normal(0, 0.1, p)
        u = np.random.normal(0, 1, p)
        Y_source, X_source, V_source, Z_source, Y_target, X_target, V_target, Z_target = generate(n, p, s, t, u, Alpha)
        u, v = PCRtest(Y_source, X_source, Z_source, L = 5, K = 20, covariate_shift = False)
        X.append(v)
    return X

In [89]:
H_0 = Mutiple_Tests(10, 1)
H_1 = Mutiple_Tests(10, 0)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:59<00:00,  5.92s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:59<00:00,  5.93s/it]


In [92]:
H_1.sort()
H_0.sort()

In [94]:
H_0

[1187.14,
 1188.6200000000001,
 1243.34,
 1281.16,
 1304.5,
 1321.94,
 1329.54,
 1386.7,
 1412.06,
 1446.14]

In [70]:
def Density_Variance(n):
    a, b=0, 0
    for i in range(n):
        z = np.random.normal(0, 1, p)
        x = z @ s + np.random.normal(0, 1, 1)
        a += Covariate_Shift_Weight(x, z)**2
        b += np.exp((z @ t - z @ s)**2)
    return a/n, b/n

In [71]:
Density_Variance(100000)

(array([1.14891235]), 1.161448981209541)

In [19]:
a=[]
for i in range(50):
    b=PCRtest(Y_source, X_source, Z_source, L = 5, K = 20, covariate_shift = False)
    a.append(b)

In [20]:
a.sort
a

[(array([107., 117.,  85., 100.,  91.]), 6.44),
 (array([108.,  97., 101., 108.,  86.]), 3.34),
 (array([103., 115.,  93.,  98.,  91.]), 3.68),
 (array([112.,  98.,  96., 103.,  91.]), 2.54),
 (array([107., 108.,  86., 116.,  83.]), 8.540000000000001),
 (array([105., 108., 100., 103.,  84.]), 3.54),
 (array([102., 119.,  91., 101.,  87.]), 6.16),
 (array([107., 106.,  97.,  99.,  91.]), 1.76),
 (array([102., 111.,  99., 102.,  86.]), 3.2600000000000002),
 (array([106., 107.,  97.,  99.,  91.]), 1.76),
 (array([109., 102.,  91., 115.,  83.]), 6.8),
 (array([109.,  97., 102., 107.,  85.]), 3.68),
 (array([107., 102.,  97., 118.,  76.]), 9.620000000000001),
 (array([106., 106.,  94.,  97.,  97.]), 1.26),
 (array([103., 111., 105., 100.,  81.]), 5.16),
 (array([105., 108.,  95., 103.,  89.]), 2.44),
 (array([108., 106., 106.,  95.,  85.]), 3.86),
 (array([109., 108.,  84., 113.,  86.]), 7.66),
 (array([103., 113.,  98.,  98.,  88.]), 3.3000000000000003),
 (array([102., 108.,  99., 104.,  8