In [1]:
import numpy as np
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from sklearn.neural_network import MLPRegressor

  from .autonotebook import tqdm as notebook_tqdm


## utils

In [63]:
def rmse(matrix1, matrix2):
    # 确保两个矩阵具有相同的维度
    if matrix1.shape != matrix2.shape:
        raise ValueError("Both matrices must have the same dimensions.")
    
    # 计算差异
    diff = matrix1 - matrix2
    
    # 计算均方误差（MSE）
    mse = torch.mean(diff ** 2)
    
    # 计算RMSE
    rmse = torch.sqrt(mse)
    
    return rmse.item()

def param_diff_loss(model, lambda_diff):
    # 提取各个逻辑回归层的权重
    w1 = model.log1.weight
    w2 = model.log2.weight
    w3 = model.log3.weight

    # 计算权重之间的绝对差值
    diff12 = torch.abs(w1 - w2).sum()
    diff23 = torch.abs(w2 - w3).sum()
    diff31 = torch.abs(w3 - w1).sum()

    # 返回加权的参数差异损失
    return lambda_diff * (diff12 + diff23 + diff31)

def gating_diff_loss(weights, lambda_gate):
    # 计算权重差异的绝对值
    diff = torch.abs(torch.max(weights, dim=1)[0]-torch.min(weights, dim=1)[0]).sum()
    # diff = (torch.abs(weights[:, :-1] - weights[:, 1:]).sum()+torch.abs(weights[:, -1] - weights[:, 0]).sum())
    # 返回加权的差异损失
    return -lambda_gate * diff/(weights.shape[0])


class GatingNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, num_experts, dim_y):
        super(GatingNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 2*hidden_size)
        self.fc3 = nn.Linear(2*hidden_size, num_experts)

        # 定义三个独立的逻辑回归模型作为专家
        self.log1 = nn.Linear(input_size, dim_y)
        self.log2 = nn.Linear(input_size, dim_y)
        self.log3 = nn.Linear(input_size, dim_y)

    def forward(self, x):
        # 门控网络部分
        x1 = F.relu(self.fc1(x))
        x1 = F.relu(self.fc2(x1))
        weights = F.softmax(self.fc3(x1), dim=1)  # 得到每个专家的权重

        # 专家的输出
        y1 = torch.sigmoid(self.log1(x)) * weights[:, 0].unsqueeze(1)  # 维度对齐
        y2 = torch.sigmoid(self.log2(x)) * weights[:, 1].unsqueeze(1)
        y3 = torch.sigmoid(self.log3(x)) * weights[:, 2].unsqueeze(1)

        # 合并各专家的加权输出
        return y1 + y2 + y3,weights
    
class GatingNetwork1(nn.Module):
    def __init__(self, input_size, dim_y):
        super(GatingNetwork1, self).__init__()
        self.log1 = nn.Linear(input_size, dim_y)

    def forward(self, x):
        # 专家的输出
        y1 = torch.sigmoid(self.log1(x)) # 维度对齐
        # 合并各专家的加权输出
        return y1


def train(model, optimizer, x_train, y_train, theta,tolerance=1e-4, max_epochs=1000,lam1=1,lam2=1):
    x_train = Variable(x_train, requires_grad=False)
    y_train = Variable(y_train, requires_grad=False)

    # last_loss = float('inf')
    for epoch in range(max_epochs):
        # 存储参数更新前的副本
        param_copies = {name: p.clone().detach() for name, p in model.named_parameters()}
        
        optimizer[0].zero_grad()
        outputs,weights = model(x_train)
        
        # loss = criterion(outputs, y_train)+param_diff_loss(model,lam1)
        loss = ((outputs-y_train)**2).sum()/(y_train.shape[0]*y_train.shape[1])+param_diff_loss(model,lam1)
        loss.backward()
        optimizer[0].step()
        
        optimizer[1].zero_grad()
        outputs,weights = model(x_train)
        loss1 =  ((outputs-y_train)**2).sum()/(y_train.shape[0]*y_train.shape[1])+gating_diff_loss(weights, lam2)
        
        loss1.backward()
        optimizer[1].step()

        # 计算参数变化
        total_change = sum((p - param_copies[name]).abs().sum().item() for name, p in model.named_parameters())
        
        if (epoch+1)%100 == 0:
            print(f'Epoch {epoch+1}, Loss: {(loss+loss1).item()}, Max param change: {total_change} rmse:{rmse(model(x_train)[0],theta)}')
            # print(gating_diff_loss(weights, lam2),weights.shape)
        
        if total_change < tolerance:
            print("Training converged.")
            break
        
def train1(model, optimizer, x_train, y_train, theta, max_epochs=1000):
    x_train = Variable(x_train, requires_grad=False)
    y_train = Variable(y_train, requires_grad=False)

    # last_loss = float('inf')
    for epoch in range(max_epochs):
        # 存储参数更新前的副本
        param_copies = {name: p.clone().detach() for name, p in model.named_parameters()}
        
        optimizer.zero_grad()
        outputs = model(x_train)
        
        # lossf = torch.nn.CrossEntropyLoss()
        loss = ((outputs-y_train)**2).sum()/(y_train.shape[0]*y_train.shape[1])
        # loss = lossf(outputs,y_train)
        loss.backward()
        optimizer.step()

        # 计算参数变化
        total_change = sum((p - param_copies[name]).abs().sum().item() for name, p in model.named_parameters())
        
        if (epoch+1)%100 == 0:
            print(f'Epoch {epoch+1}, Loss: {(loss).item()}, Max param change: {total_change} rmse:{rmse(model(x_train),theta)}')
            # print(gating_diff_loss(weights, lam2),weights.shape)
        if total_change < 1e-5:
            print("Training converged.")
            break


def px(x):
    p = x@torch.inverse(x.T@x)@x.T
    return torch.eye(p.shape[0],p.shape[1])-p



def step2(x,theta,y,M,lam1=1,lam2=1,a=1):
    beta = torch.inverse(x.T@x+lam1*torch.eye(x.shape[1],x.shape[1]))@x.T@(torch.mul(torch.mul(M,theta),y))
    
    b1 = 1/(1+2*(y.shape[0]*y.shape[1]*lam2/2)*(1-a))
    b2 = px(x)@(torch.mul(torch.mul(M,theta),y))
    a1,b,c = torch.linalg.svd(b2, full_matrices=False)
    b = b-a*(y.shape[0]*y.shape[1])*lam2/2
    for i in range(b.shape[0]):
        b[i] = b[i] if b[i].item()>0 else 0
    b = torch.diag_embed(b, 0, -2, -1)[:a1.size(0), :c.size(1)]
    b = a1@b@c*b1
    return beta,b
        

## 参数设置及模拟数据生成

In [33]:
# 混合模型数量
K = 3
# 样本量
m = 400
# 协变量特征纬度
n1 = 20
# 观测矩阵维度
n2 = 400

# 协变量矩阵的生成
x = torch.randn(m,n1)
# 模型参数生成
w = torch.normal(0.3,0.01,(2,K,n1,n2))

b = torch.normal(-1.5,0.01,(K,n2))
#观测矩阵非缺失概率计算
theta = torch.zeros((m,n2))
y = torch.zeros((m,n2))
b0 = px(x)@torch.randn(m,10)@torch.randn(10,n2)
for i in range(m):
    k = np.random.choice(list(range(K)))
    theta[i,:] = torch.sigmoid(x[i,:]@w[0,k,:,:]+b[k,:])
    y[i,:]= x[i,:]@w[1,0,:,:]+b0[i]
    
    
noise = torch.normal(0,((y-torch.mean(y))**2).sum()/(m*n2-1),(m,n2))
#根据概率生成对应的示性矩阵
M = torch.zeros((m,n2))
for i in range(m):
    for j in range(n2):
        M[i,j] = 1 if np.random.uniform(0,1) <= theta[i,j] else 0

In [35]:
torch.mean(theta)

tensor(0.2345)

In [36]:
np.linalg.matrix_rank(b0)

10

## Step1: Estimation of $\hat{\theta}$ by mixed logistic regression

## Step2

In [51]:
gating_network = GatingNetwork(n1,10,3,n2)
params_to_update = list(gating_network.log1.parameters()) + \
                   list(gating_network.log2.parameters()) + \
                   list(gating_network.log3.parameters())
optimizer = torch.optim.Adam(params_to_update, lr=0.1)

params_to_update1 = list(gating_network.fc1.parameters()) + \
                   list(gating_network.fc2.parameters()) + \
                   list(gating_network.fc3.parameters())
optimizer1 = torch.optim.Adam(params_to_update1, lr=0.1)
train(gating_network, [optimizer,optimizer1], x, M,theta=theta,tolerance=1e-3, max_epochs=1000,lam1=100000,lam2=0.01)


Epoch 100, Loss: 35549420.0, Max param change: 171.7458719909191 rmse:0.1939687281847
Epoch 200, Loss: 33063674.0, Max param change: 167.15387245453894 rmse:0.19379764795303345
Epoch 300, Loss: 32862960.0, Max param change: 164.71856048703194 rmse:0.19372862577438354
Epoch 400, Loss: 32546252.0, Max param change: 164.5410539219156 rmse:0.19370025396347046
Epoch 500, Loss: 32721902.0, Max param change: 162.84946984238923 rmse:0.19378426671028137
Epoch 600, Loss: 32813868.0, Max param change: 161.4576179459691 rmse:0.19366776943206787
Epoch 700, Loss: 32746408.0, Max param change: 163.47981677297503 rmse:0.19360977411270142
Epoch 800, Loss: 32690972.0, Max param change: 164.1170417163521 rmse:0.19368976354599
Epoch 900, Loss: 32452222.0, Max param change: 161.72854388784617 rmse:0.1935988813638687
Epoch 1000, Loss: 32227192.0, Max param change: 160.58252535853535 rmse:0.19355902075767517


In [64]:
gating_network1 = GatingNetwork1(n1,n2)
params_to_update = list(gating_network1.log1.parameters())
optimizer = torch.optim.Adam(params_to_update, lr=0.1)
train1(gating_network1, optimizer, x, M,theta=theta, max_epochs=1000)

Epoch 100, Loss: 0.12595325708389282, Max param change: 4.564533472061157 rmse:0.11738263815641403
Epoch 200, Loss: 0.12592995166778564, Max param change: 0.44202226400375366 rmse:0.11730387061834335
Epoch 300, Loss: 0.12592314183712006, Max param change: 0.27587613463401794 rmse:0.11796247959136963
Epoch 400, Loss: 0.12591825425624847, Max param change: 0.22762538492679596 rmse:0.11842868477106094
Epoch 500, Loss: 0.12591366469860077, Max param change: 0.21742883324623108 rmse:0.11881932616233826
Epoch 600, Loss: 0.125908762216568, Max param change: 0.20900645852088928 rmse:0.11914129555225372
Epoch 700, Loss: 0.12590429186820984, Max param change: 0.20194783806800842 rmse:0.11939924955368042
Epoch 800, Loss: 0.12590032815933228, Max param change: 0.19979135692119598 rmse:0.11961182951927185
Epoch 900, Loss: 0.12589658796787262, Max param change: 0.19168363511562347 rmse:0.11979876458644867
Epoch 1000, Loss: 0.12589222192764282, Max param change: 0.25402073562145233 rmse:0.11998186260

In [79]:
theta_hat = gating_network(x)[0]
beta,b = step2(x,1/(theta_hat+0.01),torch.mul(y+noise,M),M,lam1=10000,lam2=0.01,a=0.06)
print(rmse(x@beta+b,y))

theta_hat = gating_network1(x)
beta,b = step2(x,1/(theta_hat+0.01),torch.mul(y+noise,M),M,lam1=10000,lam2=0.01,a=0.06)
print(rmse(x@beta+b,y))

3.3718421459198
3.4501516819000244


In [43]:
torch.mean(theta_hat)

tensor(0.2229, grad_fn=<MeanBackward0>)

In [30]:
# # weight = gating_network(x)[1]
# theta_hat = gating_network(x)[0]
# # index = torch.argmin(weight,axis=1)
# # y_hat = torch.zeros(m,n2)
# # for i in range(max(index)+1):
# #     data_index = np.where(index==i)[0].tolist()
# #     x_part = x[data_index]
# #     y_part = torch.mul(y,M)[data_index]
# #     M_part = M[data_index]
# #     theta_hat_part = 1/theta_hat[data_index]
# #     beta,b = step2(x_part,theta_hat_part,y_part+noise[data_index],M_part,1000,0.001,0.6)
# #     y_hat[data_index] = x_part@beta+b

# # print(rmse(y_hat,y))
# beta,b = step2(x,1/theta_hat,torch.mul(y+noise,M),M,lam1=100000,lam2=0.1,a=0.06)
# print(rmse(x@beta+b,y))
# # diff = torch.max(weight,axis=1)[0]
# # for i in range(m):
# #     y_hat[i] = (1-diff[i])*y_hat[i]+(diff[i])*(x@beta+b)[i]
# # print(rmse(y_hat,y))

3.391761064529419
