In [51]:
import os

import shutil

import torch
from collections import Counter
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import os
import pandas as pd

from joblib import Parallel, delayed
import argparse
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

# folder_path = 'model1/'

# Create the folder if it doesn't exist
# os.makedirs(folder_path, exist_ok=True)

import numpy as np
import torch


def f_1(x):
    return x[:, 0] + 0.25 * x[:, 1] ** 2+0.1*torch.tanh(0.5*x[:,2]-0.3)

# def f_2(x):
#     return x[:,0]**2/2-abs(x[:,4]*x[:,9])+torch.exp(0.1*x[:,14])-torch.sin(3.141592*x[:,19])
# def f_2(x):
#     return 2*(x[:,0]>0)-2*(x[:,0]<0)

def poisson_loss(logits,y_true):
    """
    Compute the Poisson negative log-likelihood loss.
    
    Args:
        y_true (torch.Tensor): True labels (0, 1, 2, ...), shape (batch_size,).
        logits (torch.Tensor): Output of the DNN (before exponentiation), shape (batch_size,).
    
    Returns:
        torch.Tensor: Mean negative log-likelihood loss over the batch.
    """
    # Convert logits to λ(x) = e^logits
    lambda_pred = torch.exp(logits)
    
    # Compute the negative log-likelihood
    loss = lambda_pred - y_true * logits  # Equivalent to λ(x) - Y * log(λ(x))
    return loss.mean()

class SampleSet:
    def __init__(self, n, p,f_X,module='Bernoulli', mean=0, std=1,trials=None):
        """
        Initializes the SampleSet with n samples and p features for X.
        Y is generated based on the conditional probability P(Y=1|X).
        """
        self.n = n
        self.p = p
        self.mean = mean
        self.std = std
        self.r=None
        self.B=None
        self.module=module
        # Generate X with dimension (n, p)
        X_main = torch.normal(0.0, 1.0, size=(n, 2))
        X_noise = torch.normal(0.0, 1, size=(n, p - 2))
        self.X = torch.cat([X_main, X_noise], dim=1)
        # self.X=2*torch.rand((n, p)) -2
        self.subtrain=None
        self.subval=None
        self.counts=None
        self.trials=trials
        self.f_X=f_X
        # Compute z = f(X) and use it to generate P(Y=1|X) and Y
        self.z = self._compute_z(self.X)  # Save z values (f(X))
        self.Y = self._generate_Y(self.z)
    
    def _compute_z(self, X):
        return   self.f_X(X)
    
    def _generate_Y(self, z):
        if self.module=='Bernoulli':
            # Generate Y as a Bernoulli random variable with probability P(Y=1|X)
            P_Y_given_X = 1 / (1 + torch.exp(-z))
            Y = torch.bernoulli(P_Y_given_X)
        elif self.module=='Gaussian':
            Y = torch.normal(mean=z, std=1.0)
        elif self.module=='Binomial':
            if self.trials is None:
                self.trials = torch.randint(low=5, high=6, size=(self.n,))  

            P_Y_given_X = 1 / (1 + torch.exp(-z)) # Ensure the rate parameter is positive
            Y = torch.binomial(self.trials.float(), P_Y_given_X)

        elif self.module == 'Poisson':
        # Generate Y as a Poisson random variable with rate parameter (lambda) equal to exp(z)
            rate_param = torch.log(1+torch.exp(z))   # Ensure the rate parameter is positive
            Y = torch.poisson(rate_param)
        else:
        # Raise an error for unsupported modules
            raise ValueError(f"Unsupported module type: {self.module}. Expected one of: 'Bernoulli', 'Gaussian', 'Exponential', 'Poisson'.")
        return Y
    
    def get_z(self):
        """Returns the computed z values, which represent f(X)."""
        return self.z
    
    def get_sample_set(self):
        """Returns the main sample set (X, Y)."""
        return self.X, self.Y
    
    def get_sub_samples_with_validation(self, B, r):
        """
        Generates B sub-sample sets, each containing r samples randomly selected 
        from the main sample set, along with corresponding validation sets.
        
        Also counts the number of times each index is selected across all B sub-samples.
        
        Returns:
            train_samples: List of tuples, each containing (train_X, train_Y, train_indices)
            validation_samples: List of tuples, each containing (val_X, val_Y, val_indices)
            selection_counts: Dictionary with counts of each index's appearance in the B sub-samples.
        """
        train_samples = []
        validation_samples = []
        selection_counts = Counter({i: 0 for i in range(self.n)})  # To track appearances of each index
        indices = torch.arange(self.n)
        self.B=B
        self.r=r
        for _ in range(B):
            # Randomly select r unique indices for the sub-sample
            selected_indices = indices[torch.randperm(self.n)[:r]]
            
            # Update selection count for each index
            selection_counts.update(selected_indices.tolist())
            
            # Get validation indices (those not in selected_indices)
            val_indices = torch.tensor([i for i in indices if i not in selected_indices])

            # Separate sub-sample and validation sets, including original indices
            X_sub = self.X[selected_indices]
            Y_sub = self.Y[selected_indices]
            X_val = self.X[val_indices]
            Y_val = self.Y[val_indices]
            
            # Append to train_samples and validation_samples lists
            train_samples.append((X_sub, Y_sub, selected_indices))
            validation_samples.append((X_val, Y_val, val_indices))
        self.subtrain=train_samples
        self.subval=validation_samples
        self.counts=dict(selection_counts)
        return train_samples, validation_samples, dict(selection_counts)
    
    def save(self, file_path):
        """Saves the SampleSet instance to a file."""
        torch.save(self, file_path)
    
    @staticmethod
    def load(file_path):
        """Loads a SampleSet instance from a file."""
        return torch.load(file_path)



def clear_folder(folder_path):
    if not os.path.exists(folder_path):
        print(f"folder {folder_path} does not exist")
        return


    for item in os.listdir(folder_path):
        item_path = os.path.join(folder_path, item)

        if os.path.isfile(item_path):
            os.remove(item_path)

        elif os.path.isdir(item_path):
            shutil.rmtree(item_path)



def train_multiple_RF(sample_set, mode="Bernoulli"):
    train_samples, validation_samples, selection_counts = sample_set.subtrain, sample_set.subval, sample_set.counts
    
    models = []  
    for i, (train_data, val_data) in enumerate(zip(train_samples, validation_samples)):
        print(i)
        X_sub, Y_sub, _ = train_data
        X_val, Y_val, _ = val_data
        if mode=="Bernoulli":

            rf = RandomForestClassifier(n_estimators=200,criterion="entropy",   
                 random_state=42)
            
            rf.fit(X_sub, Y_sub)

            # 预测概率
            models.append(rf)
            # p_hat = (rf.predict_proba(X_val)[:, 1])[0:30]
            # print('hat:',p_hat)
            # p_true=torch.sigmoid(f_1(X_val))[0:30]
            # print('ture:',p_true)

        elif mode == "Poisson":
            rf = RandomForestRegressor(n_estimators=200,criterion="poisson", random_state=42)
            rf.fit(X_sub.numpy(), Y_sub.numpy()) 
            models.append(rf)
            

        


        # Validation performance (optional)


    
    return models





def ensemble_predict_batch_f(Xtest, models, sample_set):
    ntest = Xtest.shape[0]
    n = sample_set.n       # Total number of original samples
    r = sample_set.r       # Size of each sub-sample
    B = len(models)      # Number of sub-samples (number of neural networks)
    mtype=sample_set.module
    # Collect logits from all networks for the test set (shape: [ntest, B])
    all_outputs = torch.zeros(ntest, B)
    if mtype == "Bernoulli":   
        for j, net in enumerate(models):
            p_hat = net.predict_proba(Xtest)[:, 1]   # ndarray
            print(p_hat)
            all_outputs[:, j] = torch.tensor(p_hat)
            # 

    elif mtype == "Poisson":  
        for j, net in enumerate(models):              
            lam_hat = net.predict(Xtest)            # ndarray
            all_outputs[:, j] = torch.tensor(lam_hat)
            # all_outputs=torch.log(all_outputs)

    if mtype == "Bernoulli":  
        all_outputs=torch.log(all_outputs/(1-all_outputs))
    elif mtype == "Poisson":  
        all_outputs=torch.log(all_outputs)


    # Compute inclusion counts J_bji and mean inclusion J_dot_i for each training index i
    J_bji = sample_set.counts  # Dict mapping i -> count of i in each sub-sample
    J_dot_i = {i: J_bji[i] / B for i in range(n)}

    # Ensemble mean prediction for each test sample
    hatf_B = all_outputs.mean(dim=1)  # Shape: [ntest]

    # Initialize accumulators for variance correction terms
    sum_V2 = torch.zeros(ntest)      # Accumulate sum of hat_V_i^2 over i
    sum_Zdiff2 = torch.zeros(ntest)  # Accumulate sum of (Z_ji - hat_V_i)^2 over i and j

    # Loop over each original data index i
    for i in range(n):
        # Gather Z_{b_j i}(x*) for all sub-samples j (shape: [B, ntest])
        Zs = torch.zeros(B, ntest)
        for j in range(B):
            _, _, Jbjicount = sample_set.subtrain[j]
            in_subset = 1.0 if (i in Jbjicount) else 0.0
            deviations = all_outputs[:, j] - hatf_B  # Shape: [ntest]
            Zs[j] = (in_subset - J_dot_i[i]) * deviations

        # Compute hat_V_i(x*) and accumulate
        hat_V_i = Zs.mean(dim=0)  # Shape: [ntest]
        sum_V2 += hat_V_i.pow(2)
        sum_Zdiff2 += (Zs - hat_V_i.unsqueeze(0)).pow(2).sum(dim=0)

    # Correction factor: n(n-1)/(n-r)^2
    factor = (n - 1) / n * (n / (n - r))**2

    # Compute corrected variance terms
    term1 = factor * sum_V2
    term2 = factor * sum_Zdiff2 / (B * (B - 1))
    var_f = term1 - term2          # Bias-corrected variance estimate

    # Standard deviations
    sd_f_raw = torch.sqrt(term1)       # Without bias correction
    sd_f_correct = torch.sqrt(var_f)   # With bias correction


    return all_outputs, [hatf_B, sd_f_raw, sd_f_correct]



def run_one_repeat(rep_id, n, r, B, p, GLM_name, f_1, xtest):
    # 每个 repeat 训练 B 个网络，最后做一次 ensemble 预测
    ss = SampleSet(n, p, f_1, module=GLM_name)
    ss.get_sub_samples_with_validation(B, r)

    
    models=train_multiple_RF(ss, mode=GLM_name)
    _, Bf = ensemble_predict_batch_f(xtest, models, ss)
    return Bf[0], Bf[1],Bf[2]











In [52]:

if __name__ == "__main__":
    # parser = argparse.ArgumentParser()
    # parser.add_argument("--n",     type=int,   required=True)
    # parser.add_argument("--index", type=float, required=True)
    # args = parser.parse_args()
    n=700
    index_e=0.9

######Constant Area#######
    # n        = args.n
    # index_e  = args.index
    r        = int(n ** index_e)
    B        = 3            # or your B
    p        = 10           # or your p
    GLM_name = "Bernoulli"  # or your mode      
    folder = 'resultspart1'  
    xtest    = torch.load(f"xtest10.pt")  # 预先生成并保存
    ss = SampleSet(n, p, f_1, module=GLM_name)
    ss.get_sub_samples_with_validation(B, r)

    
    Af, Bf = ensemble_predict_batch_f(xtest, models, ss)


0


1
2
[0.7   0.765 0.34  0.68  0.555 0.435 0.575 0.27  0.425 0.21  0.435 0.24
 0.775 0.57  0.72  0.51  0.325 0.645 0.37  0.695 0.775 0.36  0.255 0.285
 0.32  0.485 0.245 0.545 0.685 0.815 0.56  0.765 0.24  0.73  0.495 0.405
 0.745 0.42  0.51  0.585 0.53  0.435 0.74  0.465 0.12  0.67  0.46  0.42
 0.71  0.445 0.79  0.465 0.665 0.65  0.34  0.49  0.69  0.16  0.49  0.475
 0.63  0.475 0.83  0.525 0.695 0.615 0.48  0.195 0.34  0.37  0.845 0.505
 0.55  0.59  0.75  0.59  0.61  0.585 0.42  0.8  ]
[0.56  0.595 0.26  0.56  0.6   0.39  0.62  0.4   0.43  0.245 0.48  0.145
 0.695 0.575 0.54  0.515 0.57  0.655 0.68  0.505 0.775 0.405 0.215 0.46
 0.365 0.61  0.52  0.735 0.435 0.69  0.525 0.555 0.375 0.525 0.4   0.41
 0.6   0.525 0.39  0.735 0.655 0.395 0.73  0.54  0.255 0.65  0.485 0.445
 0.79  0.4   0.72  0.415 0.36  0.75  0.405 0.485 0.49  0.31  0.45  0.535
 0.64  0.415 0.71  0.51  0.605 0.515 0.675 0.225 0.7   0.505 0.815 0.45
 0.45  0.515 0.765 0.7   0.585 0.66  0.455 0.67 ]
[0.675 0.5   0.425 0.455 

tensor([ 6.0645e-01,  5.2165e-01, -6.7051e-01,  2.7148e-01,  4.2988e-01,
        -2.0290e-01,  2.7061e-01, -8.2407e-01, -3.8650e-01, -1.2196e+00,
        -1.2718e-01, -1.0224e+00,  1.2037e+00,  2.4816e-01,  6.9981e-01,
        -1.9521e-01, -1.4524e-02,  4.3275e-01,  2.0208e-01,  5.2483e-01,
         1.3078e+00, -4.6213e-01, -1.1918e+00, -3.4004e-01, -5.3659e-01,
         4.8647e-01, -6.3090e-01,  7.9364e-01,  3.7083e-01,  9.3839e-01,
         2.5586e-01,  6.1617e-01, -7.9813e-01,  6.3944e-01, -3.9308e-01,
        -1.5560e-01,  7.1363e-01, -1.1428e-01,  4.1637e-02,  6.4628e-01,
         4.6744e-01, -3.5750e-01,  1.0117e+00, -1.2152e-01, -1.2805e+00,
         5.0257e-01, -8.6789e-02, -2.5096e-02,  1.3452e+00, -1.7543e-01,
         1.0796e+00, -8.7556e-02,  1.6758e-02,  8.7915e-01, -1.3563e-01,
         1.0877e-01,  3.2700e-01, -1.2712e+00, -2.5763e-01,  2.7233e-01,
         6.2045e-01,  2.2470e-02,  1.2022e+00, -4.6884e-05,  5.6575e-01,
         2.5652e-01,  2.4363e-01, -1.3166e+00,  1.8

In [None]:

if __name__ == "__main__":
    # parser = argparse.ArgumentParser()
    # parser.add_argument("--n",     type=int,   required=True)
    # parser.add_argument("--index", type=float, required=True)
    # args = parser.parse_args()
    n=400
    index_e=0.8

######Constant Area#######
    # n        = args.n
    # index_e  = args.index
    r        = int(n ** index_e)
    B        = 1400            # or your B
    p        = 10           # or your p
    GLM_name = "Poisson"  # or your mode      
    folder = 'resultspart1'  
    xtest    = torch.load(f"{folder}/xtest10.pt")  # 预先生成并保存

    repeats = 100

    # Parallel 
    results = Parallel(n_jobs=20)(
        delayed(run_one_repeat)(i, n, r, B, p, GLM_name, f_1, xtest)
        for i in range(repeats)
    )

    # 解包并 stack
    Bf0_tensor = torch.stack([res[0] for res in results])  # [100, ntest]
    Bf1_tensor = torch.stack([res[1] for res in results])
    Bf2_tensor = torch.stack([res[2] for res in results])

    # 保存到 CSV
    df_bf0 = pd.DataFrame(Bf0_tensor.numpy())
    df_bf1 = pd.DataFrame(Bf1_tensor.numpy())
    df_bf2 = pd.DataFrame(Bf2_tensor.numpy())

    fn0 = f"{folder}/{GLM_name}fBf1n{n}p{p}B{B}r{r}.csv"
    fn1 = f"{folder}/{GLM_name}sdf1nn{n}p{p}B{B}r{r}.csv"
    fn2 = f"{folder}/{GLM_name}sdcrtf1nn{n}p{p}B{B}r{r}.csv"
    df_bf0.to_csv(fn0, index=False, header=False)
    df_bf1.to_csv(fn1, index=False, header=False)
    df_bf2.to_csv(fn2, index=False, header=False)

    print(f"Done n={n}, index={index_e}, saved {fn0}, {fn1},{fn2}")

