In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

import os

import shutil
from collections import Counter
import numpy as np
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt



class SampleSet:
    """A data set (X, Y, ids) plus repeated random train/validation sub-samples of it.

    Attributes:
        index:    exponent used to size each sub-sample (see ``r``).
        n, p:     ``X.shape[0]`` and ``X.shape[1]``.
        X, Y, id: the objects passed to the constructor, stored as-is.
        module:   the ``module`` label passed to the constructor, stored as-is;
                  this class does not interpret it.
        r:        sub-sample size, ``int(n ** index)``.
        B:        number of sub-samples requested by the last call to
                  ``get_sub_samples_with_validation`` (None before any call).
        subtrain, subval, counts: results of that last call (None before any call).
    """

    def __init__(self, index,X, Y,id,module='Bernoulli'):
        """
        Stores an existing sample (X, Y) together with its identifiers.

        Args:
            index:  exponent that sets the sub-sample size r = int(n ** index).
            X:      2-D feature container exposing ``.shape``; it must also accept
                    tensor indexing if sub-samples are going to be drawn from it.
            Y:      targets aligned with the rows of X.
            id:     identifiers aligned with the rows of X (kept, never indexed here).
            module: free-form label; kept on the instance but not used by this class.
        """
        self.index=index
        self.n=X.shape[0]
        self.p=X.shape[1]
        self.X = X
        self.Y = Y
        self.id=id
        self.module=module
        self.subtrain=None
        self.subval=None
        self.counts=None
        self.B=None
        self.r=int(self.n**index)

    def get_sample_set(self):
        """Returns the main sample set (X, Y)."""
        return self.X, self.Y

    def get_sub_samples_with_validation(self, B):
        """
        Draws B sub-samples of r rows each (without replacement inside a draw) and
        pairs each one with a validation set made of all rows that were not drawn.

        The draws use torch's global random generator, so results are only
        reproducible if the caller seeds it beforehand.

        Args:
            B: number of sub-samples to draw.

        Returns:
            train_samples: list of B tuples (train_X, train_Y, train_indices).
            validation_samples: list of B tuples (val_X, val_Y, val_indices), with
                val_indices in ascending order.
            selection_counts: dict mapping a row index to the number of sub-samples
                it was drawn into; indices that were never drawn are absent.

        The two lists and the counts are also stored on the instance as
        ``subtrain``, ``subval`` and ``counts``.
        """
        train_samples = []
        validation_samples = []
        selection_counts = Counter()  # To track appearances of each index
        indices = torch.arange(self.n)
        self.B=B
        for _ in range(B):
            # Randomly select r unique indices for the sub-sample
            selected_indices = indices[torch.randperm(self.n)[:self.r]]

            # Update selection count for each index
            selection_counts.update(selected_indices.tolist())

            # Validation rows are the complement of the drawn rows. A boolean mask
            # yields them in one vectorised step and stays an integer index tensor
            # even when nothing is left over (r == n).
            held_out = torch.ones(self.n, dtype=torch.bool)
            held_out[selected_indices] = False
            val_indices = indices[held_out]

            # Separate sub-sample and validation sets, including original indices
            X_sub = self.X[selected_indices]
            Y_sub = self.Y[selected_indices]
            X_val = self.X[val_indices]
            Y_val = self.Y[val_indices]

            # Append to train_samples and validation_samples lists
            train_samples.append((X_sub, Y_sub, selected_indices))
            validation_samples.append((X_val, Y_val, val_indices))
        self.subtrain=train_samples
        self.subval=validation_samples
        self.counts=dict(selection_counts)
        return train_samples, validation_samples, dict(selection_counts)

    def save(self, file_path):
        """Saves the SampleSet instance to a file, creating parent directories if needed."""
        directory = os.path.dirname(file_path)
        # A bare file name has no directory component, and os.makedirs('') raises.
        if directory:
            os.makedirs(directory, exist_ok=True)
        torch.save(self, file_path)

    @staticmethod
    def load(file_path):
        """Loads a SampleSet instance from a file."""
        # NOTE(review): weights_only=False performs a full unpickle, which can
        # execute arbitrary code. Only load files produced by a trusted save().
        return torch.load(file_path,weights_only=False)


# Read the transformed table and keep the stay identifiers aside so they can be
# carried alongside the features without being used as one.
df = pd.read_csv("final_transformed_data_188.csv")
patient_ids = df["patientunitstayid"]

# Add the target column "Y" (unitvisitnumber cast to float) and drop the
# identifier column from the working frame.
df = df.assign(Y=df["unitvisitnumber"].astype(float)).drop(columns=["patientunitstayid"])

# Features: every remaining column except the raw visit number and the target.
X = df.drop(columns=["unitvisitnumber", "Y"])
# Target array: Y shifted down by one (i.e. unitvisitnumber - 1).
y = df["Y"].to_numpy() - 1


In [2]:
from sklearn.model_selection import StratifiedKFold

# Stratified 5-fold splitter; shuffling uses a fixed seed so the folds are reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One (X_train, X_test, y_train, y_test, id_train, id_test) tuple per fold.
splits = []

# Perform 5-fold cross-validation
for train_idx, test_idx in skf.split(X, y):
    # train_idx / test_idx are positional row numbers, so every container is
    # indexed by position. patient_ids is a Series: plain [] would treat the
    # numbers as index labels, which matches positions only while the index is
    # the default RangeIndex.
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    id_train, id_test = patient_ids.iloc[train_idx], patient_ids.iloc[test_idx]

    # Append the split to the list
    splits.append((X_train, X_test, y_train, y_test, id_train, id_test))

# Now `splits` contains 5 tuples, each with (X_train, X_test, y_train, y_test, id_train, id_test)





# Number of sub-samples to draw for each training fold.
B=3000

# Unpack fold 0 into the working names and wrap its (unscaled) training part
# in a SampleSet.
X_train, X_test, y_train, y_test, id_train, id_test = splits[0]
sample_set = SampleSet(0.9, X_train, y_train, id_train, module='Bernoulli')

# Column positions (0-based) within X. The "continuous" columns are the ones
# standardised per fold; the "categorical" ones are passed through unchanged.
continuous_cols = [0, *range(7, 13), *range(26, 28)]
categorical_cols = [37]

# Total number of selected features; used in the output file names.
p = len(continuous_cols) + len(categorical_cols)

# Base seed for the sub-sample draws; fold i is seeded with SUBSAMPLE_SEED + i so
# each fold's saved sub-samples can be regenerated independently of the others.
SUBSAMPLE_SEED = 42

# For every fold: standardise the continuous columns (statistics from the
# training part only), build tensors, draw the training sub-samples and save
# one training and one testing SampleSet.
for i, (X_train, X_test, y_train, y_test, id_train, id_test) in enumerate(splits):
    # Split features
    X_train_cont = X_train.iloc[:, continuous_cols]
    X_train_cat = X_train.iloc[:, categorical_cols]

    X_test_cont = X_test.iloc[:, continuous_cols]
    X_test_cat = X_test.iloc[:, categorical_cols]

    # Fit the scaler on the training part only, so nothing from the test part
    # leaks into the scaling statistics
    scaler = StandardScaler()
    X_train_cont_scaled = scaler.fit_transform(X_train_cont)

    # Apply same transformation to test data
    X_test_cont_scaled = scaler.transform(X_test_cont)

    # Combine scaled continuous + original categorical features
    X_train_processed = np.hstack([X_train_cont_scaled, X_train_cat])
    X_test_processed = np.hstack([X_test_cont_scaled, X_test_cat])

    # Convert to float32 tensors
    X_train_tensor = torch.tensor(X_train_processed, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test_processed, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

    # Create and save training SampleSet
    sample_set_train = SampleSet(0.9, X_train_tensor, y_train_tensor, id_train, module='Bernoulli')
    # The sub-sample draws come from torch's global RNG (torch.randperm without a
    # generator); seeding here makes the saved file reproducible across runs.
    # Side effect: this resets the global torch RNG state for whatever runs next.
    torch.manual_seed(SUBSAMPLE_SEED + i)
    sample_set_train.get_sub_samples_with_validation(B)
    sample_set_train.save(f'sampleset/188/sampleset{p}poisson{B}_trainfolder{i}.pth')

    # Create and save testing SampleSet (no sub-samples are drawn for it)
    sample_set_test = SampleSet(0.9, X_test_tensor, y_test_tensor, id_test, module='Bernoulli')
    sample_set_test.save(f'sampleset/188/sampleset{p}poisson{B}_testfolder{i}.pth')





