In [None]:
# New Features 


import os
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import InMemoryDataset, Data, DataLoader
from sklearn.neighbors import NearestNeighbors
import logging
import random
import warnings

# Pipeline-wide settings, read by the dataset class and the loader helpers.
CONFIG = dict(
    # Input parquet file with one spin-system sample per row.
    data_path=r'C:\Users\amssa\Documents\Codes\New\Von-Neumann-Entropy-GNN\Random sub sets\spin_system_properties_gpu1-7.parquet',
    # Root directory handed to the dataset, and the cached tensor file.
    processed_dir='./processed',
    processed_file='./processed/data.pt',
    # Graphs per mini-batch for all three loaders.
    batch_size=1024,
    # Seed used for RNG initialisation and for shuffling the DataFrame.
    random_seed=42,
    # Radius used when connecting sites into edges.
    distance_threshold=25,
)

def setup_logging():
    """Configure the root logger for INFO-level, timestamped stream output."""
    console_handler = logging.StreamHandler()
    log_format = '%(asctime)s [%(levelname)s] %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format, handlers=[console_handler])

def set_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch (plus CUDA when present).

    Args:
        seed: Integer seed passed unchanged to every generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing more to do on CPU-only machines.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)

# Module-level initialisation: runs once when this file is executed/imported,
# before any dataset is built, so logging is configured and RNGs are seeded.
setup_logging()
set_seed(CONFIG['random_seed'])

class SpinSystemDataset(InMemoryDataset):
    """In-memory graph dataset built from a DataFrame, one graph per row.

    For every row ``process`` creates a ``Data`` object holding:

    * ``x`` -- node features ``[N, 4]``: min-max normalised (x, y) position,
      per-site Rydberg probability, and a one-hot marker at index ``N_A``.
    * ``edge_index`` -- ``[2, E]`` pairs ``(i, j)`` with ``i < j`` whose sites
      lie within ``CONFIG['distance_threshold']`` of each other. Each pair is
      stored once, in one direction only.
    * ``edge_attr`` -- ``[E, 2]``: ``1 / (r**6 + 1e-8)`` and the pair
      correlation ``P(i and j) - P(i) * P(j)``.
    * ``y`` -- ``log(Von_Neumann_Entropy + 1e-9)``.
    * ``global_features`` -- ``[1, 9]`` summary statistics of the graph.

    Columns read from each row: ``Nx``, ``x_spacing``, ``y_spacing``,
    ``Top_50_Indices``, ``Top_50_Probabilities``, ``N_A`` and
    ``Von_Neumann_Entropy``.

    Args:
        dataframe: Source rows; iterated with ``DataFrame.iterrows``.
        root: Dataset root directory passed to ``InMemoryDataset``.
        transform: Optional transform forwarded to the base class.
        pre_transform: Optional transform applied to every graph before the
            collated result is saved.
    """

    def __init__(self, dataframe, root='.', transform=None, pre_transform=None):
        # The frame must be attached before the base constructor runs, because
        # process() reads it. NOTE(review): the base class presumably triggers
        # process() itself when the processed file is missing -- confirm
        # against the installed torch_geometric version.
        self.df = dataframe
        super(SpinSystemDataset, self).__init__(root, transform, pre_transform)
        # Fallback: build the cache if it still does not exist, then always
        # load it so ``data``/``slices`` are populated on every path.
        # NOTE: an already existing processed file is loaded as-is here; this
        # constructor does not compare it against ``dataframe``.
        if not os.path.exists(self.processed_paths[0]):
            self.process()
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """No raw files: the data comes from the in-memory DataFrame."""
        return []

    @property
    def processed_file_names(self):
        """Name of the single cached file inside the processed directory."""
        return ['data.pt']

    def download(self):
        """Nothing to download."""
        pass

    @staticmethod
    def _mean_std(values):
        """Return ``(mean, std)`` of a tensor as Python floats.

        The std is reported as 0.0 for a single element and both values are
        0.0 for an empty tensor, so callers never see NaN.
        """
        count = values.numel()
        if count == 0:
            return 0.0, 0.0
        mean = values.mean().item()
        std = values.std().item() if count > 1 else 0.0
        return mean, std

    def process(self):
        """Convert every DataFrame row into a graph and save the collated set.

        Raises:
            ValueError, TypeError: If a row's ``N_A`` cannot be converted to
                an integer (logged, then re-raised).
        """
        distance_threshold = CONFIG['distance_threshold']
        data_list = []
        for idx, row in self.df.iterrows():
            # int() guards against the value arriving as a float, which
            # range() below would reject.
            Nx = int(row['Nx'])
            Ny = 2
            N = Nx * Ny

            x_spacing = row['x_spacing']
            y_spacing = row['y_spacing']
            # NOTE(review): the x coordinate is driven by the Ny index and the
            # y coordinate by the Nx index -- confirm this orientation is the
            # intended lattice layout.
            positions = np.array([
                (col * x_spacing, row_idx * y_spacing)
                for row_idx in range(Nx) for col in range(Ny)
            ])
            positions = torch.tensor(positions, dtype=torch.float)

            # Min-max normalise each coordinate; the epsilon avoids 0/0 when
            # all sites share a coordinate.
            pos_min = positions.min(dim=0).values
            pos_max = positions.max(dim=0).values
            normalized_positions = (positions - pos_min) / (pos_max - pos_min + 1e-8)

            # Decode each basis state once: bit i set <=> site i is counted
            # as excited for that state.
            decoded_states = []
            for state, prob in zip(row['Top_50_Indices'], row['Top_50_Probabilities']):
                state = int(state)
                excited = [i for i in range(N) if state & (1 << i)]
                decoded_states.append((excited, prob))

            # Per-site probability: sum of the probabilities of all listed
            # states in which the site's bit is set.
            p_rydberg = [0.0] * N
            for excited, prob in decoded_states:
                for i in excited:
                    p_rydberg[i] += prob
            p_rydberg = torch.tensor(p_rydberg, dtype=torch.float).unsqueeze(1)  # [N, 1]

            try:
                N_A_idx = int(row['N_A'])
            except (TypeError, ValueError):
                logging.error(f"Graph {idx}: N_A value '{row['N_A']}' is not an integer.")
                raise
            N_A_feature = torch.zeros((N, 1), dtype=torch.float)
            if 0 <= N_A_idx < N:
                N_A_feature[N_A_idx] = 1.0
            else:
                logging.warning(f'Graph {idx}: N_A index {N_A_idx} out of range for {N} nodes.')

            # Node features: normalized_positions (2), p_rydberg (1), N_A_feature (1)
            node_features = torch.cat([normalized_positions, p_rydberg, N_A_feature], dim=1)  # [N, 4]

            # Radius graph on the un-normalised positions.
            positions_np = positions.numpy()
            nbrs = NearestNeighbors(radius=distance_threshold, algorithm='ball_tree').fit(positions_np)
            indices = nbrs.radius_neighbors(positions_np, return_distance=False)

            # Keep each pair once (i < j); this also drops self-matches.
            edge_pairs = [
                (i, int(j))
                for i in range(N)
                for j in indices[i]
                if i < j
            ]
            num_edges = len(edge_pairs)
            if num_edges > 0:
                edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
            else:
                # An empty list would otherwise yield a 1-D tensor, and every
                # later ``edge_index.size(1)`` / ``edge_index[0]`` would fail.
                edge_index = torch.empty((2, 0), dtype=torch.long)

            # Compute edge attributes (1/r^6)
            if num_edges > 0:
                pos_i = positions[edge_index[0]]
                pos_j = positions[edge_index[1]]
                distances = torch.norm(pos_i - pos_j, dim=1, keepdim=True)
                epsilon = 1e-8
                inv_r6 = 1.0 / (distances.pow(6) + epsilon)  # [E, 1]
            else:
                distances = torch.empty((0, 1), dtype=torch.float)
                inv_r6 = torch.empty((0, 1), dtype=torch.float)

            # Joint probability per edge: total probability of the states in
            # which both endpoints are excited.
            edge_joint_probs = {edge: 0.0 for edge in edge_pairs}
            for excited, prob in decoded_states:
                for i_ in excited:
                    for j_ in excited:
                        if i_ < j_ and (i_, j_) in edge_joint_probs:
                            edge_joint_probs[(i_, j_)] += prob

            # Correlation = joint probability minus product of the marginals.
            edge_correlation = torch.tensor(
                [
                    [edge_joint_probs[(i_, j_)] - p_rydberg[i_].item() * p_rydberg[j_].item()]
                    for i_, j_ in edge_pairs
                ],
                dtype=torch.float,
            ).reshape(num_edges, 1)  # [E, 1], also when E == 0

            if num_edges > 0:
                edge_attr = torch.cat([inv_r6, edge_correlation], dim=1)  # [E, 2]
            else:
                edge_attr = torch.empty((0, 2), dtype=torch.float)

            # Von Neumann Entropy (target), log-transformed; the epsilon keeps
            # log() finite for a zero entropy.
            original_entropy = row['Von_Neumann_Entropy']
            entropy = torch.tensor([np.log(original_entropy + 1e-9)], dtype=torch.float)

            # Graph-level summary statistics.
            p_ryd_mean = p_rydberg.mean().item()
            p_ryd_std = p_rydberg.std().item() if p_rydberg.numel() > 1 else 0.0

            edge_corr_mean, edge_corr_std = self._mean_std(edge_correlation)

            # Average node degree, counting every stored pair at both ends.
            if num_edges > 0:
                degrees = torch.bincount(torch.cat([edge_index[0], edge_index[1]]), minlength=N)
                avg_degree = degrees.float().mean().item()
            else:
                avg_degree = 0.0

            # Mean and std of distances between connected nodes.
            pos_dist_mean, pos_dist_std = self._mean_std(distances)

            # Nx, Ny are part of experimental setup (system size)
            global_features = torch.tensor([
                Nx, Ny,
                p_ryd_mean, p_ryd_std,
                edge_corr_mean, edge_corr_std,
                avg_degree,
                pos_dist_mean, pos_dist_std
            ], dtype=torch.float).unsqueeze(0)  # [1, 9]

            data = Data(
                x=node_features,
                edge_index=edge_index,
                edge_attr=edge_attr,
                y=entropy
            )
            data.global_features = global_features

            data_list.append(data)

        if self.pre_transform:
            data_list = [self.pre_transform(d) for d in data_list]

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

def load_data(config):
    """Read the parquet file, shuffle it and wrap it in a SpinSystemDataset.

    Args:
        config: Mapping with ``data_path`` (parquet file), ``random_seed``
            (used to shuffle the rows) and ``processed_dir`` (dataset root).

    Returns:
        The constructed ``SpinSystemDataset``.

    Raises:
        FileNotFoundError: If ``config['data_path']`` does not exist.
    """
    # Local import: only needed to capture the text produced by DataFrame.info().
    import io

    # Side effect: silences torch FutureWarnings for the rest of the process.
    warnings.filterwarnings("ignore", category=FutureWarning, module="torch")
    if not os.path.exists(config['data_path']):
        logging.error(f"Data file not found at {config['data_path']}")
        raise FileNotFoundError(f"Data file not found at {config['data_path']}")

    df = pd.read_parquet(config['data_path'])

    logging.info("First few rows of the dataset:")
    logging.info(df.head())
    logging.info("\nDataset Information:")
    # DataFrame.info() writes to a buffer and returns None, so logging its
    # return value would only emit "None"; capture the text and log that.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    logging.info(info_buffer.getvalue())

    # Shuffle once, reproducibly, so the later slice-based split is random.
    df_shuffled = df.sample(frac=1, random_state=config['random_seed']).reset_index(drop=True)
    dataset = SpinSystemDataset(df_shuffled, root=config['processed_dir'])

    logging.info(f'\nTotal graphs in dataset: {len(dataset)}')
    logging.info(f'\nSample Data Object:')
    logging.info(dataset[0])

    return dataset

def split_dataset(dataset, config):
    """Slice ``dataset`` into consecutive 80% / 10% / 10% parts.

    Args:
        dataset: Any object supporting ``len`` and slicing.
        config: Accepted for signature symmetry; not read here.

    Returns:
        Tuple ``(train_dataset, val_dataset, test_dataset)``.
    """
    n_graphs = len(dataset)
    first_cut, second_cut = (int(fraction * n_graphs) for fraction in (0.8, 0.9))

    train_part = dataset[:first_cut]
    val_part = dataset[first_cut:second_cut]
    test_part = dataset[second_cut:]

    logging.info(f'\nTraining graphs: {len(train_part)}')
    logging.info(f'Validation graphs: {len(val_part)}')
    logging.info(f'Test graphs: {len(test_part)}')
    return train_part, val_part, test_part

def create_dataloaders(train_dataset, val_dataset, test_dataset, config):
    """Wrap the three splits in DataLoaders using ``config['batch_size']``.

    Only the training loader shuffles; validation and test keep their order.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    def build_loader(split, reshuffle):
        return DataLoader(split, batch_size=config['batch_size'], shuffle=reshuffle)

    train_loader = build_loader(train_dataset, True)
    val_loader = build_loader(val_dataset, False)
    test_loader = build_loader(test_dataset, False)
    return train_loader, val_loader, test_loader

def main():
    """Run the pipeline: load the dataset, split it, and build the loaders."""
    full_dataset = load_data(CONFIG)
    train_split, val_split, test_split = split_dataset(full_dataset, CONFIG)
    # The loaders are built but not used further in this function.
    train_loader, val_loader, test_loader = create_dataloaders(
        train_split, val_split, test_split, CONFIG
    )

    logging.info("Data processing and loading completed successfully.")

# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()


2024-12-24 14:32:05,686 [INFO] First few rows of the dataset:
2024-12-24 14:32:05,700 [INFO]    Nx      Delta      Omega  x_spacing  y_spacing      Energy  \
0   7  40.030606   9.988305   7.118764   6.387401 -239.252283   
1   7  23.347429  39.473622   4.798695   6.056938 -221.844105   
2   7   9.494868  20.161970   4.728944   7.021446 -105.435782   
3   7   5.336133  10.603160   4.180909   5.301321  -44.204418   
4   7  16.238134  16.933983   6.918425   6.550230 -122.907435   

                                      Top_50_Indices  \
0  [6553, 9830, 6425, 9766, 6297, 6537, 9798, 931...   
1  [0, 1, 8192, 4096, 2, 4097, 8194, 8193, 4098, ...   
2  [0, 8192, 2, 1, 4096, 8194, 4097, 8193, 4098, ...   
3  [0, 1, 4096, 8192, 2, 8193, 4098, 4097, 8194, ...   
4  [8738, 4369, 8802, 4497, 9762, 8742, 4377, 641...   

                                Top_50_Probabilities  Von_Neumann_Entropy  \
0  [0.38220482824683005, 0.38220477448728546, 0.0...             0.666110   
1  [0.007107020650037715,

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500000 entries, 0 to 1499999
Data columns (total 11 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   Nx                    1500000 non-null  int64  
 1   Delta                 1500000 non-null  float64
 2   Omega                 1500000 non-null  float64
 3   x_spacing             1500000 non-null  float64
 4   y_spacing             1500000 non-null  float64
 5   Energy                1500000 non-null  float64
 6   Top_50_Indices        1500000 non-null  object 
 7   Top_50_Probabilities  1500000 non-null  object 
 8   Von_Neumann_Entropy   1500000 non-null  float64
 9   N_A                   1500000 non-null  int64  
 10  Subsystem_Mask        1500000 non-null  object 
dtypes: float64(6), int64(2), object(3)
memory usage: 125.9+ MB


Processing...
