In [4]:
import io
import logging
import os
import random
import warnings

import joblib
import numpy as np
import pandas as pd
import torch
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from torch_geometric.data import InMemoryDataset, Data, DataLoader


# Central settings shared by the data-preparation functions in this file.
CONFIG = dict(
    data_path=r'C:\Users\amssa\Documents\Codes\1-6\data1-6.parquet',  # Original data file
    processed_dir='./processed',            # handed to the dataset as its root directory
    processed_file='./processed/data.pt',   # not referenced by the code visible in this file
    batch_size=1024,                        # graphs per DataLoader batch
    random_seed=42,                         # seeds the RNGs and the DataFrame shuffle
    distance_threshold=25,                  # neighbour-search radius used to create edges
    scalers_path='scalers.pkl',             # where the fitted feature scalers are dumped
)


def setup_logging():
    """Configure the root logger: INFO level, timestamped lines, one stream handler."""
    stream_handler = logging.StreamHandler()
    logging.basicConfig(
        format='%(asctime)s [%(levelname)s] %(message)s',
        level=logging.INFO,
        handlers=[stream_handler],
    )

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators with ``seed``."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # The CUDA generator is only touched when a GPU is actually present.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)

# Module-level side effects: configure logging and seed all RNGs as soon as
# this file (or notebook cell) is executed, before any data is processed.
setup_logging()
set_seed(CONFIG['random_seed'])


class SpinSystemDataset(InMemoryDataset):
    """In-memory graph dataset with one graph per row of a DataFrame.

    Each row is expected to provide the columns ``Nx``, ``x_spacing``,
    ``y_spacing``, ``Top_50_Indices``, ``Top_50_Probabilities``, ``N_A``,
    ``Von_Neumann_Entropy``, ``Omega``, ``Delta`` and ``Energy``.

    Per graph:
      * ``x``          -- ``[N, 4]`` node features: normalized x, normalized y,
                          per-site excitation probability, one-hot ``N_A`` marker.
      * ``edge_index`` -- ``[2, E]`` pairs ``i < j`` of sites whose distance is
                          within ``CONFIG['distance_threshold']``.
      * ``edge_attr``  -- ``[E, 2]``: ``1 / (r**6 + 1e-8)`` and the pair
                          correlation ``P(i and j) - P(i) * P(j)``.
      * ``y``          -- ``log(Von_Neumann_Entropy + 1e-9)``.
      * ``Omega``, ``Delta``, ``Energy`` -- ``[1, 1]`` graph-level tensors.

    The processed result is cached at ``self.processed_paths[0]``; an existing
    cache is loaded as-is, whatever ``dataframe`` is passed in.
    """

    def __init__(self, dataframe, root='.', transform=None, pre_transform=None):
        """Attach ``dataframe``, build the cache if needed, and load it.

        Args:
            dataframe: pandas DataFrame with one spin system per row.
            root: dataset root directory handed to ``InMemoryDataset``.
            transform: optional per-access transform (forwarded to the base class).
            pre_transform: optional transform applied once inside ``process``.
        """
        # process() reads self.df and is triggered from inside the base-class
        # constructor when no cache exists, so the frame must be set first.
        self.df = dataframe
        super().__init__(root, transform, pre_transform)

        processed_path = self.processed_paths[0]
        if not os.path.exists(processed_path):
            # Safety net only: the base constructor normally has produced the
            # file already. The previous code processed here but never loaded.
            self.process()

        # The cache holds pickled torch_geometric objects, so full unpickling
        # is required. Only load cache files that this class wrote itself:
        # unpickling untrusted files can execute arbitrary code.
        self.data, self.slices = torch.load(processed_path, weights_only=False)

    @property
    def raw_file_names(self):
        """No raw files: the data comes from the in-memory DataFrame."""
        return []

    @property
    def processed_file_names(self):
        """Name of the single cache file inside the processed directory."""
        return ['data.pt']

    def download(self):
        """Nothing to download."""
        pass

    def process(self):
        """Turn every row of ``self.df`` into a graph and save the collated set."""
        data_list = [self._row_to_graph(idx, row) for idx, row in self.df.iterrows()]

        if self.pre_transform:
            data_list = [self.pre_transform(d) for d in data_list]

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

    @staticmethod
    def _occupation_statistics(state_indices, state_probs, num_sites):
        """Return single-site and pairwise excitation probabilities.

        Bit ``i`` of a state index marks site ``i`` as excited. The result is
        ``(p_site, p_pair)`` with ``p_site[i]`` the summed probability of all
        listed states in which bit ``i`` is set, and ``p_pair[i, j]`` the
        summed probability of states in which bits ``i`` and ``j`` are both
        set. Both are float64 NumPy arrays of shape ``[N]`` and ``[N, N]``.

        State indices are handled as int64, so this supports fewer than 63 sites.
        """
        # Same pairing as zip(): surplus entries of the longer input are ignored.
        count = min(len(state_indices), len(state_probs))
        states = np.asarray(state_indices[:count], dtype=np.int64)
        probs = np.asarray(state_probs[:count], dtype=np.float64)

        # occupied[s, i] == 1.0 exactly when bit i of state s is set.
        occupied = ((states[:, None] >> np.arange(num_sites)) & 1).astype(np.float64)
        weighted = occupied * probs[:, None]
        return weighted.sum(axis=0), weighted.T @ occupied

    @staticmethod
    def _build_edge_index(coords, radius):
        """Return a ``[2, E]`` long tensor of site pairs ``i < j`` within ``radius``."""
        finder = NearestNeighbors(radius=radius, algorithm='ball_tree').fit(coords)
        neighbours = finder.radius_neighbors(coords, return_distance=False)
        pairs = [
            (i, int(j))
            for i, nbrs in enumerate(neighbours)
            for j in nbrs
            if i < j
        ]
        if not pairs:
            # torch.tensor([]) is 1-D, so .size(1) would raise; keep the
            # conventional empty [2, 0] shape instead.
            return torch.empty((2, 0), dtype=torch.long)
        return torch.tensor(pairs, dtype=torch.long).t().contiguous()

    def _row_to_graph(self, idx, row):
        """Build the ``Data`` object for one DataFrame row.

        Args:
            idx: row label, used only in log messages.
            row: the row (as produced by ``DataFrame.iterrows``).

        Raises:
            ValueError: if ``row['N_A']`` cannot be converted to ``int``.
        """
        Nx = int(row['Nx'])
        Ny = 2  # fixed by the data generation, per the original author's note
        N = Nx * Ny  # total number of spins

        # NOTE(review): the index running to Nx is scaled by y_spacing and the
        # 2-wide index by x_spacing -- confirm this matches the lattice
        # orientation (and bit ordering) used when the data was generated.
        x_spacing = row['x_spacing']
        y_spacing = row['y_spacing']
        positions = torch.tensor(
            np.array([
                (col * x_spacing, row_idx * y_spacing)
                for row_idx in range(Nx) for col in range(Ny)
            ]),
            dtype=torch.float,
        )

        # Min-max normalize each coordinate; the epsilon avoids 0/0 when all
        # sites share a coordinate value.
        pos_min = positions.min(dim=0).values
        pos_max = positions.max(dim=0).values
        normalized_positions = (positions - pos_min) / (pos_max - pos_min + 1e-8)

        p_site, p_pair = self._occupation_statistics(
            row['Top_50_Indices'], row['Top_50_Probabilities'], N
        )
        p_rydberg = torch.tensor(p_site, dtype=torch.float).unsqueeze(1)  # [N, 1]

        try:
            N_A_idx = int(row['N_A'])
        except ValueError:
            logging.error(f"Graph {idx}: N_A value '{row['N_A']}' is not an integer.")
            raise

        # One-hot marker on the node whose index equals N_A (if in range).
        N_A_feature = torch.zeros((N, 1), dtype=torch.float)
        if 0 <= N_A_idx < N:
            N_A_feature[N_A_idx] = 1.0
        else:
            logging.warning(f'Graph {idx}: N_A index {N_A_idx} out of range for {N} nodes.')

        # Node features: [x, y, p_rydberg, N_A] -> [N, 4]
        node_features = torch.cat([normalized_positions, p_rydberg, N_A_feature], dim=1)

        # Edges connect sites within the configured radius (un-normalized
        # positions), each pair stored once as (i, j) with i < j.
        edge_index = self._build_edge_index(positions.numpy(), CONFIG['distance_threshold'])
        logging.debug(
            'Graph %s: Created %d edges using distance threshold.',
            idx, edge_index.size(1),
        )

        # All of the following also works for E == 0 and yields [0, k] tensors.
        src, dst = edge_index[0], edge_index[1]
        distances = torch.norm(positions[src] - positions[dst], dim=1, keepdim=True)
        inv_r6 = 1.0 / (distances.pow(6) + 1e-8)  # [E, 1]

        # Correlation P(i and j) - P(i) P(j), evaluated in float64 and
        # rounded once when converted to a float32 tensor.
        src_np, dst_np = src.numpy(), dst.numpy()
        correlation = p_pair[src_np, dst_np] - p_site[src_np] * p_site[dst_np]
        edge_correlation = torch.tensor(correlation, dtype=torch.float).unsqueeze(1)  # [E, 1]

        edge_attr = torch.cat([inv_r6, edge_correlation], dim=1)  # [E, 2]

        # Target: log of the Von Neumann entropy (epsilon guards log(0)).
        entropy = torch.tensor([row['Von_Neumann_Entropy']], dtype=torch.float)
        entropy = torch.log(entropy + 1e-9)

        data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_attr, y=entropy)
        # Graph-level features, each stored as a [1, 1] tensor.
        data.Omega = torch.tensor([[row['Omega']]], dtype=torch.float)
        data.Delta = torch.tensor([[row['Delta']]], dtype=torch.float)
        data.Energy = torch.tensor([[row['Energy']]], dtype=torch.float)
        return data


def load_data(config):
    """Read the parquet file, shuffle its rows and wrap them in a SpinSystemDataset.

    Args:
        config: mapping providing ``'data_path'``, ``'random_seed'`` and
            ``'processed_dir'``.

    Returns:
        The ``SpinSystemDataset`` built from the shuffled DataFrame, rooted
        at ``config['processed_dir']``.

    Raises:
        FileNotFoundError: if ``config['data_path']`` does not exist.
    """
    warnings.filterwarnings("ignore", category=FutureWarning, module="torch")

    data_path = config['data_path']
    if not os.path.exists(data_path):
        logging.error(f"Data file not found at {data_path}")
        raise FileNotFoundError(f"Data file not found at {data_path}")

    df = pd.read_parquet(data_path)

    logging.info("First few rows of the dataset:")
    logging.info(df.head())
    logging.info("\nDataset Information:")
    # DataFrame.info() writes to stdout and returns None, so logging its
    # return value only logged "None". Capture the report and log that.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    logging.info(info_buffer.getvalue())

    # Deterministic shuffle so the positional split made later is random
    # but reproducible.
    df_shuffled = df.sample(frac=1, random_state=config['random_seed']).reset_index(drop=True)

    dataset = SpinSystemDataset(df_shuffled, root=config['processed_dir'])

    logging.info(f'\nTotal graphs in dataset: {len(dataset)}')
    if len(dataset) > 0:
        # Indexing an empty dataset would raise; only show a sample if one exists.
        logging.info('\nSample Data Object:')
        logging.info(dataset[0])

    return dataset

def split_dataset(dataset, config):
    """Cut ``dataset`` positionally into 80% train, 10% validation, 10% test.

    ``config`` is accepted for signature symmetry with the other pipeline
    steps but is not read. Returns ``(train, val, test)`` slices.
    """
    n_graphs = len(dataset)
    first_cut, second_cut = int(0.8 * n_graphs), int(0.9 * n_graphs)

    train_part = dataset[:first_cut]
    val_part = dataset[first_cut:second_cut]
    test_part = dataset[second_cut:]

    logging.info(f'\nTraining graphs: {len(train_part)}')
    logging.info(f'Validation graphs: {len(val_part)}')
    logging.info(f'Test graphs: {len(test_part)}')

    return train_part, val_part, test_part

def normalize_features(train_dataset, val_dataset, test_dataset, scalers_path):
    """Standardize the graph-level ``Omega``, ``Delta`` and ``Energy`` features.

    One ``StandardScaler`` per feature is fitted on the training graphs only,
    the dict of scalers is dumped to ``scalers_path`` with joblib, and the
    scalers are then applied to every graph of all three splits.

    Iterating a torch_geometric dataset hands out per-access copies of the
    stored graphs, so values set on them are not written back into the
    dataset. The scaled graphs are therefore collected and returned as plain
    lists (which ``DataLoader`` accepts); this holds every graph as its own
    object in memory.

    Args:
        train_dataset, val_dataset, test_dataset: iterables of graph objects
            carrying ``Omega``, ``Delta`` and ``Energy`` tensors.
        scalers_path: file path for the joblib dump of the fitted scalers.

    Returns:
        ``(train, val, test)`` lists of graphs with scaled features.

    Raises:
        AttributeError: if a training graph lacks one of the features.
    """
    features = ['Omega', 'Delta', 'Energy']
    scalers = {feature: StandardScaler() for feature in features}

    # Gather all three features in a single pass over the training graphs
    # instead of iterating the (large) dataset once per feature.
    collected = {feature: [] for feature in features}
    for data in train_dataset:
        for feature in features:
            try:
                collected[feature].append(getattr(data, feature))
            except AttributeError as e:
                logging.error(f"AttributeError while accessing feature '{feature}': {e}")
                raise

    for feature in features:
        scalers[feature].fit(torch.cat(collected[feature], dim=0).numpy())

    joblib.dump(scalers, scalers_path)
    logging.info(f"Scalers saved to '{scalers_path}'.")

    def normalize(dataset):
        """Return a list of the graphs in ``dataset`` with scaled features."""
        normalized = []
        for data in dataset:
            for feature in features:
                scaled = scalers[feature].transform(getattr(data, feature).numpy())
                setattr(data, feature, torch.tensor(scaled, dtype=torch.float))
            # Keep the modified object itself; re-reading it from the dataset
            # would return an unscaled copy.
            normalized.append(data)
        return normalized

    return normalize(train_dataset), normalize(val_dataset), normalize(test_dataset)

def create_dataloaders(train_dataset, val_dataset, test_dataset, config):
    """Wrap the three splits in DataLoaders of size ``config['batch_size']``.

    Only the training loader shuffles; validation and test keep their order.
    Returns ``(train_loader, val_loader, test_loader)``.
    """
    plan = (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
    train_loader, val_loader, test_loader = (
        DataLoader(split, batch_size=config['batch_size'], shuffle=reshuffle)
        for split, reshuffle in plan
    )
    return train_loader, val_loader, test_loader


def main():
    """Run the preparation pipeline end to end: load, split, scale, batch."""
    full_dataset = load_data(CONFIG)
    splits = split_dataset(full_dataset, CONFIG)
    scaled_splits = normalize_features(*splits, CONFIG['scalers_path'])

    # The loaders are constructed but neither used further nor returned here.
    create_dataloaders(*scaled_splits, CONFIG)

    logging.info("Data processing and loading completed successfully.")


# Run the pipeline only when this file is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


2024-12-06 17:43:37,897 [INFO] First few rows of the dataset:
2024-12-06 17:43:37,901 [INFO]    Nx      Delta      Omega  x_spacing  y_spacing      Energy  \
0   5  40.030606   9.988305   7.118764   6.387401 -173.629339   
1   5  49.178313  23.871382   7.439762   6.721230 -254.139080   
2   5   4.187528  47.495391   7.862528   7.233589 -196.238397   
3   5   1.782346  12.313797   4.964102   6.733054  -36.205582   
4   5  48.509647  38.981508   7.757996   7.579309 -356.018109   

                                      Top_50_Indices  \
0  [409, 614, 393, 582, 281, 401, 550, 610, 408, ...   
1  [409, 614, 403, 793, 611, 806, 401, 281, 610, ...   
2  [0, 2, 1, 256, 512, 32, 16, 257, 514, 258, 513...   
3  [0, 2, 512, 1, 256, 32, 16, 64, 128, 4, 8, 514...   
4  [871, 923, 819, 870, 615, 411, 921, 614, 409, ...   

                                Top_50_Probabilities  Von_Neumann_Entropy  N_A  
0  [0.4158781310204766, 0.4158781307441062, 0.017...             0.694156    5  
1  [0.15122562634

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 829318 entries, 0 to 829317
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Nx                    829318 non-null  int64  
 1   Delta                 829318 non-null  float64
 2   Omega                 829318 non-null  float64
 3   x_spacing             829318 non-null  float64
 4   y_spacing             829318 non-null  float64
 5   Energy                829318 non-null  float64
 6   Top_50_Indices        829318 non-null  object 
 7   Top_50_Probabilities  829318 non-null  object 
 8   Von_Neumann_Entropy   829318 non-null  float64
 9   N_A                   829318 non-null  int64  
dtypes: float64(6), int64(2), object(2)
memory usage: 63.3+ MB


Processing...
Done!
  self.data, self.slices = torch.load(self.processed_paths[0])
2024-12-06 18:16:19,166 [INFO] 
Total graphs in dataset: 829318
2024-12-06 18:16:19,167 [INFO] 
Sample Data Object:
2024-12-06 18:16:19,170 [INFO] Data(x=[8, 4], edge_index=[2, 28], edge_attr=[28, 2], y=[1], Omega=[1, 1], Delta=[1, 1], Energy=[1, 1])
2024-12-06 18:16:19,214 [INFO] 
Training graphs: 663454
2024-12-06 18:16:19,215 [INFO] Validation graphs: 82932
2024-12-06 18:16:19,216 [INFO] Test graphs: 82932
2024-12-06 18:18:18,769 [INFO] Scalers saved to 'scalers.pkl'.
2024-12-06 18:22:54,571 [INFO] Data processing and loading completed successfully.
