In [None]:
import os
import pandas as pd
import numpy as np
import glob
import torch
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torchvision import transforms, utils, transforms
import matplotlib.pyplot as plt

In [None]:
plt.rcdefaults()

In [None]:
# Plotting function

def plot_spectrogram(spec, title, time, frequency, start_longest_mode=None, end_longest_mode=None):
    fig, ax = plt.subplots()
    ax.imshow(spec.T, extent=(time[0], time[-1], frequency[0], frequency[-1]), aspect='auto', cmap='jet',
              origin='lower')
    ax.set_xlim(time[0], time[-1])
    ax.set_ylim(frequency[0], frequency[-1])
    ax.set_xlabel("Time [s]")
    ax.set_ylabel("Frequency [Hz]")
    fig.set_dpi(150)
    plt.title(title)

    # Plot vertical lines if start and end times are provided
    if start_longest_mode is not None and end_longest_mode is not None:
        ax.axvline(x=start_longest_mode, color='blue', linestyle='--', label='Start of Longest Mode')
        ax.axvline(x=end_longest_mode, color='darkblue', linestyle='--', label='End of Longest Mode')
        plt.legend(loc='upper left')

    plt.show()

In [None]:
DATA_PATH = "data/dataset_pickle"
FILE_EXT = "pickle"

def load_shot(shotno, data_path, file_ext):
    file_path = os.path.join(data_path, f"{shotno}.{file_ext}")
    return pd.read_pickle(file_path)

class SpectrogramDataset(Dataset):
    def __init__(self, data_path, file_ext, window_size, transform=None, stride = 0):
        # data loading
        self.data_path = data_path
        self.file_ext = file_ext
        self.window_size = window_size
        self.transform = transform
        self.stride = stride

        # Obtain all shot numbers
        self.data_files = [int(os.path.basename(x.split(f".{file_ext}")[0]))
                          for x in glob.glob(os.path.join(data_path, f"*.{file_ext}"))]

    def __len__(self):
        return len(self.data_files) # Returns the # of files
    
    """
    The get method describes how a single experiment is handeled, whereas the dataloader will make sure
    that the batching is done correctly!
    
    """

    def __getitem__(self, idx):
        shotno = self.data_files[idx]

        # Load data for the experiment
        data_shot = load_shot(shotno, self.data_path, self.file_ext)

        # Extract inputs
        spec_even = data_shot["x"]["spectrogram"]["EvenN"]
        spec_odd = data_shot["x"]["spectrogram"]["OddN"]
        frequency = data_shot["x"]["spectrogram"]["frequency"]
        time = data_shot["x"]["spectrogram"]["time"]

        # Calculate the number of windows based on window size
        num_windows = int(np.floor((spec_even.shape[0] - self.window_size) / (self.window_size * (1 - self.overlap_factor)))) 
        print(len(t), num_windows)

        # Extract windows along with their start and end indices
        windows = []
        for i in range(num_windows):
            start_idx = int(i * self.window_size * (1 - self.overlap_factor))
            end_idx = start_idx + self.window_size

            window_even = spec_even[:, start_idx:end_idx]
            window_odd = spec_odd[:, start_idx:end_idx]

            if self.transform:
                window_even = self.transform(window_even)
                window_odd = self.transform(window_odd)

            windows.append({
                'window_even': window_even,
                'window_odd': window_odd,
                'frequency': f,
                'time': t[start_idx:end_idx],
                'start_idx': start_idx,
                'end_idx': end_idx,
                'shotno': shotno
            })

        return windows
    
    

# Example usage:
# Define hyperparameters
DATA_PATH = "data/dataset_pickle"
FILE_EXT = "pickle"
window_size = 64  # Length of the window
batch_size = 32 # Number of experiments (shots) to include in each batch
stride = 0.5 # Factor that determines the overlap when selecting sliding windows

# Create dataset and dataloader
#transform = transforms.Compose([
    # Add any additional transformations you need
#])

dataset = SpectrogramDataset(data_path = DATA_PATH, file_ext = FILE_EXT, window_size = window_size, transform=None, stride = stride)
dataloader = DataLoader(dataset = dataset, batch_size=batch_size, shuffle=True, num_workers=0)

In [None]:
class SpectrogramDataset(Dataset):
    def __init__(self, data_path, file_ext, window_size, transform=None, overlap_factor=0):
        self.data_path = data_path
        self.file_ext = file_ext
        self.window_size = window_size
        self.transform = transform
        self.overlap_factor = overlap_factor

        self.all_shots = [int(os.path.basename(x.split(f".{file_ext}")[0]))
                          for x in glob.glob(os.path.join(data_path, f"*.{file_ext}"))]

    def __len__(self):
        return sum(len(self.get_windows(shotno)) for shotno in self.all_shots)

    def get_windows(self, shotno):
        data_shot = load_shot(shotno, self.data_path, self.file_ext)

        inputs = data_shot["x"]["spectrogram"]
        spec_even = inputs["EvenN"]
        spec_odd = inputs["OddN"]
        f = inputs["frequency"]
        t = inputs["time"]

        num_windows = int(np.floor(spec_even.shape[1] / (self.window_size * (1 - self.overlap_factor))))

        windows = []
        for i in range(num_windows):
            start_idx = int(i * self.window_size * (1 - self.overlap_factor))
            end_idx = start_idx + self.window_size

            window_even = spec_even[:, start_idx:end_idx]
            window_odd = spec_odd[:, start_idx:end_idx]

            if self.transform:
                window_even = self.transform(window_even)
                window_odd = self.transform(window_odd)

            windows.append({
                'window_even': window_even,
                'window_odd': window_odd,
                'frequency': f,
                'time': t[start_idx:end_idx],
                'start_idx': start_idx,
                'end_idx': end_idx,
                'shotno': shotno
            })

        return windows

    def __getitem__(self, idx):
        current_idx = 0
        for shotno in self.all_shots:
            windows = self.get_windows(shotno)
            if current_idx + len(windows) > idx:
                return windows[idx - current_idx]
            current_idx += len(windows)

# Example usage
window_size = 64
overlap_factor = 0.5
batch_size = 1  # Use a batch size of 1

dataset = SpectrogramDataset(DATA_PATH, FILE_EXT, window_size, overlap_factor=overlap_factor)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=0)



for sample in dataset:
    print(sample['start_idx'], sample['end_idx'])
    print(len(sample['time']))
    print(sample['shotno'])
    print(sample['window_even'])
    
    
#for idx in range(len(dataset)):
#    sample = dataset[idx]
#    print(sample['window_even'].shape)
#    print(sample['start_idx'], sample['end_idx'])
#    print(sample['time'])
#    print(sample['shotno'])

In [None]:
all_shots = [int(os.path.basename(x.split(f".{FILE_EXT}")[0]))
             for x in glob.glob(os.path.join(DATA_PATH, f"*.{FILE_EXT}"))]

In [None]:
DATA_PATH = "data/dataset_pickle"
FILE_EXT = "pickle"

def load_shot(shotno, data_path, file_ext):
    file_path = os.path.join(data_path, f"{shotno}.{file_ext}")
    return pd.read_pickle(file_path)

class SpectrogramDataset(Dataset):
    def __init__(self, data_path, file_ext, window_size, transform=None, sampler = None):
        # data loading
        self.data_path = data_path
        self.file_ext = file_ext
        self.window_size = window_size
        self.transform = transform
        #self.stride = stride

        # Obtain all shot numbers
        self.data_files = [int(os.path.basename(x.split(f".{file_ext}")[0]))
                          for x in glob.glob(os.path.join(data_path, f"*.{file_ext}"))]
        # Use custom sampler if provided
        self.sampler = sampler

    def __len__(self):
        return len(self.data_files) # Returns the # of files
    
    """
    The get method describes how a single experiment is handeled, whereas the dataloader will make sure
    that the batching is done correctly!
    
    """

    def __getitem__(self, idx):
        shotno = self.data_files[idx]

        # Load data for the experiment
        data_shot = load_shot(shotno, self.data_path, self.file_ext)

        # Extract inputs
        spec_even = torch.tensor(data_shot["x"]["spectrogram"]["EvenN"], dtype=torch.float32).T # --> [frequency, time]
        spec_odd = torch.tensor(data_shot["x"]["spectrogram"]["OddN"], dtype=torch.float32).T
        frequency = data_shot["x"]["spectrogram"]["frequency"]
        time = data_shot["x"]["spectrogram"]["time"]
        
        num_windows = len(time) // self.window_size

        windows = []
        for i in range(num_windows):
            start_idx = i*window_size
            end_idx = start_idx + self.window_size

            window_even = spec_even[:, start_idx:end_idx]
            window_odd = spec_odd[:, start_idx:end_idx]
            print(window_even.shape, window_odd.shape)
            
            if self.transform:
                window_even = self.transform(window_even)
                window_odd = self.transform(window_odd)

            windows.append({
                'window_even': window_even,
                'window_odd': window_odd,
                'frequency': frequency,
                'time': time[start_idx:end_idx],
                'start_idx': start_idx,
                'end_idx': end_idx,
                'shotno': shotno
            })

        return windows
    
    
# Example usage
DATA_PATH = "data/dataset_pickle"
FILE_EXT = "pickle"
WINDOW_SIZE = 64  # Adjust as needed
#STRIDE = 32  # Adjust as needed

custom_sampler = SubsetRandomSampler(all_shots)

dataset = SpectrogramDataset(data_path = DATA_PATH, file_ext = FILE_EXT, window_size = WINDOW_SIZE, sampler = None)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Iterate through the DataLoader
for idx, (batch_even, batch_odd) in enumerate(dataloader):
    # Your training/inference code here
    print(f"Batch {idx + 1} - EvenN shape: {len(batch_even)} windows")
    print(f"Batch {idx + 1} - OddN shape: {len(batch_odd)} windows")

In [5]:
def __getitem__(self, idx):
    shotno = self.data_files[idx]

    # Load data for the experiment
    data_shot = load_shot(shotno, self.data_path, self.file_ext)

    # Extract inputs
    spec_even = torch.tensor(data_shot["x"]["spectrogram"]["EvenN"], dtype=torch.float32).T  # --> [frequency, time]
    spec_odd = torch.tensor(data_shot["x"]["spectrogram"]["OddN"], dtype=torch.float32).T
    frequency = data_shot["x"]["spectrogram"]["frequency"]
    time = data_shot["x"]["spectrogram"]["time"]

    num_windows = len(time) // self.window_size

    # Print experiment and window information for debugging
    print(f"Experiment: {shotno}, Total Windows: {num_windows}, Window Size: {self.window_size}")

    # Apply non-overlapping sliding window for EvenN
    windows_even = []
    for i in range(0, num_windows * self.window_size, self.window_size):
        start_idx = i
        end_idx = i + self.window_size

        slice_data = spec_even[start_idx:end_idx, :]
        print(f"EvenN - Window Size: {slice_data.shape}")

        windows_even.append({
            'window_even': slice_data,
            'frequency': frequency,
            'time': time[start_idx:end_idx],
            'start_idx': start_idx,
            'end_idx': end_idx,
            'shotno': shotno
        })

    # Apply non-overlapping sliding window for OddN
    windows_odd = []
    for i in range(0, num_windows * self.window_size, self.window_size):
        start_idx = i
        end_idx = i + self.window_size

        slice_data = spec_odd[start_idx:end_idx, :]
        print(f"OddN - Window Size: {slice_data.shape}")

        windows_odd.append({
            'window_odd': slice_data,
            'frequency': frequency,
            'time': time[start_idx:end_idx],
            'start_idx': start_idx,
            'end_idx': end_idx,
            'shotno': shotno
        })

    return windows_even, windows_odd


# Example usage
DATA_PATH = "data/dataset_pickle"
FILE_EXT = "pickle"
WINDOW_SIZE = 64  # Adjust as needed

all_shots = [int(os.path.basename(x.split(f".{FILE_EXT}")[0]))
             for x in glob.glob(os.path.join(DATA_PATH, f"*.{FILE_EXT}"))]
# Set batch_size to the desired number of windows per batch
batch_size = 32

# Create the DataLoader with RandomSampler and BatchSampler
dataset = SpectrogramDataset(data_path=DATA_PATH, file_ext=FILE_EXT, window_size=WINDOW_SIZE)
sampler = RandomSampler(dataset)
batch_sampler = BatchSampler(sampler, batch_size=batch_size, drop_last=True)  # Set drop_last to True if you want to drop the last batch if it's smaller than batch_size
dataloader = DataLoader(dataset, batch_sampler=batch_sampler)

# Iterate through the DataLoader
for idx, (batch_even, batch_odd) in enumerate(dataloader):
    # Your training/inference code here
    print(f"Batch {idx + 1} - EvenN shape: {len(batch_even)} windows")
    print(f"Batch {idx + 1} - OddN shape: {len(batch_odd)} windows")


RuntimeError: each element in list of batch should be of equal size

In [8]:
# Iterate through the DataLoader
for idx, (batch_even, batch_odd) in enumerate(dataloader):
    try:
        # Your training/inference code here
        
        # Print batch information
        print(f"Batch {idx + 1} - Number of Windows (EvenN): {len(batch_even)}")
        print(f"Batch {idx + 1} - Number of Windows (OddN): {len(batch_odd)}")

        # Print sizes of individual windows in the batch
        for window_info in batch_even:
            print(f"EvenN - Window Size: {window_info['window_even'].shape}")

        for window_info in batch_odd:
            print(f"OddN - Window Size: {window_info['window_odd'].shape}")

    except Exception as e:
        print(f"Error in batch {idx + 1}: {e}")


RuntimeError: each element in list of batch should be of equal size

In [6]:
import os
import glob
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

# define a custom dataset class that returns sliding windows of spectrograms
class SpectrogramDataset(Dataset):
    def __init__(self, data_path, file_ext, window_size, step_size):
        self.data_path = data_path
        self.file_ext = file_ext
        self.window_size = window_size
        self.step_size = step_size
        # get all shot numbers
        self.all_shots = [int(os.path.basename(x.split(f".{file_ext}")[0]))
                          for x in glob.glob(os.path.join(data_path, f"*.{file_ext}"))]
        # compute the total number of windows
        self.num_windows = 0
        for shotno in self.all_shots:
            spectrogram = self.load_shot(shotno)
            self.num_windows += (len(spectrogram) - window_size) // step_size + 1

    def load_shot(self, shotno):
        # load a pickle file containing a spectrogram
        file_path = os.path.join(self.data_path, f"{shotno}.{self.file_ext}")
        return pd.read_pickle(file_path)

    def __getitem__(self, index):
        # get the sliding window of spectrogram at the given index
        # first, find the corresponding shot number and the offset within the shot
        shot_index = 0
        window_index = index
        while window_index >= (len(self.load_shot(self.all_shots[shot_index])) - self.window_size) // self.step_size + 1:
            window_index -= (len(self.load_shot(self.all_shots[shot_index])) - self.window_size) // self.step_size + 1
            shot_index += 1
        shotno = self.all_shots[shot_index]
        # second, load the spectrogram and slice the window
        spectrogram = self.load_shot(shotno)
        start = window_index * self.step_size
        end = start + self.window_size
        window = spectrogram[start:end]
        # third, convert the window to a torch tensor
        window = torch.from_numpy(window)
        return window

    def __len__(self):
        # return the total number of windows
        return self.num_windows

# define the data path, file extension, window size, and step size
DATA_PATH = "data/dataset_pickle"
FILE_EXT = "pickle"
WINDOW_SIZE = 32
STEP_SIZE = 16

# create an instance of the dataset
dataset = SpectrogramDataset(DATA_PATH, FILE_EXT, WINDOW_SIZE, STEP_SIZE)

# create a dataloader with a batch size of 64
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=4)

# iterate through the dataloader and print the shape of each batch
for batch in dataloader:
    print(batch.shape)


ValueError: __len__() should return >= 0

## Number of windows per batch

In [None]:
# dataset = Way of representing the dataset so it can be loaded with the dataloader

# dataloader = Used to load batches of data from our dataset

In [None]:
first_data = dataset[93][54] # First item of the first batch of 32 windows (which all correspond to a SINGLE shotno)
# First number is the shotno, the second one is which window we are accessing for that shotno.

for key, item in first_data.items():
    if isinstance(item, (list, np.ndarray, torch.Tensor)):
        if isinstance(item, (np.ndarray, torch.Tensor)):
            print(key, item.shape)
        else:
            print(key, len(item))
    elif isinstance(item, int):
        print(key, item)

In [None]:
idx_shotno = np.random.randint(1,94) # Choose a random experiment
idx_windowno = np.random.randint(1,batch_size)
# Plot a random window (even frequencies) from a random shotno
random_sample = dataset[idx_shotno][idx_windowno]
print(f"Experiment number: {random_sample['shotno']}, and window number: {idx_windowno}")

plot_spectrogram(random_sample["window_odd"], title = "Random window (even frequencies) from a random shotno",\
                time = random_sample["time"], frequency = random_sample["frequency"])

### Let's verify that this is correct using the real data

In [None]:
data_shot = load_shot(random_sample['shotno'], DATA_PATH, FILE_EXT)

# Extracting inputs
inputs = data_shot["x"]["spectrogram"]
spec_even = inputs["EvenN"]
spec_odd = inputs["OddN"]
f = inputs["frequency"]
t = inputs["time"]

In [None]:
plot_spectrogram(spec_even, "Even N", t, f)