In [1]:
import h5py
import os
import numpy as np
import signal_processing as sp
import matplotlib.pyplot as plt
import time
import sklearn
from tensorflow import keras
from tensorflow.keras import layers
from fastprogress.fastprogress import progress_bar
from sys import getsizeof

In [2]:
# Build the path to the DEEPSIG DATASET: RADIOML 2018.01A file, which is expected
# inside a "DeepSig-Dataset-2018" folder under the directory the notebook runs from.
current_working_directory = os.getcwd()
file_name = os.path.join(
    current_working_directory,
    "DeepSig-Dataset-2018/GOLD_XYZ_OSC.0001_1024.hdf5",
)

In [3]:
# Open the HDF5 file read-only and report how long opening took.
# The file object `f` is intentionally left open: the cell below slices
# x, y and z, which are obtained from it.
start = time.time()
print("Loading dataset...")
f = h5py.File(file_name, mode='r')

# One handle per top-level key -> X: dataset, Y: labels, Z: SNR value
x, y, z = f["X"], f["Y"], f["Z"]

print(f"Dataset loaded in {time.time() - start:.6f} seconds.")

Loading dataset...
Dataset loaded in 0.004881 seconds.


In [4]:
# Keep only the frames whose SNR is at or above a threshold.
#
# It is significantly faster to fill a preallocated array and reshape it than to
# concatenate multiple arrays:
#   1. Start by creating an empty array of a known size, where the first axis indexes
#      the per-modulation slices of the dataset we are keeping.
#   2. Copy each slice of the larger array into this empty array.
#   3. Reshape the array to the final shape.
#
# NOTE(review): this relies on the file being laid out as contiguous, equally sized
# blocks (one per modulation type), each ordered by increasing SNR, so that the rows
# to keep are the tail of every block -- confirm against the dataset documentation.

start_time = time.time()

min_snr = 10
num_modulation_types = 24
num_samples_per_modulation_type = 106496

# Fail early with a clear message if the file does not match the assumed layout.
expected_num_frames = num_modulation_types * num_samples_per_modulation_type
assert x.shape[0] == expected_num_frames, (
    f"Expected {expected_num_frames} frames in X, found {x.shape[0]}."
)

# Use np.where to find indices where SNR is greater than or equal to the desired value.
print(f"Slicing dataset with SNR >= {min_snr}")
snrindex = np.where(z[:] >= min_snr)
if snrindex[0].size == 0:
    raise ValueError(f"No samples with SNR >= {min_snr} found in the dataset.")

# First qualifying row; under the layout assumption above it lies in the first block,
# so it is also the offset of the kept tail inside every block.
start_of_snr_slice = snrindex[0][0]
num_of_samples_with_desired_snr = num_samples_per_modulation_type - start_of_snr_slice

# Per-sample shapes are taken from the file instead of being hard-coded.
frame_shape = tuple(x.shape[1:])
label_shape = tuple(y.shape[1:])

# The datatype of the X dataset is float32. This is important.
# While the original datatype of Y is int64, use int16 to save a tiny bit of memory.
dataset_slices = np.empty(
    (num_modulation_types, num_of_samples_with_desired_snr) + frame_shape,
    dtype=np.float32,
)
labels_slices = np.empty(
    (num_modulation_types, num_of_samples_with_desired_snr) + label_shape,
    dtype=np.int16,
)

# Iterate over each modulation type and copy the tail of its block,
# i.e. the rows with SNR >= min_snr.
for i in progress_bar(range(num_modulation_types)):
    block_start = start_of_snr_slice + num_samples_per_modulation_type * i
    block_end = num_samples_per_modulation_type * (i + 1)
    # Slicing an h5py dataset already returns a NumPy array, so wrapping it in
    # np.array() would only add another full copy of the slice.
    dataset_slices[i] = x[block_start:block_end]
    labels_slices[i] = y[block_start:block_end]

# Reshape to the final shape, which is significantly faster than concatenation
dataset = np.reshape(dataset_slices, (-1,) + frame_shape)
labels = np.reshape(labels_slices, (-1,) + label_shape)

# Remove slices from memory
del dataset_slices
del labels_slices

print(f"Datasets created in {time.time() - start_time:.6f} seconds.")

# For fun, I timed this method against the old method using concatenation, and it is
# about 3 times faster. Plus, we get to see the progress, whereas concatenation just
# runs with no progress indicators. Reshaping is nearly instant.

Slicing dataset with SNR >= 10


In [7]:
# Randomly shuffle the dataset and labels with one shared permutation, so every
# frame stays paired with its label.
# A fixed seed makes the shuffle (and the splits saved later) reproducible.
shuffle_seed = 42
shuffler = np.random.default_rng(shuffle_seed).permutation(len(labels))

# Indexing with the permutation array builds each reordered copy in a single
# vectorized step; row i of the result is row shuffler[i] of the input.
dataset_shuffled = dataset[shuffler]
labels_shuffled = labels[shuffler]

# Drop the unshuffled originals; only the shuffled copies are used from here on
del dataset
del labels

In [8]:
# Split dataset into train, validation, and test sets
# Use about 50% for training, 20% for val, 30% for test

total_num_elements = len(labels_shuffled)

# Compute each boundary once so the data and label splits cannot drift apart
train_end = int(0.5 * total_num_elements)
val_end = int(0.7 * total_num_elements)

dataset_train = dataset_shuffled[:train_end]
dataset_val = dataset_shuffled[train_end:val_end]
dataset_test = dataset_shuffled[val_end:]

labels_train = labels_shuffled[:train_end]
labels_val = labels_shuffled[train_end:val_end]
labels_test = labels_shuffled[val_end:]

# The three splits must cover every element exactly once
assert len(labels_train) + len(labels_val) + len(labels_test) == total_num_elements
assert len(dataset_train) == len(labels_train)
assert len(dataset_val) == len(labels_val)
assert len(dataset_test) == len(labels_test)

print(f"train contains {len(labels_train)} elements.")
print(f"val contains {len(labels_val)} elements")
print(f"test contains {len(labels_test)} elements")

# Note: the splits above are basic slices, i.e. views of the shuffled arrays, so
# these `del`s only remove the names; the underlying data stays alive.
del dataset_shuffled
del labels_shuffled

train contains 50.00% of the total dataset.
val contains 20.00% of the total dataset.
test contains 30.00% of the total dataset.


In [9]:
# Write every split to its own .npy file inside the dataset folder
save_path = os.path.join(current_working_directory, "DeepSig-Dataset-2018/")

# (file name, array) pairs, listed in the order they are written
arrays_by_file = (
    ("iq_dataset_train.npy", dataset_train),
    ("iq_dataset_val.npy", dataset_val),
    ("iq_dataset_test.npy", dataset_test),
    ("iq_labels_train.npy", labels_train),
    ("iq_labels_val.npy", labels_val),
    ("iq_labels_test.npy", labels_test),
)
for npy_name, split_array in arrays_by_file:
    np.save(os.path.join(save_path, npy_name), split_array)

print("Successfully saved all datasets.")

Successfully saved all datasets.


In [None]:
# Report the in-memory size of each split's data.
# ndarray.nbytes is used instead of sys.getsizeof: the splits are slices (views) of
# another array, and getsizeof does not count a data buffer the array does not own,
# so it would report only the small array header.
print(f"Train dataset has size {dataset_train.nbytes} bytes.")
print(f"Validation dataset has size {dataset_val.nbytes} bytes.")
print(f"Test dataset has size {dataset_test.nbytes} bytes.")
