In [6]:
import h5py
import os
import numpy as np
import signal_processing as sp
import matplotlib.pyplot as plt
import time
import sklearn
from tensorflow import keras
from tensorflow.keras import layers
from fastprogress.fastprogress import progress_bar
from sys import getsizeof

In [7]:
# Locate the DEEPSIG DATASET: RADIOML 2018.01A HDF5 file.
# The path is resolved relative to the directory the notebook was started from.
current_working_directory = os.getcwd()
dataset_relative_path = "DeepSig-Dataset-2018/GOLD_XYZ_OSC.0001_1024.hdf5"
file_name = os.path.join(current_working_directory, dataset_relative_path)

In [8]:
# Open the HDF5 file read-only and time how long it takes.
# The file handle `f` is intentionally left open: `x`, `y` and `z` are h5py
# dataset handles that later cells slice from, so they need the file to stay open.
start = time.time()
print("Loading dataset...")
f = h5py.File(file_name, 'r')

# One handle per top-level key:
#   "X" -> dataset, "Y" -> labels, "Z" -> SNR value
x, y, z = f["X"], f["Y"], f["Z"]

elapsed = time.time() - start
print(f"Dataset loaded in {elapsed:.6f} seconds.")

Loading dataset...
Dataset loaded in 0.001320 seconds.


In [12]:
# Keep only the examples whose SNR is at least `min_snr`, for every modulation type.
#
# It is significantly faster to reshape an array than to concatenate multiple arrays.
#   1. Start by creating an empty array of a known size, where the first axis
#      indexes the per-modulation slices of the dataset we are keeping.
#   2. Fill the elements of this empty array with the slices of the larger array.
#   3. Finally, reshape the array to the final shape.

min_snr = 10
num_modulation_types = 24
num_samples_per_modulation_type = 106496

# Use np.where to find indices where SNR is greater than or equal to the desired value.
# This has to happen BEFORE start_of_snr_slice is computed: reading `snrindex`
# first raises NameError on a fresh kernel.
print(f"Slicing dataset with SNR >= {min_snr}")
snrindex = np.where(np.array(z) >= min_snr)
if snrindex[0].size == 0:
    raise ValueError(f"No samples found with SNR >= {min_snr}")

# NOTE(review): this assumes the file is grouped by modulation type and, inside
# each group, ordered by ascending SNR, so the first matching row index is also
# the offset of the wanted region inside every modulation's block -- confirm.
start_of_snr_slice = int(snrindex[0][0])
if not 0 <= start_of_snr_slice < num_samples_per_modulation_type:
    raise ValueError(
        f"First row with SNR >= {min_snr} is at index {start_of_snr_slice}, "
        f"outside the first modulation block of {num_samples_per_modulation_type} samples"
    )
num_of_samples_with_desired_snr = num_samples_per_modulation_type - start_of_snr_slice

start_time = time.time()

# For our dataset, x, we want to pick out the data where the SNR is greater than or equal to min_snr.
# The datatype of the X dataset is float32. This is important.
# While the original datatype of Y is int64, use int16 to save a tiny bit of memory.
dataset_slices = np.empty(
    (num_modulation_types, num_of_samples_with_desired_snr, 1024, 2), dtype=np.float32
)
labels_slices = np.empty(
    (num_modulation_types, num_of_samples_with_desired_snr, num_modulation_types), dtype=np.int16
)

# Iterate over each of the modulation types and copy the tail of its block
# (the part with SNR >= min_snr) into the preallocated arrays.
for i in progress_bar(range(num_modulation_types)):
    block_offset = num_samples_per_modulation_type * i
    slice_start = block_offset + start_of_snr_slice
    slice_end = block_offset + num_samples_per_modulation_type
    # Slicing an h5py dataset already yields a NumPy array, so assign it directly
    # instead of wrapping it in np.array(), which would make a second copy.
    dataset_slices[i] = x[slice_start:slice_end]
    labels_slices[i] = y[slice_start:slice_end]

# We can then reshape this array to the final shape, which is significantly faster than concatenation
dataset = np.reshape(dataset_slices, (-1, 1024, 2))
labels = np.reshape(labels_slices, (-1, num_modulation_types))

# Remove slices from memory
del dataset_slices
del labels_slices

print(f"Datasets created in {time.time() - start_time:.6f} seconds.")

# For fun, I timed the process of this method versus the old method using concatenation, and it is about 3 times faster. Plus, we get to see the progress, whereas concatenation just runs with no progress indicators. Reshaping is nearly instant. 

Slicing dataset with SNR >= 10


In [5]:
# Convert every example from its (1024, 2) real-valued layout to 1024 complex samples.
# As before, the result array is preallocated and filled row by row so that no
# concatenation is needed.
print("Converting dataset to complex values...")
start = time.time()

num_examples = len(dataset)
dataset_complex = np.empty((num_examples, 1024), dtype = np.complex64)
for idx in progress_bar(range(num_examples)):
    # signal_to_complex expects a flat I/Q vector, so flatten the example first
    flat_iq = dataset[idx].reshape(-1)
    dataset_complex[idx] = sp.signal_to_complex(flat_iq)

elapsed = time.time() - start
print(f"Converted dataset to complex values in {elapsed:.6f} seconds.")

# The real-valued dataset is no longer needed; drop it to save precious memory
del dataset

Converting dataset to complex values...


In [6]:
# Take the FFT of every complex example.
# Each result is stored as a row of 1024 float64 values.
# NOTE(review): presumably sp.fft_signal returns a real-valued spectrum, since
# the destination array is float64 -- confirm against signal_processing.
print("Calculating FFT of dataset...")
start = time.time()

num_signals = len(dataset_complex)
fft_dataset = np.empty((num_signals, 1024), dtype = np.float64)
for row in progress_bar(range(num_signals)):
    fft_dataset[row] = sp.fft_signal(dataset_complex[row])

elapsed = time.time() - start
print(f"Completed FFT calculaations in {elapsed:.6f} seconds.")

# The complex dataset is no longer needed; remove it from memory
del dataset_complex

Calculating FFT of dataset...


In [10]:
# Randomly shuffle dataset.
# The same permutation is applied to the FFT rows and to the labels so that
# row k of one still corresponds to row k of the other.
# A fixed seed makes the shuffle reproducible across kernel restarts.
shuffle_seed = 42

if len(fft_dataset) != len(labels):
    raise ValueError(
        f"fft_dataset has {len(fft_dataset)} rows but labels has {len(labels)}; "
        "they must be the same length to be shuffled together"
    )

shuffler = np.random.default_rng(shuffle_seed).permutation(len(labels))

# Integer-array indexing builds the reordered copy in one vectorised step,
# with the same shape and dtype as the source array.
fft_dataset_shuffled = fft_dataset[shuffler]
labels_shuffled = labels[shuffler]

# Remove the unshuffled arrays from memory
del fft_dataset
del labels

In [13]:
# Split dataset into train, validation, and test sets.
# Use about 50% for training, 20% for val, and the remaining ~30% for test.
#
# The boundaries are computed from the actual number of rows instead of being
# hard-coded. Fixed boundaries of 800_000 / 1_100_000 only match the intended
# ratios for a 1572864-row dataset; the row count depends on the SNR threshold
# used when slicing, and with fewer than 1_100_000 rows the test set would
# silently come out empty.
train_fraction = 0.5
val_fraction = 0.2

num_total = len(fft_dataset_shuffled)
if num_total == 0:
    raise ValueError("Cannot split an empty dataset")
if num_total != len(labels_shuffled):
    raise ValueError(
        f"fft_dataset_shuffled has {num_total} rows but labels_shuffled has "
        f"{len(labels_shuffled)}; they must be the same length"
    )

train_end = int(num_total * train_fraction)
val_end = train_end + int(num_total * val_fraction)

fft_dataset_train = fft_dataset_shuffled[:train_end]
fft_dataset_val = fft_dataset_shuffled[train_end:val_end]
fft_dataset_test = fft_dataset_shuffled[val_end:]

labels_train = labels_shuffled[:train_end]
labels_val = labels_shuffled[train_end:val_end]
labels_test = labels_shuffled[val_end:]

print(f"train contains {len(fft_dataset_train)/num_total * 100:.2f}% of the total dataset.")
print(f"val contains {len(fft_dataset_val)/num_total * 100:.2f}% of the total dataset.")
print(f"test contains {len(fft_dataset_test)/num_total * 100:.2f}% of the total dataset.")

# Drop the names of the full shuffled arrays. The three splits are basic slices
# (views), so the underlying buffer stays alive for as long as any split does.
del fft_dataset_shuffled
del labels_shuffled

NameError: name 'fft_dataset_shuffled' is not defined

In [14]:
# Write the train/val/test FFT arrays and their labels to disk as .npy files,
# next to the original dataset.
save_path = os.path.join(current_working_directory, "DeepSig-Dataset-2018/")

# (file name, array) pairs, written in this order
arrays_to_save = (
    ("fft_dataset_train.npy", fft_dataset_train),
    ("fft_dataset_val.npy", fft_dataset_val),
    ("fft_dataset_test.npy", fft_dataset_test),
    ("labels_train.npy", labels_train),
    ("labels_val.npy", labels_val),
    ("labels_test.npy", labels_test),
)
for npy_name, array in arrays_to_save:
    np.save(os.path.join(save_path, npy_name), array)

# Drop the temporary references so they do not keep the arrays alive
del arrays_to_save, npy_name, array

print("Successfully saved all datasets.")

Successfully saved all datasets.
