## Importing Libraries and Loading Preprocessed Data

In [1]:
from utils import *

In [2]:
# Loading normalized eigenvalues
with open ("_preproc_data/df_normalized_eigenvalues.pkl", 'rb') as f:
    df_normalized_eigenvalues = pickle.load(f)

# Loading normalized eigenvalue labels
with open ("_preproc_data/normalized_ground_truth_data.pkl", 'rb') as f:
    normalized_ground_truth_data = pickle.load(f)
    
# Extract the labels and the damping ratio
tag_labels = normalized_ground_truth_data['tag_label']
damping_ratio = normalized_ground_truth_data['damping_ratio']

## Separating the Eigenvalues into Real and Imaginary Part
This must be done since TF does not support backpropagation with complex-valued weights (reference: [here](https://stackoverflow.com/questions/47721615/how-to-backpropagate-with-complex-valued-weights))

In [14]:
n_eigs = df_normalized_eigenvalues.shape[0]
n_scenarios = df_normalized_eigenvalues.shape[1]

eigs_flatten = np.reshape(np.transpose(df_normalized_eigenvalues.values),
                         [n_eigs * n_scenarios, 1])
print(f"eigs_flatten (shape) = {eigs_flatten.shape}")

# ------------------------------
# INPUT
# ------------------------------

# Initialize input features matrix
input_features = np.zeros(shape = (eigs_flatten.shape[0], 2))

# Populate the input features matrix with real and imaginary part of the eigenvalues
input_features[:, 0] = np.reshape(np.real(eigs_flatten), [eigs_flatten.shape[0], ])
input_features[:, 1] = np.reshape(np.imag(eigs_flatten), [eigs_flatten.shape[0], ])
print(f"input_features (shape) = {input_features.shape}")

# Flattening tag labels (ground truth)
input_labels = np.reshape(np.transpose(tag_labels), 
                          [tag_labels.shape[0] * tag_labels.shape[1], 1])
print(f"input_labels (shape) = {input_features.shape}")

eigs_flatten (shape) = (970935, 1)
input_features (shape) = (970935, 2)
input_labels (shape) = (970935, 2)


## Splitting data into Training and Testing Datasets

In [15]:
# INPUTS: input_features
# LABELS (of inputs): input_labels

# Extracting training and testing data using scikit-learn
X_train, X_test, Y_train, Y_test = train_test_split(input_features, 
                                                    input_labels, 
                                                    random_state = 0)

print(f"Train data (shape) = {X_train.shape}")
print(f"Train labels (shape) = {Y_train.shape}")
print(f"Test data (shape) = {X_test.shape}")
print(f"Test data (shape) = {Y_test.shape}")

Train data (shape) = (728201, 2)
Train labels (shape) = (728201, 1)
Test data (shape) = (242734, 2)
Test data (shape) = (242734, 1)


In [16]:
Y_train[0]

array([5.])

## Converting the Labels to One-Hot Encoding

In [20]:
# Converting labels to one-hot using scikit-learn
lb = LabelBinarizer()

T_train = np.float64(lb.fit_transform(Y_train))
T_train = np.float64(lb.transform(Y_train))

T_test = np.float64(lb.fit_transform(Y_test))
T_test = np.float64(lb.transform(Y_test))

print(f"Train labels (one-hot encoding - shape): {T_train.shape}")
print(f"Test labels (one-hot encoding - shape): {T_test.shape}")

Train labels (one-hot encoding - shape): (728201, 6)
Test labels (one-hot encoding - shape): (242734, 6)


In [21]:
T_train[0]

array([0., 0., 0., 0., 1., 0.])

## Compensating Skewed Dataset

In [23]:
# Taking the same amount of eigenvalues for each category
unique, counts = np.unique(Y_train, return_counts = True)
print(dict(zip(unique, counts)))

{1.0: 36843, 2.0: 25312, 3.0: 5686, 4.0: 207371, 5.0: 439513, 6.0: 13476}


In [29]:
# Getting the indices for the elements with the same category in the training data
numeigs_1 = np.where(Y_train == 1)
numeigs_1 = np.array(numeigs_1)
numeigs_1 = numeigs_1[0, :]

numeigs_2 = np.where(Y_train == 2)
numeigs_2 = np.array(numeigs_2)
numeigs_2 = numeigs_2[0, :]

numeigs_3 = np.where(Y_train == 3)
numeigs_3 = np.array(numeigs_3)
numeigs_3 = numeigs_3[0, :]

numeigs_4 = np.where(Y_train == 4)
numeigs_4 = np.array(numeigs_4)
numeigs_4 = numeigs_4[0, :]

numeigs_5 = np.where(Y_train == 5)
numeigs_5 = np.array(numeigs_5)
numeigs_5 = numeigs_5[0, :]

numeigs_6 = np.where(Y_train == 6)
numeigs_6 = np.array(numeigs_6)
numeigs_6 = numeigs_6[0, :]

In [31]:
# Randomizing indices
numeigs_r_1 = np.random.choice(numeigs_1, np.amin(counts))
numeigs_r_2 = np.random.choice(numeigs_2, np.amin(counts))
numeigs_r_3 = np.random.choice(numeigs_3, np.amin(counts))
numeigs_r_4 = np.random.choice(numeigs_4, np.amin(counts))
numeigs_r_5 = np.random.choice(numeigs_5, np.amin(counts))
numeigs_r_6 = np.random.choice(numeigs_6, np.amin(counts))

## Creating Reduced Training/Testing Datasets

In [36]:
X_train_red = np.concatenate((X_train[numeigs_r_1, ...],
                             X_train[numeigs_r_2, ...], X_train[numeigs_r_3, ...], 
                             X_train[numeigs_r_4, ...], X_train[numeigs_r_5, ...], 
                             X_train[numeigs_r_6, ...]), axis = 0)
print(f" X_train_red (shape) = {X_train_red.shape}")

Y_train_red = np.concatenate((Y_train[numeigs_r_1, ...],
                             Y_train[numeigs_r_2, ...], Y_train[numeigs_r_3, ...], 
                             Y_train[numeigs_r_4, ...], Y_train[numeigs_r_5, ...], 
                             Y_train[numeigs_r_6, ...]), axis = 0)
print(f" Y_train_red (shape) = {Y_train_red.shape}")

T_train_red = np.concatenate((T_train[numeigs_r_1, ...],
                             T_train[numeigs_r_2, ...], T_train[numeigs_r_3, ...], 
                             T_train[numeigs_r_4, ...], T_train[numeigs_r_5, ...], 
                             T_train[numeigs_r_6, ...]), axis = 0)
print(f" T_train_red (shape) = {T_train_red.shape}")

 X_train_red (shape) = (34116, 2)
 Y_train_red (shape) = (34116, 1)
 T_train_red (shape) = (34116, 6)


In [38]:
normalized_testing_training_data_red = {'X_train' : X_train,
                                       'X_train_red' : X_train_red,
                                       'Y_train_red' : Y_train_red,
                                       'T_train_red' : T_train_red,
                                       'X_test' : X_test,
                                       'Y_test' : Y_test}

with open('_preproc_data/normalized_testing_training_data_red.pkl', 'wb') as f:
    pickle.dump(normalized_testing_training_data_red, f, pickle.HIGHEST_PROTOCOL)