In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
# Notebook-wide settings.
# n_features: how many leading entries of `rfe_sel` are used as model inputs.
n_features = 3
# random_state: single seed shared by TensorFlow and the train/validation split.
random_state = 42
tf.random.set_seed(random_state)

In [3]:
# Candidate descriptor columns, in the order they are sliced by `n_features`.
rfe_sel = [
    'P_VSA_MR_5',
    'P_VSA_LogP_2',
    'Mor22s',
    'Mor04m',
    'LUMO / eV',
    'E1p',
    'HOMO / eV',
]

In [4]:
# Load the descriptor table (semicolon-separated, decimal comma).
# Forward slashes are used so the relative path resolves on Windows, Linux
# and macOS alike; the backslash form only worked on Windows.
data = pd.read_csv('../Data/ze41_mol_desc_db_red.csv', header=0, sep=';', decimal=',')

In [5]:
# Inputs: the first `n_features` descriptors listed in `rfe_sel`.
# Target: the second column of the table (position 1).
col_names = data.columns
X = data.loc[:, rfe_sel[:n_features]].astype('float32')
y = data.loc[:, col_names[1]].astype('float32')

In [6]:
# Hold out 10 % of the rows for validation and make sure every piece is a
# DataFrame (the target comes back from the split as a Series).
X_train, X_valid, y_train, y_valid = (
    pd.DataFrame(part)
    for part in train_test_split(X, y, test_size=0.1, random_state=random_state)
)

In [7]:
# Scale the input features to [-1, 1]; the scaler is fitted on the training
# rows only and then applied to both splits.
scalex = MinMaxScaler(feature_range=(-1, 1)).fit(X_train)
X_train_sc = pd.DataFrame(scalex.transform(X_train), columns=X.columns)
X_valid_sc = pd.DataFrame(scalex.transform(X_valid), columns=X.columns)

In [8]:
# Scale the target to [0, 1]; fitted on the training rows only and then
# applied to both splits.
scaley = MinMaxScaler(feature_range=(0, 1)).fit(y_train)
y_train_sc = pd.DataFrame(scaley.transform(y_train), columns=y_train.columns)
y_valid_sc = pd.DataFrame(scaley.transform(y_valid), columns=y_valid.columns)

In [9]:
class Autoencoder(keras.models.Model):
    """Minimal dense autoencoder with one encoding and one decoding layer.

    The encoder is a linear ``Dense`` layer with ``latent_dim`` units followed
    by ``LeakyReLU``; the decoder is a single linear ``Dense`` layer that maps
    the latent code back to ``output_dim`` values.

    Parameters
    ----------
    latent_dim : int, default 2
        Number of units in the bottleneck layer.
    output_dim : int or None, default None
        Number of units in the reconstruction layer. When ``None`` it falls
        back to ``len(X_train.columns)``, read from the notebook namespace at
        construction time. Pass it explicitly to build the model without
        depending on that global.
    """

    def __init__(self, latent_dim=2, output_dim=None):
        super().__init__()
        if output_dim is None:
            # Backward-compatible default: size the decoder from the global
            # training frame.
            output_dim = len(X_train.columns)
        self.latent_dim = latent_dim
        # A wider hidden block (Dense(150) + LeakyReLU) was tried in both the
        # encoder and the decoder and is currently disabled.
        self.encoder = tf.keras.Sequential([
            keras.layers.Dense(latent_dim, activation='linear'),
            keras.layers.LeakyReLU(),
        ])
        self.decoder = tf.keras.Sequential([
            keras.layers.Dense(output_dim, activation='linear'),
        ])

    def call(self, x):
        """Encode ``x`` to the latent space and return its reconstruction."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [24]:
# Build the model with a two-unit bottleneck.
autoencoder = Autoencoder(latent_dim=2)

In [25]:
# Reconstruction objective: mean squared error, optimised with Adam.
autoencoder.compile(
    loss='mean_squared_error',
    optimizer=tf.optimizers.Adam(learning_rate=0.01),
)

In [26]:
# Train the network to reproduce its own scaled inputs; the validation split
# is reconstructed the same way after every epoch.
history = autoencoder.fit(
    x=X_train_sc,
    y=X_train_sc,
    validation_data=(X_valid_sc, X_valid_sc),
    epochs=75,
)

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75
Epoch 47/75
Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75
Epoch 57/75
Epoch 58/75
Epoch 59/75
Epoch 60/75
Epoch 61/75
Epoch 62/75
Epoch 63/75
Epoch 64/75
Epoch 65/75
Epoch 66/75
Epoch 67/75
Epoch 68/75
Epoch 69/75
Epoch 70/75
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75


In [27]:
# Run only the encoder half on the scaled validation features and convert the
# resulting tensor of latent codes to a NumPy array.
encoded = autoencoder.encoder(X_valid_sc.to_numpy()).numpy()

In [28]:
# Column-wise minimum, mean and maximum of the latent codes.
encoded.min(axis=0), encoded.mean(axis=0), encoded.max(axis=0)

(array([-0.26945093,  0.6648216 ], dtype=float32),
 array([0.45800173, 1.0820074 ], dtype=float32),
 array([1.3059565, 1.6485028], dtype=float32))

In [29]:
# Indices of latent columns that are numerically zero for every encoded row.
np.flatnonzero(np.isclose(encoded, 0).all(axis=0))

array([], dtype=int64)

In [30]:
# Display the latent codes of the validation rows.
encoded

array([[ 0.23157957,  1.1433657 ],
       [ 0.46913555,  1.1213837 ],
       [-0.26945093,  1.6485028 ],
       [ 0.6233543 ,  1.0654129 ],
       [ 1.3059565 ,  0.84855735],
       [ 0.38743547,  0.6648216 ]], dtype=float32)

In [31]:
# Shape of the latent-code array.
encoded.shape

(6, 2)