In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
seed = 13
tf.random.set_seed(seed)

In [3]:
data = pd.read_csv('..\\Data\\ze41_mol_desc_db_red.csv', header=0, sep=';', decimal=',')

In [4]:
col_names = data.columns
X = data[col_names[3:]].astype('float32')
y = data[col_names[2]].astype('float32')

In [5]:
X_scaled = MinMaxScaler(feature_range=(0,1)).fit_transform(X)

In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(X_scaled, y, test_size=0.1, random_state=seed)
X_train = pd.DataFrame(X_train)
X_train.columns = X.columns

In [7]:
class Autoencoder(keras.models.Model):
    def __init__(self, latent_dim=5):
        super(Autoencoder, self).__init__()
        self.latent_dim = latent_dim   
        self.encoder = tf.keras.Sequential([
          keras.layers.Dense(latent_dim, activation='relu'),
        ])
        self.decoder = tf.keras.Sequential([
          keras.layers.Dense(len(X_train.columns), activation='linear')
        ])
    
    def call(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [8]:
autoencoder = Autoencoder(latent_dim=5)

In [9]:
autoencoder.compile(
    optimizer=tf.optimizers.Adam(learning_rate=0.005),
    loss='mean_squared_error')

In [10]:
autoencoder.fit(X_train, X_train, validation_split=0.1, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x2299682d648>

In [11]:
best_cols = ['P_VSA_MR_5', 'Mor04m', 'E1p', 'CATS3D_02_AP', 'LUMO / eV']

In [12]:
X_sel = X_train[best_cols]

In [13]:
encoded = autoencoder.encoder(X_train.to_numpy()).numpy()

In [14]:
for i in range(5):
    print(sorted(X_sel.to_numpy()[i,:]), sorted(encoded[i, :]))

[0.0, 0.40519476, 0.43697143, 0.6193462, 0.66175306] [0.0, 0.0, 0.0, 0.0, 6.850691]
[0.3766234, 0.41165376, 0.5, 0.5158979, 0.56289303] [0.0, 0.0, 0.0, 0.0, 5.0634174]
[0.0, 0.023376644, 0.054961618, 0.5167059, 0.6462159] [0.0, 0.0, 0.0, 0.0, 5.8900356]
[0.0, 0.0, 0.18181819, 0.32409588, 0.567846] [0.0, 0.0, 0.0, 0.0, 5.499934]
[0.0, 0.0, 0.36883116, 0.48365432, 0.58608484] [0.0, 0.0, 0.0, 0.0, 1.4722421]


In [16]:
['P_VSA_MR_5'], 79/100
['P_VSA_MR_5', 'LUMO / eV'], 85/100
['P_VSA_MR_5', 'LUMO / eV', 'Mor04m'], 73/100
['P_VSA_MR_5', 'LUMO / eV', 'Mor04m', 'E1p'], 79/100
['P_VSA_MR_5', 'LUMO / eV', 'Mor04m', 'E1p', 'CATS3D_02_AP'], 40/100
['P_VSA_MR_5', 'LUMO / eV', 'Mor04m', 'E1p', 'CATS3D_02_AP', 'HOMO / eV'], 12/100

(['P_VSA_MR_5', 'LUMO / eV', 'Mor04m', 'E1p', 'CATS3D_02_AP', 'HOMO / eV'],
 0.12)