In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras

In [2]:
# Switch Keras' default float type to float64; this runs before any model is
# created further down in the notebook.
tf.keras.backend.set_floatx('float64')

In [3]:
# --- Experiment settings ---------------------------------------------------
random_state = 42          # seed for the train/validation split; also part of the output file names
n_features_to_select = 5   # number of descriptors RFE keeps in every run
n_runs = 100               # number of RFE repetitions
seeds = np.arange(n_runs)  # one RandomForest seed per run: 0 .. n_runs-1

# --- Paths -----------------------------------------------------------------
# Build the data directory with os.path.join instead of hard-coded backslashes
# so the notebook also runs on non-Windows systems. The empty last component
# keeps a trailing separator, so `path + filename` concatenation still works
# (on Windows the resulting string is identical to the previous '..\\Data\\').
path = os.path.join('..', 'Data', '')
csv_path = path + 'ze41_mol_desc_db_red.csv'
# NOTE(review): df_path and rfe_path are not used in the visible cells;
# presumably they are output files written further down - confirm.
df_path = path + 'predictions_cls_seed{}_feat{}_runs{}.csv'.format(random_state, n_features_to_select, n_runs)
rfe_path = path + 'rfe_cls_res_seed{}_feat{}_runs{}.pkl'.format(random_state, n_features_to_select, n_runs)

In [4]:
# Read the descriptor table: ';'-separated, ',' as decimal mark, first row is
# the header.
data = pd.read_csv(csv_path, sep=';', decimal=',', header=0)
col_names = data.columns

# Column 1 holds the target value; every column from index 3 onwards is used as
# a feature (columns 0 and 2 are not used here).
y = data[col_names[1]]
X = data[col_names[3:]]

In [5]:
def get_class(x):
    """Map a numeric value to one of four ordinal class labels.

    The value is compared against the ascending cut points -40, 0 and 40,
    each acting as an exclusive upper bound:

        x < -40        -> 0
        -40 <= x < 0   -> 1
        0 <= x < 40    -> 2
        otherwise      -> 3

    A NaN fails every ``<`` comparison and therefore also ends up in class 3.
    """
    cut_points = (-40, 0, 40)
    for label, upper in enumerate(cut_points):
        if x < upper:
            return label
    return len(cut_points)

In [6]:
# Derive the class labels from the raw target column on every execution.
# Re-binning `y` in place would not be idempotent: running the cell a second
# time would feed the labels 0..3 back into get_class and collapse them all
# into class 2.
y = data[col_names[1]].apply(get_class)

In [7]:
# Hold out 10 % of the samples for validation and wrap all four parts in
# DataFrames (the label Series become single-column frames).
X_train, X_valid, y_train, y_valid = map(
    pd.DataFrame,
    train_test_split(X, y, test_size=0.1, random_state=random_state),
)

# Fit the [-1, 1] min-max scaling on the training part only, then apply it to
# both parts. The scaled frames are rebuilt from the transformed values with
# the original column names; no index is passed, so they get a fresh default
# index rather than the row labels of X_train / X_valid.
scalex = MinMaxScaler(feature_range=(-1, 1)).fit(X_train)
X_train_sc = pd.DataFrame(scalex.transform(X_train), columns=X.columns)
X_valid_sc = pd.DataFrame(scalex.transform(X_valid), columns=X.columns)

In [8]:
# Collects one list of selected feature names per RFE run (filled by the loop
# in the next cell).
selected_cols = []

In [9]:
# Start from an empty list inside this cell so that re-running it (or resuming
# after an interrupted run) never mixes in results from an earlier execution.
selected_cols = []

# Repeat recursive feature elimination once per seed; every run contributes the
# names of the features it kept.
for run_idx, seed in enumerate(seeds, start=1):
    rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, oob_score=True, bootstrap=True, random_state=seed)
    # Fractional step: a share of the remaining features is eliminated per
    # iteration until n_features_to_select are left (see the RFE docs for the
    # exact rounding).
    selector = RFE(rf, n_features_to_select=n_features_to_select, step=0.1).fit(X_train_sc, np.ravel(y_train))
    # support_ is a boolean mask over the input columns, in column order.
    selected_cols.append(list(X.columns[selector.support_]))
    # Report progress after every 10th run, counting runs rather than relying
    # on the seed values being 0, 1, 2, ...
    if run_idx % 10 == 0:
        print('RFE {:.2f}% done'.format(run_idx*100/len(seeds)))

RFE 10.00% done
RFE 20.00% done
RFE 30.00% done
RFE 40.00% done
RFE 50.00% done
RFE 60.00% done
RFE 70.00% done
RFE 80.00% done
RFE 90.00% done
RFE 100.00% done


In [10]:
# Count how often each distinct feature subset was chosen across the RFE runs.
# With axis=0 the rows of the 2-D name array are compared as whole units.
vals, counts = np.unique(selected_cols, return_counts=True, axis=0)

# Keep the most frequently selected subset (argmax returns the first maximum
# when several subsets are tied) and display it.
best_features = vals[counts.argmax()]
best_features

array(['P_VSA_MR_5', 'Mor14u', 'Mor04m', 'HOMO / eV', 'LUMO / eV'],
      dtype='<U14')

In [22]:
def get_model(n_input_features, n_classes=4, noise_stddev=0.1, learning_rate=0.001):
    """Build and compile the feed-forward classifier.

    Architecture: input -> GaussianNoise -> Dense(50, relu) -> Dense(20, relu)
    -> Dense(10, relu) -> Dense(n_classes, softmax).

    The input shape is declared with an explicit ``keras.Input`` as the first
    entry of the Sequential model. Previously ``input_shape`` was passed to the
    first Dense layer, which is the *second* layer of the stack; Sequential
    only honours that argument on the first layer, so the model stayed unbuilt
    until ``fit`` and ``n_input_features`` was never checked against the data.

    Parameters
    ----------
    n_input_features : int
        Number of feature columns fed to the network.
    n_classes : int, default 4
        Number of output classes (units of the softmax layer).
    noise_stddev : float, default 0.1
        Standard deviation passed to the GaussianNoise layer.
    learning_rate : float, default 0.001
        Learning rate of the Adam optimizer.

    Returns
    -------
    keras.models.Sequential
        Model compiled with sparse categorical cross-entropy (integer class
        labels) and an accuracy metric.
    """
    model = keras.models.Sequential([
        keras.Input(shape=(n_input_features,)),
        keras.layers.GaussianNoise(stddev=noise_stddev),
        keras.layers.Dense(50, activation='relu'),
        keras.layers.Dense(20, activation='relu'),
        keras.layers.Dense(10, activation='relu'),
        keras.layers.Dense(n_classes, activation='softmax')
        ])

    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
        loss=keras.losses.sparse_categorical_crossentropy,
        metrics=['accuracy'])
    return model

In [23]:
# Seed TensorFlow's global random generator before the model is created so
# that weight initialisation (and the TF-driven randomness used later during
# training) is repeatable from run to run; without it every execution of the
# notebook trains a different network.
tf.random.set_seed(random_state)
model = get_model(n_features_to_select)

In [28]:
# Train for 50 epochs on the scaled training data restricted to the selected
# features; the held-out split is evaluated after every epoch.
history = model.fit(
    X_train_sc[best_features],
    y_train,
    epochs=50,
    validation_data=(X_valid_sc[best_features], y_valid),
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [29]:
# Predict on the scaled validation features (selected columns only). The
# network ends in a 4-unit softmax, so each row of y_pred holds one score per
# class.
y_pred = model.predict(tf.convert_to_tensor(X_valid_sc[best_features]))

In [30]:
# Predicted class per validation sample = index of the largest score in its row.
y_pred.argmax(axis=1)

array([0, 3, 2, 2, 2, 2], dtype=int64)