In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt

In [2]:
# Seed for the single train/validation split.
random_state = 42

# One random-forest seed per feature-selection repetition: 0, 1, ..., 99.
seeds = np.arange(0, 100)

In [3]:
# Load the descriptor table: semicolon-separated, header in the first row,
# decimal comma for floating-point values.
# Forward slashes keep this relative path valid on Windows, Linux and macOS;
# a backslash-separated path only resolves on Windows.
data = pd.read_csv('../Data/ze41_mol_desc_db_red.csv', header=0, sep=';', decimal=',')

In [4]:
col_names = data.columns
# To exclude the orbital energies from the feature set, uncomment:
# col_names = col_names.drop(labels=['HOMO / eV', 'LUMO / eV'])

# Columns are picked by position: everything from the fourth column onward
# is a feature, the second column is the regression target.
feature_names = col_names[3:]
target_name = col_names[1]

X = data[feature_names]
y = data[target_name]

In [5]:
# Hold out 10% of the rows for validation; the split is fixed by random_state.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=random_state
)

# Wrap every part in a DataFrame so the single-column target is 2-D and
# exposes `.columns` like the feature tables do.
X_train, X_valid = pd.DataFrame(X_train), pd.DataFrame(X_valid)
y_train, y_valid = pd.DataFrame(y_train), pd.DataFrame(y_valid)

In [6]:
# Scale the features to [-1, 1]. The scaler is fitted on the training rows
# only and then applied unchanged to the validation rows.
scalex = MinMaxScaler(feature_range=(-1, 1)).fit(X_train)

# transform() returns a plain array, so the column names are re-attached here.
X_train_sc = pd.DataFrame(scalex.transform(X_train), columns=X.columns)
X_valid_sc = pd.DataFrame(scalex.transform(X_valid), columns=X.columns)

In [7]:
# Scale the target with a (0, 1) feature range, fitted on the training
# targets only and then applied unchanged to the validation targets.
scaley = MinMaxScaler(feature_range=(0, 1)).fit(y_train)

# transform() returns a plain array, so the column name is re-attached here.
y_train_sc = pd.DataFrame(scaley.transform(y_train), columns=y_train.columns)
y_valid_sc = pd.DataFrame(scaley.transform(y_valid), columns=y_valid.columns)

In [8]:
# Accumulator for the feature names selected in each RFE run (one list per seed).
selected_cols = []

In [9]:
# Run recursive feature elimination once per seed and record which six
# descriptors survive each run.
# The accumulator is rebuilt here so that re-running this cell does not append
# a second batch of results to those already collected, which would inflate
# the combination counts computed afterwards.
selected_cols = []
for seed in seeds:
    # Only the forest's random_state changes between repetitions.
    rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True, bootstrap=True, random_state=seed)
    # A fractional step removes a share of the features per elimination round
    # rather than a fixed number; np.ravel flattens the single-column target
    # frame into the 1-D array the estimator expects.
    selector = RFE(rf, n_features_to_select=6, step=0.1).fit(X_train_sc, np.ravel(y_train_sc))
    # support_ is a boolean mask over the input columns; names are kept in
    # column order.
    selected_cols.append(X.columns[selector.support_].tolist())

In [10]:
# Show the descriptor names kept by RFE in each run (one list per seed).
selected_cols

[['MATS5v', 'P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor04m', 'Mor22s', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'Mor22s', 'E1p', 'HOMO / eV', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'Mor22s', 'E1p', 'HOMO / eV', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'Mor14s', 'Mor22s', 'CATS3D_02_AP', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor14u', 'Mor04m', 'Mor29v', 'Mor14s', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor14u', 'Mor04m', 'Mor14s', 'HOMO / eV', 'LUMO / eV'],
 ['GATS5v', 'P_VSA_MR_5', 'Mor14u', 'Mor04m', 'Mor22s', 'E1p'],
 ['P_VSA_MR_5', 'Mor04m', 'Mor22s', 'E1p', 'R5p+', 'LUMO / eV'],
 ['P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor04m', 'E1p', 'HOMO / eV', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'Mor22s', 'E1p', 'CATS3D_02_AP', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'Mor29v', 'Mor22s', 'R5p+', 'LUMO / eV'],
 ['P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor04m', 'Mor22s', 'E1p', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'Mor29v', 'Mor22s', 'E1p', 'LUMO / eV'],
 ['MATS5m', 'P_VSA_MR_5', 'Mor04m', 'Mor22s', 'E1p', 'LUMO / eV'],
 ['P_VSA_MR_

In [11]:
# Stack the per-seed selections into a 2-D array of names (one row per run),
# then collapse identical rows and count how often each combination occurred.
selection_matrix = np.asarray(selected_cols)
vals, counts = np.unique(selection_matrix, return_counts=True, axis=0)

In [12]:
# Number of runs that produced each distinct descriptor combination (aligned with `vals`).
counts

array([1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 5, 1, 1, 6, 1, 9, 1, 1, 1,
       1, 1, 2, 1, 1, 1, 1, 1, 7, 1, 7, 1, 1, 4, 3, 1, 1, 4, 1, 1, 1, 1,
       2, 5, 2, 1, 1, 1, 1, 1, 1], dtype=int64)

In [13]:
# Distinct descriptor combinations found across the runs (one row per combination).
vals

array([['GATS5v', 'P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor04m', 'Mor22s',
        'LUMO / eV'],
       ['GATS5v', 'P_VSA_MR_5', 'Mor04m', 'Mor22s', 'E1p', 'LUMO / eV'],
       ['GATS5v', 'P_VSA_MR_5', 'Mor04m', 'Mor29v', 'E1p', 'LUMO / eV'],
       ['GATS5v', 'P_VSA_MR_5', 'Mor04m', 'Mor29v', 'Mor22s',
        'LUMO / eV'],
       ['GATS5v', 'P_VSA_MR_5', 'Mor14u', 'Mor04m', 'Mor22s', 'E1p'],
       ['MATS5m', 'P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor04m', 'Mor22s',
        'LUMO / eV'],
       ['MATS5m', 'P_VSA_MR_5', 'Mor04m', 'Mor22s', 'E1p', 'LUMO / eV'],
       ['MATS5m', 'P_VSA_MR_5', 'Mor14u', 'Mor04m', 'Mor14s',
        'LUMO / eV'],
       ['MATS5v', 'GATS5v', 'P_VSA_MR_5', 'Mor04m', 'Mor19m',
        'LUMO / eV'],
       ['MATS5v', 'P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor04m', 'Mor22s',
        'LUMO / eV'],
       ['MATS5v', 'P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor04m', 'Mor29v',
        'LUMO / eV'],
       ['MATS5v', 'P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor14u', 'Mor04m',
        'LUMO / eV'],
       ['

In [14]:
# Descriptor combination selected by the largest number of runs
# (on a tie, argmax picks the first one in `vals` order).
vals[counts.argmax()]

array(['P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor04m', 'Mor22s', 'E1p',
       'LUMO / eV'], dtype='<U12')

In [15]:
# Inspect the held-out targets produced by the train/validation split.
y_valid

Unnamed: 0,inhibition efficiency ZE41 / %
0,-157
5,39
36,12
45,-6
13,38
54,-17


In [16]:
# Second most frequently selected descriptor combination.
# Ranking by count replaces a lookup of the literal count value, which raised
# ValueError whenever no combination occurred exactly that many times.
# The stable sort keeps tied combinations in their `vals` order, so the
# earliest runner-up is returned.
vals[np.argsort(-counts, kind='stable')[1]]

array(['P_VSA_MR_5', 'Mor04m', 'Mor22s', 'E1p', 'CATS3D_02_AP',
       'LUMO / eV'], dtype='<U12')