In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt

In [2]:
# Reproducibility settings: a fixed seed for the train/validation split and
# one seed per repeated feature-selection run (0..99).
random_state = 42
seeds = np.arange(0, 100)

In [3]:
# Load the descriptor table: ';'-separated, ',' as decimal mark, header in the first row.
# Forward slashes keep this relative path valid on Linux/macOS as well as on Windows;
# the earlier backslash-separated form only resolved on Windows.
data = pd.read_csv('../Data/ze41_mol_desc_db_red.csv', header=0, sep=';', decimal=',')

In [4]:
# Split the table by column position: the column at position 2 is the regression
# target, every column from position 3 onward is a candidate feature.
# Columns 0 and 1 are not used here.
col_names = data.columns
target_name = col_names[2]
feature_names = col_names[3:]
y = data[target_name]
X = data[feature_names]

In [5]:
# Rescale every feature to the [0, 1] range and keep the original column labels.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/validation
# split is made - confirm this is intended.
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [6]:
# Hold out 10 % of the rows for validation; the split is seeded so it is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_scaled,
    y,
    test_size=0.1,
    random_state=random_state,
)

In [7]:
# Accumulator: one list of selected feature names per seed.
selected_cols = list()

In [8]:
# Repeat recursive feature elimination once per seed and record which six
# features survive each run.
#
# The accumulator is (re)initialised here so that re-running this cell starts
# from an empty list instead of appending a second batch of results, which
# would inflate the subset counts computed further down.
selected_cols = []
for seed in seeds:
    # Only the forest's random_state changes between runs.
    rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True, bootstrap=True, random_state=seed)
    # step=0.1 is a fraction: RFE drops 10 % of the features per iteration
    # until 6 remain.
    selector = RFE(rf, n_features_to_select=6, step=0.1).fit(X_train, y_train)
    # support_ is a boolean mask over the columns the selector was fitted on,
    # so index the training frame's columns with it directly.
    selected_cols.append(X_train.columns[selector.support_].tolist())

In [9]:
# Display the feature subset selected for each seed (one six-name list per run).
selected_cols

[['MATS5v', 'P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor04m', 'Mor22s', 'LUMO / eV'],
 ['P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor04m', 'E1p', 'HOMO / eV', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'Mor22s', 'E1p', 'HOMO / eV', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'Mor14s', 'Mor22s', 'CATS3D_02_AP', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor14u', 'Mor04m', 'Mor29v', 'Mor14s', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'Mor14s', 'E1p', 'HOMO / eV', 'LUMO / eV'],
 ['GATS5v', 'P_VSA_MR_5', 'Mor14u', 'Mor04m', 'Mor22s', 'E1p'],
 ['MATS5v', 'P_VSA_MR_5', 'Mor04m', 'E1p', 'R5p+', 'LUMO / eV'],
 ['P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor04m', 'E1p', 'HOMO / eV', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'Mor22s', 'CATS3D_02_AP', 'HOMO / eV', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'Mor22s', 'E1p', 'R5p+', 'LUMO / eV'],
 ['P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor04m', 'Mor22s', 'E1p', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'Mor29v', 'Mor22s', 'E1p', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'Mor22s', 'E1p', 'HOMO / eV', 'LUMO / eV'],
 ['

In [10]:
# Collapse the per-seed selections into distinct subsets (rows) and count how
# many seeds produced each one. Every row has the same length, so the list of
# lists converts to a 2-D string array.
subset_matrix = np.asarray(selected_cols)
vals, counts = np.unique(subset_matrix, return_counts=True, axis=0)

In [11]:
# Number of seeds that produced each distinct subset, aligned with the rows of `vals`.
counts

array([ 2,  1,  1,  1,  1,  1,  1,  3,  1,  1,  1,  1,  1,  4,  1,  6,  1,
        1, 11,  1,  3,  1,  1,  1,  1,  2,  3,  1,  1,  2,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  8,  1,  1,  1,  3,  1,  1,  2,  1,  4,  1,  1,
        3,  1,  1,  1,  1,  1,  1], dtype=int64)

In [12]:
# Distinct feature subsets, one per row, in the sorted order returned by np.unique.
vals

array([['GATS5v', 'P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor04m', 'Mor22s',
        'LUMO / eV'],
       ['GATS5v', 'P_VSA_MR_5', 'Mor04m', 'Mor19m', 'E1p', 'LUMO / eV'],
       ['GATS5v', 'P_VSA_MR_5', 'Mor04m', 'Mor29v', 'Mor22s',
        'LUMO / eV'],
       ['GATS5v', 'P_VSA_MR_5', 'Mor14u', 'Mor04m', 'Mor22s', 'E1p'],
       ['MATS5m', 'P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor04m', 'E1p',
        'LUMO / eV'],
       ['MATS5m', 'P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor04m', 'Mor22s',
        'LUMO / eV'],
       ['MATS5v', 'P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor04m', 'E1p',
        'LUMO / eV'],
       ['MATS5v', 'P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor04m', 'Mor22s',
        'LUMO / eV'],
       ['MATS5v', 'P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor04m', 'Mor29v',
        'LUMO / eV'],
       ['MATS5v', 'P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor14u', 'Mor04m',
        'LUMO / eV'],
       ['MATS5v', 'P_VSA_MR_5', 'Mor04m', 'E1p', 'HOMO / eV',
        'LUMO / eV'],
       ['MATS5v', 'P_VSA_MR_5', 'Mor04m', 'E1p', 'R5p+', 'LUMO / 

In [13]:
# Most frequently selected subset: the row whose count is largest
# (first such row if several tie).
vals[counts.argmax()]

array(['P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor04m', 'Mor22s', 'E1p',
       'LUMO / eV'], dtype='<U12')

In [14]:
# Inspect the target values of the held-out validation rows.
y_valid

0     0.328
5     0.896
36    0.817
45    0.765
13    0.893
54    0.733
Name: LinIE ZE41, dtype: float64

In [17]:
# Runner-up subset: the row with the second-highest selection count.
# It is located by rank rather than by looking up a literal count copied from
# one run's output, so the cell keeps working when the counts change.
# Sorting the negated counts with a stable sort orders rows by descending
# count and keeps the earliest row first among ties.
vals[np.argsort(-counts, kind='stable')[1]]

array(['P_VSA_MR_5', 'Mor04m', 'Mor22s', 'E1p', 'HOMO / eV', 'LUMO / eV'],
      dtype='<U12')