In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt

In [3]:
# One integer random seed per repetition of the feature-selection run (0 .. 99).
seeds = np.arange(0, 100)

In [4]:
# Load the descriptor table: first row is the header, fields are separated by
# ';' and numbers use ',' as the decimal mark.
# The relative path is written with forward slashes because they resolve on
# Windows as well as on Linux/macOS; a backslash-separated path only works on
# Windows.
data = pd.read_csv('../Data/ze41_mol_desc_db_red.csv', header=0, sep=';', decimal=',')

In [5]:
# Split the table by column position: the column at index 2 is the regression
# target and every column from index 3 onwards is a candidate feature.
# NOTE(review): columns 0-1 are not used here -- presumably identifiers; confirm.
col_names = data.columns
y = data.loc[:, col_names[2]]
X = data.loc[:, col_names[3:]]

In [10]:
# Rescale every feature column to the [0, 1] range and wrap the resulting
# array in a DataFrame that carries the original column labels, so selected
# features can later be reported by name.
X_scaled = pd.DataFrame(
    MinMaxScaler(feature_range=(0, 1)).fit_transform(X),
    columns=X.columns,
)

In [8]:
# Accumulator for the feature subsets picked by each seeded RFE run.
selected_cols = list()

In [9]:
# Repeat recursive feature elimination (RFE) once per seed and record which
# five features survive each run.
# The result list is (re)created in this cell so that re-running the cell
# starts from an empty list instead of appending a second batch of results,
# which would distort the subset counts computed afterwards.
selected_cols = []
for seed in seeds:
    # Only the forest's random_state differs between repetitions.
    rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True, bootstrap=True, random_state=seed)
    # A float step in (0, 1) is interpreted by RFE as the fraction of features
    # to drop per elimination round (see the scikit-learn RFE documentation).
    selector = RFE(rf, n_features_to_select=5, step=0.1).fit(X_scaled, y)
    # support_ is a boolean mask over the columns the selector was fitted on;
    # index the column labels with it to get the surviving feature names.
    selected_cols.append(X_scaled.columns[selector.support_].tolist())

In [11]:
# Display the feature subset chosen in each RFE run (one inner list per seed).
selected_cols

[['P_VSA_MR_5', 'Mor04m', 'E1p', 'HOMO / eV', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'E1p', 'HOMO / eV', 'LUMO / eV'],
 ['P_VSA_MR_5', 'TDB04s', 'Mor04m', 'CATS3D_02_AP', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'E1p', 'HOMO / eV', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'Mor14s', 'E1p', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'E1p', 'CATS3D_02_AP', 'LUMO / eV'],
 ['P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor04m', 'E1p', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'E1p', 'H3m', 'LUMO / eV'],
 ['MATS5v', 'P_VSA_MR_5', 'Mor04m', 'E1p', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'E1p', 'CATS3D_02_AP', 'LUMO / eV'],
 ['P_VSA_MR_5', 'TDB04s', 'Mor04m', 'E1p', 'LUMO / eV'],
 ['MATS5v', 'P_VSA_MR_5', 'Mor04m', 'E1p', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'E1p', 'HOMO / eV', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'Mor14s', 'E1p', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'E1p', 'CATS3D_02_AP', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'E1p', 'CATS3D_02_AP', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m', 'E1p', 'HOMO /

In [15]:
# Collapse the per-seed subsets into distinct rows and count how many runs
# produced each one. axis=0 makes np.unique compare whole rows (complete
# feature subsets) rather than individual feature names.
subsets_arr = np.asarray(selected_cols)
vals, counts = np.unique(subsets_arr, return_counts=True, axis=0)

In [16]:
# Number of runs that produced each distinct feature subset; counts[i] belongs to vals[i].
counts

array([ 1,  2,  8,  1,  3,  1, 40,  1, 16,  1,  8,  3,  2,  1,  1,  1,  3,
        1,  1,  3,  2], dtype=int64)

In [17]:
# The distinct feature subsets found across all runs, one row per subset
# (rows are returned in sorted order by np.unique).
vals

array([['GATS4s', 'P_VSA_MR_5', 'Mor04m', 'E1p', 'LUMO / eV'],
       ['GATS5v', 'P_VSA_MR_5', 'Mor04m', 'E1p', 'LUMO / eV'],
       ['MATS5v', 'P_VSA_MR_5', 'Mor04m', 'E1p', 'LUMO / eV'],
       ['MATS5v', 'P_VSA_MR_5', 'Mor04m', 'HOMO / eV', 'LUMO / eV'],
       ['P_VSA_LogP_2', 'P_VSA_MR_5', 'Mor04m', 'E1p', 'LUMO / eV'],
       ['P_VSA_MR_5', 'Mor04m', 'CATS3D_02_AP', 'HOMO / eV', 'LUMO / eV'],
       ['P_VSA_MR_5', 'Mor04m', 'E1p', 'CATS3D_02_AP', 'LUMO / eV'],
       ['P_VSA_MR_5', 'Mor04m', 'E1p', 'H3m', 'LUMO / eV'],
       ['P_VSA_MR_5', 'Mor04m', 'E1p', 'HOMO / eV', 'LUMO / eV'],
       ['P_VSA_MR_5', 'Mor04m', 'E1p', 'R5p+', 'LUMO / eV'],
       ['P_VSA_MR_5', 'Mor04m', 'Mor14s', 'E1p', 'LUMO / eV'],
       ['P_VSA_MR_5', 'Mor04m', 'Mor22s', 'E1p', 'LUMO / eV'],
       ['P_VSA_MR_5', 'Mor04m', 'Mor29v', 'E1p', 'LUMO / eV'],
       ['P_VSA_MR_5', 'Mor11u', 'Mor14u', 'Mor04m', 'LUMO / eV'],
       ['P_VSA_MR_5', 'Mor14s', 'E1p', 'CATS3D_02_AP', 'LUMO / eV'],
       ['P_VSA_MR_