In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt

In [2]:
# Seed for the single train/validation split performed further down.
random_state = 42
# One forest seed per feature-selection repetition: 0, 1, ..., 99.
seeds = np.arange(0, 100)

In [3]:
# Load the dataset: the file is semicolon-delimited, uses a decimal comma,
# and has its column names in the first row.
# Forward slashes keep this relative path valid on Windows, Linux and macOS;
# a backslash-separated path is only understood on Windows.
data = pd.read_csv('../Data/ze41_mol_desc_db_red.csv', header=0, sep=';', decimal=',')

In [4]:
# Column layout used below: position 2 is the regression target and every
# column from position 3 onward is a candidate feature. Positions 0-1 are
# not used by this notebook.
col_names = data.columns
target_name = col_names[2]
feature_names = col_names[3:]
y = data[target_name]
X = data[feature_names]

In [5]:
# Rescale every feature column to the [0, 1] range.
# The result is wrapped back into a DataFrame that carries X's column names
# AND X's row index, so X_scaled stays label-aligned with y even if the data
# is ever loaded with a non-default index.
# The fitted scaler is kept so the same transform can be applied to other data.
# NOTE(review): the scaler is fitted on all rows before the train/validation
# split, so validation rows influence the min/max. Fit on the training split
# only if these scaled values ever feed a scale-sensitive model.
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [6]:
# Hold out 10% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_scaled,
    y,
    test_size=0.1,
    random_state=random_state,
)

In [7]:
# Accumulator for the feature names chosen in each feature-selection run
# (one inner list per run).
selected_cols = list()

In [8]:
# Repeat recursive feature elimination (RFE) once per seed and record which
# two columns survive each run, to see how stable the selection is.
#
# The result list is rebuilt from scratch here so that re-running this cell
# does not append a second batch of results onto an already-filled list,
# which would silently double the counts computed later.
selected_cols = []
for seed in seeds:
    # Only the forest's random_state changes between repetitions.
    rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True, bootstrap=True, random_state=seed)
    # A float step in (0, 1) removes that fraction of the features per
    # elimination round instead of a fixed number of features.
    selector = RFE(rf, n_features_to_select=2, step=0.1).fit(X_train, y_train)
    # support_ is a boolean mask over the columns of the data passed to fit,
    # so it indexes X_train's columns directly.
    selected_cols.append(X_train.columns[selector.support_].tolist())

In [9]:
# Display the feature pair kept by each run (one inner list per seed).
selected_cols

[['P_VSA_MR_5', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m'],
 ['P_VSA_MR_5', 'Mor04m'],
 ['P_VSA_MR_5', 'Mor04m'],
 ['P_VSA_MR_5', 'LUMO / eV'],
 ['P_VSA_MR_5', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor22s'],
 ['P_VSA_MR_5', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m'],
 ['P_VSA_MR_5', 'LUMO / eV'],
 ['P_VSA_MR_5', 'LUMO / eV'],
 ['P_VSA_MR_5', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor22s'],
 ['P_VSA_MR_5', 'LUMO / eV'],
 ['P_VSA_MR_5', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m'],
 ['P_VSA_MR_5', 'LUMO / eV'],
 ['P_VSA_MR_5', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m'],
 ['P_VSA_MR_5', 'LUMO / eV'],
 ['P_VSA_MR_5', 'LUMO / eV'],
 ['P_VSA_MR_5', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m'],
 ['P_VSA_MR_5', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m'],
 ['P_VSA_MR_5', 'Mor04m'],
 ['P_VSA_MR_5', 'LUMO / eV'],
 ['P_VSA_MR_5', 'LUMO / eV'],
 ['P_VSA_MR_5', 'LUMO / eV'],
 ['P_VSA_MR_5', 'LUMO / eV'],
 ['P_VSA_MR_5', 'LUMO / eV'],
 ['P_VSA_MR_5', 'LUMO / eV'],
 ['P_VSA_MR_5', 'Mor04m'],
 ['P_VSA_MR_5', 'Mor04m'],
 ['P_VSA_MR_5', 'Mo

In [10]:
# Tally how often each distinct feature pair was selected: stack the per-run
# pairs into a 2-D array, then collapse identical rows (axis=0) while
# counting their occurrences.
pair_rows = np.asarray(selected_cols)
vals, counts = np.unique(pair_rows, return_counts=True, axis=0)

In [11]:
# Number of runs that produced each distinct pair, in the same order as the rows of vals.
counts

array([ 5, 54, 39,  2], dtype=int64)

In [12]:
# The distinct feature pairs, one pair per row.
vals

array([['Mor04m', 'LUMO / eV'],
       ['P_VSA_MR_5', 'LUMO / eV'],
       ['P_VSA_MR_5', 'Mor04m'],
       ['P_VSA_MR_5', 'Mor22s']], dtype='<U10')

In [13]:
# Feature pair selected most often across all runs. argmax returns the first
# maximum, so a tie resolves to the earliest row of vals.
vals[counts.argmax()]

array(['P_VSA_MR_5', 'LUMO / eV'], dtype='<U10')