In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import shap

In [2]:
# Load the training data from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="./genetic_algorithm/train.csv")

In [4]:
# Name of the label column; every other numeric/boolean column is a predictor.
target = 'Survived'
# Keep the DataFrame's own column order: a set difference gives an ordering that
# depends on string hashing, so the feature order (and the model fitted on it)
# could change between kernel restarts.
predictors = [col for col in df.select_dtypes(include=['number', 'bool']).columns if col != target]

In [5]:
# Replace missing values in the predictor columns with 0.
df[predictors] = df[predictors].fillna(value=0)

In [6]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df[predictors],
    df[target],
    test_size=0.2,
    random_state=42,
)

In [142]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import shap

class RandomFeatureSelector:
    """
    Feature selection that keeps the subset of variables whose importance is
    greater than the importance of a randomly generated variable.

    A random column is appended to a copy of the training data, the classifier
    is fitted on the augmented data, and every feature whose importance is
    strictly greater than the random column's importance is selected.
    The default classifier is a random forest classifier.
    Class built for binary classification problems.

    Attributes set by ``fit``:
        rand_var_name: name given to the random column.
        features_selected: list with the names of the selected columns.

    Author:
        Caio Martins Ramos de Oliveira

    Since:
        2019-12
    """
    def __init__(self,
                 rand_var_type = 'integer',
                 low_end=0,
                 high_end=10,
                 clf=None,
                 importance_method = 'shap',
                 random_state = None
                ):
        """
        Initializer of the Random Feature Selector class.

        Args:
            rand_var_type: kind of random column to generate, 'integer' or
                'float'.
            low_end: lower bound of the random values.
            high_end: upper bound of the random values (inclusive for
                'integer', exclusive for 'float').
            clf: classifier to fit. When None, a balanced
                RandomForestClassifier is created. It must expose ``fit`` and,
                for importance_method='randomforest', ``feature_importances_``.
            importance_method: 'shap' (mean absolute SHAP value per feature)
                or 'randomforest' (the classifier's ``feature_importances_``).
            random_state: optional seed for the random column. When given,
                every call to ``fit`` draws the same random column; when None,
                NumPy's global random state is used.

        Author:
            Caio Martins Ramos de Oliveira

        Since:
            2019-12
        """
        if clf is None:
            clf = RandomForestClassifier(class_weight='balanced',
                                         n_estimators=50,
                                         max_depth=5,
                                         random_state=42,
                                         n_jobs=-1)
        # Always store the classifier, including a user-supplied one.
        self.clf = clf
        self.rand_var_type = rand_var_type
        self.importance_method = importance_method
        self.low_end = low_end
        self.high_end = high_end
        self.random_state = random_state

    def _rng(self):
        """Return the random source: a seeded generator, or NumPy's global one."""
        if self.random_state is None:
            return np.random
        # A private generator avoids reseeding NumPy's global state as a side effect.
        return np.random.RandomState(self.random_state)

    def _gen_rand_var_name(self, X):
        """Store in ``rand_var_name`` the first 'rand_var_<n>' not already a column of X."""
        col = 'rand_var_{}'
        counter = 0
        while col.format(str(counter)) in X.columns:
            counter += 1
        self.rand_var_name = col.format(str(counter))

    def _random_function_selector(self, X):
        """
        Add the random column ``rand_var_name`` to X (X is modified in place).

        Raises:
            AttributeError: if ``rand_var_type`` is neither 'integer' nor 'float'.
        """
        rng = self._rng()
        n_rows = X.shape[0]
        if self.rand_var_type == 'integer':
            # randint excludes its upper bound, hence the +1 to include high_end.
            X[self.rand_var_name] = rng.randint(self.low_end, self.high_end+1, size=n_rows)
        elif self.rand_var_type == 'float':
            # Uniform draw over [low_end, high_end).
            X[self.rand_var_name] = rng.uniform(self.low_end, self.high_end, size=n_rows)
        else:
            raise AttributeError('Tipo de variável randômica inválida.')

    def _random_feature_selector(self, X, y):
        """
        Compute per-feature importances with the configured method and store in
        ``features_selected`` the columns that beat the random column.

        Raises:
            ValueError: if ``importance_method`` is not 'shap' or 'randomforest'.
        """
        if self.importance_method == 'shap':
            shap_vals = shap.TreeExplainer(self.clf).shap_values(X, y)
            if isinstance(shap_vals, list):
                # One array per class: take the positive class (index 1).
                class_vals = np.asarray(shap_vals[1])
            else:
                shap_vals = np.asarray(shap_vals)
                # NOTE(review): newer shap releases appear to return a single
                # (samples, features, classes) array for multi-output models;
                # a 2-D array is treated as single-output values - confirm
                # against the installed shap version.
                class_vals = shap_vals[..., 1] if shap_vals.ndim == 3 else shap_vals
            importances = np.abs(class_vals).mean(axis=0)
        elif self.importance_method == 'randomforest':
            importances = self.clf.feature_importances_
        else:
            raise ValueError('Método de importância inválido.')
        # Rank with the importances of the chosen method, indexed by column name.
        feature_importances = pd.Series(np.asarray(importances, dtype=float), index=X.columns)
        rand_importance = feature_importances.loc[self.rand_var_name]
        # Strict comparison: the random column itself is never selected.
        self.features_selected = list(feature_importances[feature_importances > rand_importance].index)

    def fit(self, X, y):
        """
        Fit the classifier on X plus a random column and select the features
        that are more important than that column.

        X itself is left untouched; the random column is added to a copy.

        Args:
            X: DataFrame with the candidate features.
            y: target values.

        Returns:
            The fitted selector (self).
        """
        X_aug = X.copy()
        self._gen_rand_var_name(X_aug)
        self._random_function_selector(X_aug)
        self.clf.fit(X_aug, y)
        self._random_feature_selector(X_aug, y)
        return self

    def transform(self, X, y=None):
        """Return X restricted to the columns selected by ``fit`` (y is ignored)."""
        X = X[self.features_selected]
        return X

    def fit_transform(self, X, y):
        """Fit the selector on (X, y) and return X restricted to the selected columns."""
        self.fit(X, y)
        return self.transform(X, y)

In [161]:
# Seed the selector so the random benchmark column, and therefore the list of
# selected features, is reproducible on Restart & Run All.
rfs = RandomFeatureSelector(importance_method='shap', high_end=50, random_state=42)
# The trailing semicolon keeps the cell free of output whatever fit returns.
rfs.fit(X_train, y_train);

In [165]:
# Names of the columns the fitted selector kept.
rfs.features_selected

In [164]:
# Re-fit the selector on the training data and preview the reduced feature matrix.
rfs.fit_transform(X=X_train, y=y_train).head(n=5)

Unnamed: 0,PassengerId,Pclass,Fare
331,332,1,28.5
733,734,2,13.0
382,383,3,7.925
704,705,3,7.8542
813,814,3,31.275
