# Feature Selection Class (correlation-based)

In [None]:
import random

import numpy as np
import pandas as pd
import sklearn
import sklearn.base

In [None]:
class VariableSelector(sklearn.base.BaseEstimator):
    '''Variable selector based on intracorrelation.

    Select a subset of the columns of a pandas DataFrame so that the absolute
    pairwise correlation among the remaining columns is reduced. Variable
    selection is typically used for dimensionality reduction, and/or to reduce
    correlation within covariates.

    Parameters
    ----------
    threshold : float, default=0.3
                Columns are dropped until no pair of remaining columns has an
                absolute correlation coefficient above this value (method
                'slowdescent' may stop earlier, see "minimum").
    minimum : int, default=5
              Minimum number of variables that must remain in the subset
              regardless of the maximum correlation coefficient specified in
              "threshold". Only honoured by method 'slowdescent'; method
              'efficient' ignores it.
    method : str, 'efficient' or 'slowdescent', default='efficient'
             Selection strategy used by ``fit``.

    Attributes
    ----------
    columns : list
              Names of the selected columns (set by ``fit``).
    results : dict
              Summary of the selection (set by ``fit``): method, maximum
              correlation coefficient in the subset, selected columns, number
              of selected columns, dropped columns and number of dropped
              columns.

    '''

    def __init__(self, threshold=0.3, minimum=5, method = 'efficient'):

        self.threshold = threshold
        self.minimum = minimum
        self.method = method

    def fit(self, X, y=None):
        '''Select variables using method "efficient" or "slowdescent".

        Parameters
        ----------
        X : pandas.DataFrame of regressors
        y : ignored; accepted for API compatibility only

        Returns
        -------
        self

        Raises
        ------
        ValueError : if ``method`` is neither 'efficient' nor 'slowdescent'
        '''

        if self.method == 'efficient':
            self.columns, self.results = self.get_variable_selection_efficient(
                X, self.threshold)
        elif self.method == 'slowdescent':
            self.columns, self.results = self.get_variable_selection_correlation(
                X, y, self.threshold, self.minimum)
        else:
            # Fail here instead of leaving the estimator unfitted and
            # crashing later in transform() with an AttributeError.
            raise ValueError(
                "method must be 'efficient' or 'slowdescent', got %r" % (self.method,))

        return self

    def transform(self, X):
        '''Subset dataframe to the columns selected by ``fit``.

        Parameters
        ----------
        X : dataset of regressors (pandas.DataFrame containing the selected columns)

        Returns
        -------
        X restricted to the selected columns

        '''

        return X[self.columns]

    @staticmethod
    def _max_coef(df):
        '''Return the largest coefficient in a long-format correlation frame.

        NaN coefficients (e.g. from constant columns) are skipped. When no
        usable pair exists (single column, or only NaN) 0.0 is returned so
        callers can compare the result against the threshold safely.
        '''

        top = df['coef'].max()
        return 0.0 if pd.isna(top) else top

    def _build_report(self, original_columns, X, df):
        '''Build the results dictionary shared by both selection methods.'''

        selected = list(X)
        dropped_cols = set(original_columns).difference(selected)

        return {'method':self.method,
                'max corr-coef value': self._max_coef(df),
                'selected columns':selected,
                'number of selected columns':len(selected),
                'dropped columns':dropped_cols,
                'number of dropped columns':len(dropped_cols)}

    def get_variable_selection_efficient(self, X, threshold):
        '''Select subset of variables (method "efficient").

        Repeatedly takes the variable with the highest summed correlation and
        drops every variable correlated with it above ``threshold``, until no
        pair above the threshold is left.

        Returns
        -------
        list : list of selected variables
        dicc : dictionary indicating method, the maximum correlation coefficient in subset,
               selected columns, number of selected columns, dropped columns (not selected),
               and number of dropped columns
        '''

        columns = list(X)

        # long-format (absolute) correlation matrix of the current subset
        df = self.correl(X)

        while self._max_coef(df) > threshold:

            # sum up (signed) correlation coefficients across all variables
            # for each variable; the self-correlation of 1 is removed
            totals = (X.corr().sum() - 1).sort_values(ascending=False)

            # all pairs that violate the threshold
            over = df[df['coef'] > threshold]

            # walk variables from highest to lowest cumulated corr coef and
            # take the first one that has partners above the threshold
            cols = []
            for candidate in totals.index:
                cols = list(over.loc[over['level_0'] == candidate, 'level_1'])
                if cols:
                    break

            if not cols:
                # defensive: nothing left to drop, avoid looping forever
                break

            # drop cols from X and refresh the correlation frame
            X = X.drop(columns=cols)
            df = self.correl(X)

        return list(X), self._build_report(columns, X, df)

    def get_unstacked_correlation_dataframe(self, X):
        '''Unstack absolute correlation matrix into long format.

        Returns a frame with columns 'level_0', 'level_1' and 0 (coefficient).
        '''

        corr = abs(X.corr())
        # Name the index levels explicitly: a named column index on X would
        # otherwise produce two identically named levels and make
        # reset_index() fail.
        df = corr.unstack().rename_axis(['level_0', 'level_1']).reset_index()

        return df

    def get_cleaned_unstacked_correlation_dataframe(self, df):
        '''Reformat unstacked correlation matrix.

        Removes self-pairs, names the columns 'level_0', 'level_1', 'coef' and
        sorts by 'coef' in descending order with a fresh 0..n-1 index.
        '''

        df = df.copy()
        df.columns = ['level_0', 'level_1', 'coef']
        df = df[df['level_0'] != df['level_1']]
        df = df.sort_values('coef', ascending=False).reset_index(drop=True)

        return df

    def correl(self, X):
        '''Transform regressor matrix X into unstacked and reformatted correlation matrix.'''

        df = self.get_unstacked_correlation_dataframe(X)
        df = self.get_cleaned_unstacked_correlation_dataframe(df)

        return df

    def get_variable_selection_correlation(self, X,y, threshold, minimum):
        '''Select subset of variables (method "slowdescent").

        Parameters
        ----------
        X : pandas.DataFrame of regressors
        y : unused, kept for interface compatibility
        threshold : maximum tolerated absolute correlation coefficient
        minimum : minimum number of variables that must remain

        Returns
        -------
        list : list of selected variables
        dicc : dictionary indicating method, the maximum correlation coefficient in subset,
               selected columns, number of selected columns, dropped columns (not selected),
               and number of dropped columns
        '''

        columns_list = list(X)

        # never go below "minimum" columns, and never drop every column
        keep_at_least = max(minimum, 1)

        # long-format (absolute) correlation matrix of the current subset
        df = self.correl(X)

        while X.shape[1] > keep_at_least and self._max_coef(df) > threshold:

            # for each variable in X, get variable (level_1) with highest
            # correlation coefficient, keeping only pairs above the threshold
            is_group_max = df.groupby('level_0')['coef'].transform('max') == df['coef']
            df_grouped = df[is_group_max & (df['coef'] > threshold)].sort_values(
                'coef', ascending=False)

            if df_grouped.empty:
                # defensive: nothing left to drop, avoid looping forever
                break

            # count number of occurrences of level_1
            df_count = df_grouped.groupby('level_1')['coef'].count().sort_values(
                ascending=False)

            # get level_1 vars with more than one occurrence
            cols = list(df_count.index[df_count > 1])

            if not cols:
                # no variable is the top partner of several others: resolve
                # the single most correlated pair. Label 0 is the globally
                # highest coefficient (df is sorted and re-indexed), so it is
                # always present in df_grouped here.
                zero = df_grouped.loc[0,'level_0']
                one = df_grouped.loc[0,'level_1']

                # compare variance, kick out variable with lower variance
                var_zero = np.var(X[zero])
                var_one = np.var(X[one])
                if var_zero > var_one:
                    cols = [one]
                elif var_one > var_zero:
                    cols = [zero]
                else:
                    # equal variance: pick one of the two at random
                    cols = [random.choice([zero, one])]

            # cap the number of drops so at least "minimum" columns remain
            cols = cols[:X.shape[1] - keep_at_least]

            # drop cols from X and refresh the correlation frame
            X = X.drop(columns=cols)
            df = self.correl(X)

        return list(X), self._build_report(columns_list, X, df)

    