# Santander Model Selection Pipelines

#### Create Pipelines for each preprocessing technique


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from dlxtools.preprocessing import PandasRobustScaler

In [2]:
#Doesn't have to be a transformer as no information bleeding through process.
class ConstantFeatureDropper(TransformerMixin, BaseEstimator):
    """
    Transformer drops features from DataFrame that 
    """
    
    def __init__(self):
        pass
    
    def fit(self, X, y = None):
        
        #Isolate numerical columns (in secom this is all)
        numerical_columns = X.select_dtypes([np.number]).columns
        
        #calculatet the standard deviation of numerical columns
        standard_deviation = X[numerical_columns].std()
        
        #Indicate which columns have no standard deviation
        self.columns_to_drop = standard_deviation[standard_deviation == 0].index           
        
        return self
    
    def transform(self, X):
        return X.drop(self.columns_to_drop, axis = 'columns')

In [14]:
class PCAVarThreshSelector(PCA):
    """
    Description
    -----------
    Selects the columns that can explain a certain percentage of the variance in a data set
    
    Authors
    -------
    Eden Trainor
    
    Notes
    -----
    1. PCA has a principole component limit of 4459 components, no matter how many more features you put into
    it this is a hrad limit of how many components it will return to you.
  
    """
    
    def __init__(self, 
                 n_components=None, 
                 copy=True, 
                 whiten=False, 
                 svd_solver='auto', 
                 tol=0.0, 
                 iterated_power='auto', 
                 random_state=None, 
                 explained_variance_thresh = 0.8):
        
        
        super().__init__(n_components, 
                         copy, 
                         whiten, 
                         svd_solver, 
                         tol, 
                         iterated_power, 
                         random_state)

        
        #Set threshold
        self.explained_variance_thresh = explained_variance_thresh
        
        #Check threshold is in valid range
        if not (0 < explained_variance_thresh <= 1):
            raise (ValueError('explained_variance_thresh must be between 0 and 1 (default 0.8), '.format(
                explained_variance_thresh)))
            
        
        
    def find_nearest_index(self, array, value):
        """
        Description
        -----------
        Finds the index of the coefficient in an array nearest a certain value.
        
        
        Args
        ----
        array: np.ndarray, (number_of_componants,)
            Array containing coeffficients 
        
        value: int,
            Index of coefficient in array closset to this value is found.
        
        
        Returns
        -------
        index: int,
            Index of coefficient in array closest to value.
        """
               
        index = (np.abs(array - value)).argmin()
        
        print('{}: {} features are needed to explain {:.3f}% of the variance in the data.'.format(
            self.__class__, 
            index, 
            array[index]*100))
        
        return index
    
        
    def fit(self, X, y = None):
        """
        Description
        -----------
        Fits the PCA and calculates the index threshold index of the cumulative explained variance ratio array.
        
        
        Args
        ----
        X: DataFrame, (examples, features)
            Pandas DataFrame containing training example features
            
        y: array/DataFrame, (examples,)
            (Optional) Training example labels
        
        Returns
        -------
        self: PCAVarThreshSelector instance
            Returns transfromer instance with fitted instance variables on training data.
        """
        
        assert isinstance(X, pd.DataFrame), 'input isn\'t pandas DataFrame'
        
        #PCA fit the dataset
        super().fit(X)
        
        #Get the cumulative explained variance ratio array (ascending order of cumulative variance explained)
        cumulative_EVR = self.explained_variance_ratio_.cumsum()
        
        #Finds the index corresponding to the threshold amount of variance explained
        self.indx = self.find_nearest_index(array = cumulative_EVR, 
                                            value = self.explained_variance_thresh)
        
        
        return self
    
    def transform(self, X):
        """
        Description
        -----------        
        Selects all the principle components up to the threshold variance.
        
        
        Args
        ----
        X: DataFrame, (examples, features)
            Pandas DataFrame containing training example features


        Returns
        -------
        self: np.ndarray, (examples, indx)
            Array containing the minimum number of principle componants required by explained_variance_thresh.
        """
        
        assert isinstance(X, pd.DataFrame)
        
        #Trnasform data into principal componant mode
        all_components =  super().transform(X)
        
        
        
        return pd.DataFrame(all_components[:, :self.indx], index = X.index)
    
    def fit_transform(self, X, y = None):
        """
        Description
        -----------
        Combines fit and transform methods. 
        This is especially required in this class to overwrite the fit_transform in PCA as fit method not called in 
        PCA fit_transform method.
        
        Args
        ----
        X: DataFrame, (examples, features)
            Pandas DataFrame containing training example features
            
        y: array/DataFrame, (examples,)
            (Optional) Training example labels            
        
        Returns
        -------
        self: np.ndarray, (examples, indx)
            Array containing the minimum number of principle componants required by explained_variance_thresh.
        """
                            
        return self.fit(X, y).transform(X)
        

#### Build the Pipelines

In [11]:
cleaning_pipe =  Pipeline([('cfd', ConstantFeatureDropper())])                #Drops constant columns

pre_processing_pipes = Pipeline([('cp', cleaning_pipe),
                       ('rs', PandasRobustScaler()),
                       ('pcavts', PCAVarThreshSelector())])
                       
lin_reg_kwargs = {}


model_pipe = Pipeline([
    ('ppp', Pipeline([
        ('cp', cleaning_pipe),
        ('rs', PandasRobustScaler()),
        ('pcavts', PCAVarThreshSelector(whiten = True))
    ])),
    ('lin_reg', LinearRegression(**lin_reg_kwargs))
])

### Test Pipelines

Make a fake regressional dataset:

In [6]:
X, y = make_regression(n_samples = 5000, n_features = 4000, effective_rank = 150, noise = 0.25, )

X = pd.DataFrame(X)
y = pd.Series(y)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

Fit and score pipeline to check it's working:

In [15]:
model_pipe.fit(X_train, y_train)

<class '__main__.PCAVarThreshSelector'>: 836 features are needed to explain 79.99969925387471% of the variance in the data.


Pipeline(memory=None,
     steps=[('ppp', Pipeline(memory=None,
     steps=[('cp', Pipeline(memory=None, steps=[('cfd', ConstantFeatureDropper())])), ('rs', PandasRobustScaler(copy=True, quantile_range=(25.0, 75.0),
          with_centering=True, with_scaling=True)), ('pcavts', PCAVarThreshSelector(copy=True, explained_varian...rue))])), ('lin_reg', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])

In [16]:
model_pipe.score(X_test, y_test)

0.5395312685166831