In [1]:
import sklearn

In [2]:
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor

import pandas as pd
import numpy as np

In [3]:
# Fixed seed so a fresh "Restart & Run All" regenerates the same synthetic data.
# NOTE: the outputs recorded further down came from an unseeded run and will
# differ numerically once the cells are re-executed.
X, y = make_regression(random_state=0)
pdf = pd.DataFrame(X)
# Name the columns c0..c{n-1} from the actual width instead of assuming 100.
pdf.columns = ['c{}'.format(x) for x in range(X.shape[1])]

In [4]:
# Display the dimensions of the generated feature matrix.
X.shape

(100, 100)

In [5]:
# Split the frame by column name: X1 holds c50..c99, X2 holds c0..c49.
X1 = pdf.loc[:, list(map('c{}'.format, range(50, 100)))]
X2 = pdf.loc[:, list(map('c{}'.format, range(0, 50)))]

In [6]:
class GraftingRegressor(SGDRegressor):
    """SGD regressor that grows its coefficient vector when new feature
    columns arrive and prunes newly added columns whose weight stays small.

    How the pieces fit together:

    * ``fit`` is not overridden; it trains on an initial set of columns.
    * ``partial_fit`` expects a matrix whose leading columns are the ones the
      model already has coefficients for, followed by any new candidate
      columns. The coefficient vector is zero-padded to the new width, one
      SGD pass is run, and every *new* column whose ``abs(coef)`` does not
      exceed ``reg_penalty`` is dropped again.
    * ``predict`` removes the dropped columns before delegating.

    Dropped columns are remembered by their position in the full matrix, so
    later calls must keep passing matrices with the same column layout.

    Parameters
    ----------
    reg_penalty : float or None, default None
        Threshold on ``abs(coef_)`` a newly added column must exceed to be
        kept. Falls back to ``l1_ratio`` when None.

    All remaining parameters are forwarded unchanged to ``SGDRegressor``.

    Attributes
    ----------
    filter_cols : list of int
        Positions (in the full input matrix) of columns that were pruned.
    base_shape : int or None
        Number of coefficients that existed before the most recent
        ``partial_fit`` call; these are never pruned.
    """

    def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001,
                 l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None,
                 shuffle=True, verbose=0, epsilon=0.1,
                 random_state=None, learning_rate="invscaling", eta0=0.01,
                 power_t=0.25, warm_start=False, average=False, n_iter=None, reg_penalty=None):
        super(GraftingRegressor, self).__init__(loss=loss, penalty=penalty,
                                           alpha=alpha, l1_ratio=l1_ratio,
                                           fit_intercept=fit_intercept,
                                           max_iter=max_iter, tol=tol,
                                           shuffle=shuffle,
                                           verbose=verbose,
                                           epsilon=epsilon,
                                           random_state=random_state,
                                           learning_rate=learning_rate,
                                           eta0=eta0, power_t=power_t,
                                           warm_start=warm_start,
                                           average=average, n_iter=n_iter)
        self.filter_cols = []
        self.base_shape = None
        self.reg_penalty = reg_penalty if reg_penalty is not None else l1_ratio

    def _fit_columns(self, X, return_x=True):
        """Remove the columns listed in ``self.filter_cols`` from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame or 2-D array-like supporting ``X[:, mask]``
            Full-width input; ``filter_cols`` indexes its columns by position.
        return_x : bool, default True
            When False, return only the boolean keep-mask (True = keep).

        Returns
        -------
        The filtered ``X`` (the very same object when nothing is filtered),
        or the boolean mask of length ``X.shape[1]`` if ``return_x`` is False.
        """
        # Builtin ``bool`` instead of the ``np.bool`` alias, which newer NumPy
        # releases no longer provide.
        keep_mask = np.ones(X.shape[1], dtype=bool)
        nothing_filtered = len(self.filter_cols) == 0
        if not nothing_filtered:
            keep_mask[self.filter_cols] = False
        if not return_x:
            return keep_mask
        if nothing_filtered:
            return X
        if isinstance(X, pd.DataFrame):
            # Select by position so duplicate column labels cannot pull in
            # extra columns; isinstance also covers DataFrame subclasses.
            return X.iloc[:, np.flatnonzero(keep_mask)]
        return X[:, keep_mask]

    def _reg_penalty(self):
        """Prune newly added coefficients that did not clear ``reg_penalty``.

        The first ``base_shape`` coefficients are always kept. Of the rest,
        only those with ``abs(coef) > reg_penalty`` survive. ``coef_`` is
        shrunk accordingly and the positions of the pruned columns, expressed
        in the full input matrix, are merged into ``filter_cols``.
        """
        n_coef = self.coef_.shape[0]
        already_dropped = [int(i) for i in self.filter_cols]

        # ``coef_`` has one entry per column that is still active, so the full
        # matrix width is the active count plus the count already dropped.
        # Recover each active coefficient's position in that full matrix.
        full_mask = np.ones(n_coef + len(already_dropped), dtype=bool)
        if already_dropped:
            full_mask[already_dropped] = False
        active_cols = np.flatnonzero(full_mask)

        # Boolean mask keeps coefficient order deterministic (the previous
        # set-based index list relied on unspecified iteration order).
        keep = np.abs(self.coef_) > self.reg_penalty
        # Pre-existing coefficients are exempt from pruning. If base_shape was
        # never set (None) this slice covers everything, i.e. nothing is pruned.
        keep[:self.base_shape] = True

        self.coef_ = self.coef_[keep]
        newly_dropped = active_cols[~keep].tolist()
        # Accumulate instead of overwriting so earlier prunes are not forgotten.
        self.filter_cols = sorted(set(already_dropped).union(newly_dropped))

    def _partial_grafting_fit(self, X, y):
        """Prepare ``coef_`` for a ``partial_fit`` on a possibly wider ``X``.

        Drops already-pruned columns from ``X``, records the current number
        of coefficients in ``base_shape`` and zero-pads ``coef_`` so it has
        one entry per remaining column (existing weights stay in front).

        Parameters
        ----------
        X : full-width input matrix (see ``_fit_columns``).
        y : unused; kept for signature compatibility.

        Returns
        -------
        The filtered ``X`` that must be handed to the parent ``partial_fit``.

        Raises
        ------
        ValueError
            If the filtered ``X`` has fewer columns than existing coefficients.
        """
        X = self._fit_columns(X)
        n_features = X.shape[1]

        current = getattr(self, "coef_", None)
        if current is None:
            # Nothing fitted yet: every column is part of the base model and
            # the parent class will allocate the coefficients itself.
            self.base_shape = n_features
            return X

        # Needed later to tell pre-existing coefficients from grafted ones.
        self.base_shape = current.shape[0]
        if n_features < self.base_shape:
            raise ValueError(
                "X has %d usable columns but the model already holds %d "
                "coefficients" % (n_features, self.base_shape))

        grown = np.zeros(n_features, dtype=np.float64, order="C")
        grown[:self.base_shape] = current
        self.coef_ = grown
        return X

    def partial_fit(self, X, y, sample_weight=None):
        """Run one SGD pass on ``X``, grafting in new columns and pruning weak ones.

        Parameters
        ----------
        X : full-width input matrix; known columns first, new columns last.
        y : target values.
        sample_weight : optional per-sample weights, forwarded to the parent.

        Returns
        -------
        self
        """
        X_active = self._partial_grafting_fit(X, y)
        # Train on the filtered matrix (its width matches ``coef_``) and pass
        # the caller's sample weights through instead of discarding them.
        super(GraftingRegressor, self).partial_fit(
            X_active, y, sample_weight=sample_weight)

        # update parameters based on weight of regularizer penalty
        self._reg_penalty()
        return self

    def predict(self, X):
        """Predict using only the columns that have not been pruned."""
        X = self._fit_columns(X)
        return super(GraftingRegressor, self).predict(X)

In [7]:
# Initial training on X1 only. GraftingRegressor does not override `fit`, so
# none of the grafting/pruning helpers defined on the class run here.
model = GraftingRegressor(max_iter=1000)
model.fit(X1, y)

GraftingRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
         fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
         loss='squared_loss', max_iter=1000, n_iter=None, penalty='l2',
         power_t=0.25, random_state=None, reg_penalty=0.15, shuffle=True,
         tol=None, verbose=0, warm_start=False)

In [8]:
# Number of coefficients after the initial fit on X1.
len(model.coef_)

50

In [9]:
# Graft the remaining columns in by passing the full frame.
# NOTE(review): _partial_grafting_fit copies the existing coefficients into the
# leading positions of the widened vector, but pdf starts with c0..c49 while
# the model was fitted on X1 (c50..c99). The carried-over weights therefore
# line up with different columns than they were trained on — confirm whether
# that is intended or whether the frame should be ordered X1-columns-first.
model.partial_fit(pdf, y)

GraftingRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
         fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
         loss='squared_loss', max_iter=1000, n_iter=None, penalty='l2',
         power_t=0.25, random_state=None, reg_penalty=0.15, shuffle=True,
         tol=None, verbose=0, warm_start=False)

In [10]:
# Number of coefficients left after partial_fit; _reg_penalty removes grafted
# columns whose absolute weight does not exceed reg_penalty.
len(model.coef_)

89

In [11]:
# Predict on the full frame; predict() first drops the columns recorded in
# model.filter_cols so the input width matches the retained coefficients.
model.predict(pdf)

array([-114.43971872,   -2.74621982,    8.78567954,  -21.94898466,
        -44.51252186,  -74.56822503, -117.05766613,  120.64896879,
       -156.00418368,    8.37154169,   24.1505489 ,   -8.38998533,
         37.53529908,  109.02879654, -257.51148353,  -22.6802784 ,
        -86.06796807,  -40.64576564,  -32.40757129,  138.73804528,
         94.49303679,   66.11147198,   62.18854392,   64.88996926,
        146.8258968 ,   -5.88583302,   61.27349625,  -87.66576157,
        166.9204851 ,  101.60153914,  168.66389505,   63.60562505,
        -28.22730543,   42.47326142,  172.80092979,  -42.58925289,
        124.11914053,  -50.43591372,   32.35131454,   29.01065903,
        122.10597674,   30.81761033,  137.68123206,  -39.04492819,
         70.13353093,   95.6700908 , -195.58640197,   81.70611645,
        -28.66193324,   -8.64604973,   20.19410666,   41.92081642,
         90.93855988,  143.6976966 ,  -13.58232536,  161.64478059,
         31.50291865,   19.10969744,  -91.75938233,  116.45499

In [12]:
"""
Implement DPP version that is similar to what is done above
"""

class DPPRegressor(SGDRegressor):
    """SGD regressor intended to host a DPP-based variant of feature grafting.

    At present the implementation is the same grow-and-prune scheme used for
    grafting; no DPP-specific logic is present yet.

    * ``fit`` is not overridden; it trains on an initial set of columns.
    * ``partial_fit`` expects a matrix whose leading columns are the ones the
      model already has coefficients for, followed by any new candidate
      columns. The coefficient vector is zero-padded to the new width, one
      SGD pass is run, and every *new* column whose ``abs(coef)`` does not
      exceed ``reg_penalty`` is dropped again.
    * ``predict`` removes the dropped columns before delegating.

    Dropped columns are remembered by their position in the full matrix, so
    later calls must keep passing matrices with the same column layout.

    Parameters
    ----------
    reg_penalty : float or None, default None
        Threshold on ``abs(coef_)`` a newly added column must exceed to be
        kept. Falls back to ``l1_ratio`` when None.

    All remaining parameters are forwarded unchanged to ``SGDRegressor``.

    Attributes
    ----------
    filter_cols : list of int
        Positions (in the full input matrix) of columns that were pruned.
    base_shape : int or None
        Number of coefficients that existed before the most recent
        ``partial_fit`` call; these are never pruned.
    """

    def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001,
                 l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None,
                 shuffle=True, verbose=0, epsilon=0.1,
                 random_state=None, learning_rate="invscaling", eta0=0.01,
                 power_t=0.25, warm_start=False, average=False, n_iter=None, reg_penalty=None):
        # Must name this class: super(GraftingRegressor, self) raises TypeError
        # here because DPPRegressor does not derive from GraftingRegressor.
        super(DPPRegressor, self).__init__(loss=loss, penalty=penalty,
                                           alpha=alpha, l1_ratio=l1_ratio,
                                           fit_intercept=fit_intercept,
                                           max_iter=max_iter, tol=tol,
                                           shuffle=shuffle,
                                           verbose=verbose,
                                           epsilon=epsilon,
                                           random_state=random_state,
                                           learning_rate=learning_rate,
                                           eta0=eta0, power_t=power_t,
                                           warm_start=warm_start,
                                           average=average, n_iter=n_iter)
        self.filter_cols = []
        self.base_shape = None
        self.reg_penalty = reg_penalty if reg_penalty is not None else l1_ratio

    def _fit_columns(self, X, return_x=True):
        """Remove the columns listed in ``self.filter_cols`` from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame or 2-D array-like supporting ``X[:, mask]``
            Full-width input; ``filter_cols`` indexes its columns by position.
        return_x : bool, default True
            When False, return only the boolean keep-mask (True = keep).

        Returns
        -------
        The filtered ``X`` (the very same object when nothing is filtered),
        or the boolean mask of length ``X.shape[1]`` if ``return_x`` is False.
        """
        # Builtin ``bool`` instead of the ``np.bool`` alias, which newer NumPy
        # releases no longer provide.
        keep_mask = np.ones(X.shape[1], dtype=bool)
        nothing_filtered = len(self.filter_cols) == 0
        if not nothing_filtered:
            keep_mask[self.filter_cols] = False
        if not return_x:
            return keep_mask
        if nothing_filtered:
            return X
        if isinstance(X, pd.DataFrame):
            # Select by position so duplicate column labels cannot pull in
            # extra columns; isinstance also covers DataFrame subclasses.
            return X.iloc[:, np.flatnonzero(keep_mask)]
        return X[:, keep_mask]

    def _reg_penalty(self):
        """Prune newly added coefficients that did not clear ``reg_penalty``.

        The first ``base_shape`` coefficients are always kept. Of the rest,
        only those with ``abs(coef) > reg_penalty`` survive. ``coef_`` is
        shrunk accordingly and the positions of the pruned columns, expressed
        in the full input matrix, are merged into ``filter_cols``.
        """
        n_coef = self.coef_.shape[0]
        already_dropped = [int(i) for i in self.filter_cols]

        # ``coef_`` has one entry per column that is still active, so the full
        # matrix width is the active count plus the count already dropped.
        # Recover each active coefficient's position in that full matrix.
        full_mask = np.ones(n_coef + len(already_dropped), dtype=bool)
        if already_dropped:
            full_mask[already_dropped] = False
        active_cols = np.flatnonzero(full_mask)

        # Boolean mask keeps coefficient order deterministic (the previous
        # set-based index list relied on unspecified iteration order).
        keep = np.abs(self.coef_) > self.reg_penalty
        # Pre-existing coefficients are exempt from pruning. If base_shape was
        # never set (None) this slice covers everything, i.e. nothing is pruned.
        keep[:self.base_shape] = True

        self.coef_ = self.coef_[keep]
        newly_dropped = active_cols[~keep].tolist()
        # Accumulate instead of overwriting so earlier prunes are not forgotten.
        self.filter_cols = sorted(set(already_dropped).union(newly_dropped))

    def _partial_grafting_fit(self, X, y):
        """Prepare ``coef_`` for a ``partial_fit`` on a possibly wider ``X``.

        Drops already-pruned columns from ``X``, records the current number
        of coefficients in ``base_shape`` and zero-pads ``coef_`` so it has
        one entry per remaining column (existing weights stay in front).

        Parameters
        ----------
        X : full-width input matrix (see ``_fit_columns``).
        y : unused; kept for signature compatibility.

        Returns
        -------
        The filtered ``X`` that must be handed to the parent ``partial_fit``.

        Raises
        ------
        ValueError
            If the filtered ``X`` has fewer columns than existing coefficients.
        """
        X = self._fit_columns(X)
        n_features = X.shape[1]

        current = getattr(self, "coef_", None)
        if current is None:
            # Nothing fitted yet: every column is part of the base model and
            # the parent class will allocate the coefficients itself.
            self.base_shape = n_features
            return X

        # Needed later to tell pre-existing coefficients from grafted ones.
        self.base_shape = current.shape[0]
        if n_features < self.base_shape:
            raise ValueError(
                "X has %d usable columns but the model already holds %d "
                "coefficients" % (n_features, self.base_shape))

        grown = np.zeros(n_features, dtype=np.float64, order="C")
        grown[:self.base_shape] = current
        self.coef_ = grown
        return X

    def partial_fit(self, X, y, sample_weight=None):
        """Run one SGD pass on ``X``, grafting in new columns and pruning weak ones.

        Parameters
        ----------
        X : full-width input matrix; known columns first, new columns last.
        y : target values.
        sample_weight : optional per-sample weights, forwarded to the parent.

        Returns
        -------
        self
        """
        X_active = self._partial_grafting_fit(X, y)
        # Train on the filtered matrix (its width matches ``coef_``) and pass
        # the caller's sample weights through instead of discarding them.
        super(DPPRegressor, self).partial_fit(
            X_active, y, sample_weight=sample_weight)

        # update parameters based on weight of regularizer penalty
        self._reg_penalty()
        return self

    def predict(self, X):
        """Predict using only the columns that have not been pruned."""
        X = self._fit_columns(X)
        return super(DPPRegressor, self).predict(X)