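This notebook demonstrates what streaming pipelines will look like: each feature stream gets its own sub-pipeline, and the sub-pipelines are combined with a `FeatureUnion`, as sketched below.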

```py
DataSelector(
	FeatureUnion(
		[('base', BasePipeline), 
		 ('stream1', StreamPipeline),
		 ('stream2', StreamPipeline)
		]
	)
)
```

In [43]:
from sklearn.datasets import make_regression, make_classification

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.decomposition import PCA

import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection.base import SelectorMixin
from sklearn.utils.validation import check_is_fitted

from sklearn.linear_model import SGDRegressor

import itertools

In [44]:
# Seed the global NumPy RNG so the synthetic data (and every output that
# depends on it) is reproducible under Restart Kernel -> Run All.
SEED = 42
np.random.seed(SEED)

# 100 samples x 100 standard-normal features.
pdf = pd.DataFrame(np.random.normal(size=(100, 100)))

In [45]:
# Label the columns c0 ... c99 so streams can be selected by name.
pdf.columns = list(map('c{}'.format, range(100)))

In [46]:
# Synthetic regression target: one standard-normal value per row of pdf.
y = np.random.normal(loc=0.0, scale=1.0, size=100)

In [47]:
# Preview the first rows of the synthetic feature frame.
pdf.head()

Unnamed: 0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,...,c90,c91,c92,c93,c94,c95,c96,c97,c98,c99
0,-2.327853,-0.383654,-0.973586,-0.668999,0.451195,1.353481,-0.761436,-0.830951,-2.411253,0.106185,...,-1.473897,-0.258758,-0.563682,0.245549,-0.498022,1.031419,1.679963,-1.51144,-0.506461,-0.640007
1,-0.844418,-0.0986,1.656304,0.072914,-0.998987,0.858702,-0.557182,1.782512,-0.446117,0.924216,...,-0.597906,1.11491,0.710107,0.478986,-0.760304,-0.306229,0.319245,-0.134885,0.712367,-2.4941
2,0.178444,1.089601,-0.13276,-1.268844,-1.063214,-0.765304,-0.662095,-0.99678,0.777021,1.501859,...,-0.680373,-1.211203,-1.558036,-1.049521,0.416086,-1.250368,-1.250168,0.340367,-1.447583,-1.077239
3,-0.123443,0.159941,-0.055948,0.736152,-0.429644,0.655818,1.21128,-1.317256,1.070772,-1.540593,...,-0.49759,0.193897,1.940245,-1.907211,-0.08036,0.376436,0.538809,-1.195341,-0.109133,1.605322
4,-0.611443,-0.816723,-0.104265,-0.652427,0.584967,-0.25431,0.304804,0.202659,1.546658,-1.601647,...,-1.469079,-1.466828,0.976082,0.184223,-1.956396,-0.061867,-1.925231,-0.201085,0.464327,-0.351674


In [48]:
class DataSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the configured columns.

    columns: value used to index the input as ``X[columns]``; when it is
             ``None`` the whole input is passed through as a copy.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, x, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X[self.columns]``, or ``X.copy()`` when no columns are set."""
        # presumably X is a pandas DataFrame (it must support X[...] and .copy())
        if self.columns is None:
            return X.copy()
        return X[self.columns]

In [49]:
# Quick check of DataSelector: keep only the first two columns.
testSel = DataSelector(columns=['c0', 'c1'])

In [50]:
# fit() is a no-op for DataSelector, so this just shows the selected columns.
testSel.fit_transform(pdf).head()

Unnamed: 0,c0,c1
0,-2.327853,-0.383654
1,-0.844418,-0.0986
2,0.178444,1.089601
3,-0.123443,0.159941
4,-0.611443,-0.816723


In [51]:
# Simulate what streaming features might look like: a base block of
# 20 columns (c0-c19) followed by two 10-column streams (c20-c29, c30-c39).
base_df = pdf[list(map('c{}'.format, range(0, 20)))]
stream1 = pdf[list(map('c{}'.format, range(20, 30)))]
stream2 = pdf[list(map('c{}'.format, range(30, 40)))]

In [52]:
class SGDRegressorTransformer(SGDRegressor, TransformerMixin):
    """SGDRegressor that also exposes its predictions through ``transform``.

    This lets the regressor be used as a non-final step, e.g. inside a
    ``FeatureUnion``, where each step must provide ``transform``.
    """

    def transform(self, X):
        """Return the predictions for ``X`` as a single column of shape (-1, 1)."""
        predictions = self.predict(X)
        return predictions.reshape(-1, 1)

In [53]:
def streaming_pipeline(columns):
    """Build the two-step pipeline used for a single feature stream.

    columns: column labels handed to ``DataSelector``; the selected columns
             are then fed to an ``SGDRegressorTransformer``.

    Returns the (unfitted) pipeline produced by ``make_pipeline``.
    """
    selector = DataSelector(columns=columns)
    regressor = SGDRegressorTransformer()
    return make_pipeline(selector, regressor)

In [54]:
# One pipeline per feature stream: c0-c19 (base), c20-c29 and c30-c39.
base_pipeline, stream1_pipeline, stream2_pipeline = (
    streaming_pipeline(['c{}'.format(x) for x in range(start, stop)])
    for start, stop in [(0, 20), (20, 30), (30, 40)]
)

In [55]:
# Combine all the pipelines by stacking: every stream pipeline emits one
# prediction column, and a final regressor is trained on those columns.
# This will be the boosting variant of grafting.
full_pipeline = make_pipeline(
    FeatureUnion(transformer_list=[
        ('base', base_pipeline),
        ('stream1', stream1_pipeline),
        ('stream2', stream2_pipeline),
    ]),
    SGDRegressorTransformer(),
)

In [56]:
# Fit the stacked pipeline and show its in-sample predictions
# (fit() returns the pipeline itself, so the calls can be chained).
full_pipeline.fit(pdf, y).predict(pdf)

array([-0.03419487, -0.05934672,  0.01442772, -0.02602387,  0.17175713,
        0.22263986, -0.04053778,  0.15631345,  0.08938764, -0.21817169,
       -0.11305714,  0.11946234,  0.13107088,  0.13816118,  0.2476947 ,
        0.0239285 ,  0.03234156, -0.01834933, -0.01123299,  0.00739144,
        0.24455022,  0.06938402, -0.03198   , -0.02920845,  0.2147274 ,
        0.04431022,  0.10640158,  0.17265074,  0.12187168, -0.0589285 ,
        0.19352613,  0.06080698, -0.06099668,  0.10796755,  0.14285722,
       -0.06583749,  0.08818538, -0.09377994, -0.15483254,  0.15889085,
        0.12014473, -0.00500242, -0.02677674,  0.22447759, -0.03454587,
        0.17950569,  0.03685937,  0.05210635,  0.1647257 ,  0.21373417,
       -0.12513481, -0.00190798,  0.16054462,  0.21998268,  0.08399318,
       -0.21877307, -0.14360767,  0.06762807,  0.07015688,  0.03888237,
        0.11264293,  0.09255699, -0.04339565,  0.16097296,  0.01362089,
        0.10804456,  0.10839505,  0.00751351,  0.08033387,  0.11

In [57]:
class GraftingRegressor(BaseEstimator, TransformerMixin):
    """Stacked regressor that adds one sub-pipeline per batch of new columns.

    Every call to ``fit`` / ``partial_fit`` compares the columns of ``X`` with
    the columns already assigned to a stream. Columns not seen before become
    a new ``DataSelector -> SGDRegressorTransformer`` stream, and all streams
    are combined through a ``FeatureUnion`` that feeds a final
    ``SGDRegressorTransformer``.
    """

    def __init__(self, lambda_=0.05):
        """
        lambda_: is the regularizer penalty. This is used to select which columns are kept
                 in the model.
                 NOTE: the value is stored but not yet read by any method of this class.
        """
        # One list of column labels per registered stream.
        self.grafting_columns = []
        # (name, pipeline) pairs, in the form FeatureUnion expects.
        self.stream_pipeline = []
        # Replaced by the stacked Pipeline once the first stream is registered.
        self.full_pipeline = []
        self.lambda_ = lambda_

    def streaming_pipeline(self, columns):
        """Build the pipeline for one stream: select `columns`, then regress on them."""
        return make_pipeline(
            DataSelector(columns=columns),
            SGDRegressorTransformer()
        )

    def _fit(self, X, y=None):
        """Register the not-yet-seen columns of `X` as a new stream.

        Raises:
            TypeError: if `X` is not a pandas DataFrame.
            ValueError: if no stream exists yet and `X` has no columns.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("GraftingRegressor can only accept dataframes!")

        seen_columns = set(itertools.chain.from_iterable(self.grafting_columns))
        new_feats = [col for col in X.columns if col not in seen_columns]

        if not new_feats:
            if not self.stream_pipeline:
                raise ValueError("X has no columns to build a stream from.")
            # Nothing new to graft: keep the existing streams and pipeline.
            return

        idx = len(self.stream_pipeline)
        self.stream_pipeline.append(
            ('stream{}'.format(idx), self.streaming_pipeline(new_feats))
        )
        # Remember which columns this stream owns; without this every later
        # call would treat all columns as new and add a duplicate stream.
        self.grafting_columns.append(new_feats)

        # Rebuild the stacked model over a copy of the stream list so later
        # appends do not mutate the FeatureUnion that is already in use.
        self.full_pipeline = make_pipeline(
            FeatureUnion(self.stream_pipeline[:]),
            SGDRegressorTransformer())

    def fit(self, X, y=None):
        """Register any new columns of `X`, then fit the whole stacked pipeline."""
        self._fit(X, y)
        self.full_pipeline.fit(X, y)
        return self

    def partial_fit(self, X, y=None):
        """Register any new columns of `X`, then incrementally update the model.

        NOTE(review): scikit-learn's ``Pipeline`` does not appear to expose
        ``partial_fit``, so the call below is expected to raise
        ``AttributeError`` -- confirm, and replace it with a per-step
        incremental update once the grafting update rule is settled.
        """
        self._fit(X, y)
        self.full_pipeline.partial_fit(X, y)
        return self

    def predict(self, X):
        """Return the predictions of the stacked pipeline for `X`."""
        return self.full_pipeline.predict(X)

    def transform(self, X):
        """Same as `predict`: returns the stacked pipeline's predictions."""
        return self.predict(X)

In [58]:
# Grafting regressor with the default regularizer penalty.
gcc = GraftingRegressor()

In [59]:
# On this first call no columns have been registered yet, so every column
# of pdf goes into a single stream before the stacked pipeline is fitted.
gcc.fit(pdf, y)

GraftingRegressor(lambda_=0.05)

In [60]:
# transform() simply returns predict(), so this matches gcc.predict(pdf).
gcc.transform(pdf)

array([ 0.19179683, -0.60742052, -0.00920165,  0.38130569,  1.05208714,
        0.1007226 , -0.48735704,  0.31960556,  0.18053738, -0.74860718,
       -0.11486622,  0.64108098,  0.30700003,  0.52888454,  0.42120084,
        0.12707565, -0.30972632,  0.14747264, -0.01489899, -0.90587179,
        0.73468842,  0.20139892,  0.02210034, -0.31356146,  0.16925942,
       -0.00601615,  0.02266016,  0.67614959,  0.64942781, -0.48004621,
        0.80321672,  0.06668287, -0.68843085, -0.40218752,  0.89353432,
       -0.58928666,  0.29323523, -0.1384801 , -0.4851901 , -0.26488388,
        0.11687493, -0.05827873,  0.62314585,  0.38233092, -0.68486886,
        0.14164497,  0.20641961, -0.04218438,  0.73156202,  0.54559164,
       -0.50248031,  0.12541779,  0.63706737,  0.1911822 ,  0.29672445,
       -0.88050758, -0.91790943, -0.12012719, -0.02669789, -0.2479627 ,
        0.1634231 , -0.09886096,  0.01310313, -0.28130584,  0.05376851,
        0.49175918,  0.30921739,  0.34817415,  0.26605157,  0.21

In [61]:
# In-sample predictions of the stacked pipeline.
gcc.predict(pdf)

array([ 0.19179683, -0.60742052, -0.00920165,  0.38130569,  1.05208714,
        0.1007226 , -0.48735704,  0.31960556,  0.18053738, -0.74860718,
       -0.11486622,  0.64108098,  0.30700003,  0.52888454,  0.42120084,
        0.12707565, -0.30972632,  0.14747264, -0.01489899, -0.90587179,
        0.73468842,  0.20139892,  0.02210034, -0.31356146,  0.16925942,
       -0.00601615,  0.02266016,  0.67614959,  0.64942781, -0.48004621,
        0.80321672,  0.06668287, -0.68843085, -0.40218752,  0.89353432,
       -0.58928666,  0.29323523, -0.1384801 , -0.4851901 , -0.26488388,
        0.11687493, -0.05827873,  0.62314585,  0.38233092, -0.68486886,
        0.14164497,  0.20641961, -0.04218438,  0.73156202,  0.54559164,
       -0.50248031,  0.12541779,  0.63706737,  0.1911822 ,  0.29672445,
       -0.88050758, -0.91790943, -0.12012719, -0.02669789, -0.2479627 ,
        0.1634231 , -0.09886096,  0.01310313, -0.28130584,  0.05376851,
        0.49175918,  0.30921739,  0.34817415,  0.26605157,  0.21