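This notebook demonstrates what a streaming-feature pipeline could look like: a base pipeline and one pipeline per feature stream are combined with a `FeatureUnion`, roughly as sketched below.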

```py
DataSelector(
	FeatureUnion(
		[('base', BasePipeline), 
		 ('stream1', StreamPipeline),
		 ('stream2', StreamPipeline)
		]
	)
)
```

In [95]:
from sklearn.datasets import make_regression, make_classification

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.decomposition import PCA

import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection.base import SelectorMixin
from sklearn.utils.validation import check_is_fitted

from sklearn.linear_model import SGDRegressor

In [96]:
# Synthetic design matrix: 100 rows x 100 independent standard-normal columns.
# Drawn from a dedicated, seeded RandomState so that "Restart & Run All"
# reproduces the same data without touching NumPy's global random state.
pdf = pd.DataFrame(np.random.RandomState(0).normal(size=(100, 100)))

In [97]:
# Label the columns c0 ... c99 so they can be selected by name later on.
pdf.columns = list(map('c{}'.format, range(100)))

In [98]:
# Synthetic regression target: 100 standard-normal values, one per row.
# Uses its own seeded RandomState (a different seed from the feature matrix)
# so the target is reproducible across kernel restarts.
y = np.random.RandomState(1).normal(size=100)

In [99]:
# Preview the first rows of the synthetic feature frame.
pdf.head()

Unnamed: 0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,...,c90,c91,c92,c93,c94,c95,c96,c97,c98,c99
0,1.465552,-0.174947,-0.557285,-1.864667,-0.453841,-0.015839,1.032664,2.289818,-0.977157,0.798384,...,-1.089052,0.099207,-0.279303,1.492805,-0.562735,-0.279601,0.27992,-1.203971,-0.700515,-0.623444
1,0.503198,1.045425,0.069621,-0.828206,0.624801,-0.673744,-0.100857,-0.131911,0.144918,-1.058873,...,-1.026048,-2.785998,0.101738,1.833124,1.991244,0.648577,0.420932,-1.211033,-0.007652,0.273093
2,1.89492,-0.502263,-0.181144,-0.315629,-0.476192,-0.039789,0.252323,0.004571,-0.736486,0.233061,...,0.359855,-2.239837,-0.356782,1.280595,0.682922,0.187419,0.332395,-0.441433,1.162203,-0.915109
3,1.533769,-0.612605,0.992457,0.806356,-0.949315,2.489005,-0.996943,0.172389,1.839823,-0.68011,...,-0.75158,-0.005312,1.012568,-2.575455,0.016905,-0.491907,0.810994,0.232799,-0.63868,1.280433
4,-0.903803,1.185339,1.017459,-2.417675,-0.73003,1.266846,0.744343,-0.516313,0.315573,0.577418,...,-0.80482,-0.126032,0.637179,-2.863028,-0.48831,0.240614,-0.715193,2.398261,-0.662515,1.0473


In [100]:
class DataSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks a subset of columns out of its input.

    Parameters
    ----------
    columns : optional
        Key used to index the input (``X[columns]``). When ``None``, the
        whole input is passed through as a copy.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, x, y=None):
        """Nothing to learn; return the selector itself."""
        return self

    def transform(self, X):
        """Return ``X[self.columns]``, or a copy of ``X`` if no columns are set."""
        if self.columns is None:
            return X.copy()
        return X[self.columns]

In [101]:
# Quick check of DataSelector: keep only columns c0 and c1.
testSel = DataSelector(columns=['c0', 'c1'])

In [102]:
# The output should contain just the two selected columns.
testSel.fit_transform(pdf).head()

Unnamed: 0,c0,c1
0,1.465552,-0.174947
1,0.503198,1.045425
2,1.89492,-0.502263
3,1.533769,-0.612605
4,-0.903803,1.185339


In [103]:
# Mimic streamed feature batches by carving the wide frame into column groups:
# an initial "base" set (c0-c19) and two later groups of ten columns each.
base_df = pdf[list(map('c{}'.format, range(20)))]
stream1 = pdf[list(map('c{}'.format, range(20, 30)))]
stream2 = pdf[list(map('c{}'.format, range(30, 40)))]

In [104]:
class SGDRegressorTransformer(SGDRegressor, TransformerMixin):
    """SGDRegressor that can also be used as a transformer step.

    ``transform`` exposes the model's predictions as a single feature column.
    """

    def transform(self, X):
        """Return the predictions for ``X`` reshaped into one column."""
        predictions = self.predict(X)
        return predictions.reshape(-1, 1)

In [105]:
def streaming_pipeline(columns):
    """Build a two-step pipeline: select ``columns``, then fit an SGD regressor on them.

    The final step is an ``SGDRegressorTransformer``, so the returned pipeline
    can both predict and act as a transformer.
    """
    selector = DataSelector(columns=columns)
    regressor = SGDRegressorTransformer()
    return make_pipeline(selector, regressor)

In [106]:
# One sub-pipeline per feature group: the base block (c0-c19) and the two
# ten-column streams (c20-c29 and c30-c39).
base_pipeline, stream1_pipeline, stream2_pipeline = [
    streaming_pipeline(['c{}'.format(idx) for idx in span])
    for span in (range(20), range(20, 30), range(30, 40))
]

In [107]:
# Stack the per-group models: FeatureUnion places the outputs of the three
# sub-pipelines side by side, and a final SGD regressor is trained on them.
full_pipeline = make_pipeline(
    FeatureUnion(
        transformer_list=[
            ('base', base_pipeline),
            ('stream1', stream1_pipeline),
            ('stream2', stream2_pipeline),
        ]
    ),
    SGDRegressorTransformer(),
)

In [108]:
# Fit the stacked pipeline, then show its predictions on the same (training) data.
full_pipeline.fit(pdf, y).predict(pdf)

array([-0.01046794,  0.23922794,  0.15621053,  0.18733299,  0.16227597,
        0.13367772,  0.14907194,  0.26653494,  0.22393387,  0.25278151,
        0.09824282,  0.01539261,  0.21714711,  0.0168649 ,  0.11493144,
        0.25865574, -0.03777664,  0.184544  ,  0.1708652 ,  0.16301401,
        0.11245656, -0.15420115,  0.06638265,  0.0435486 ,  0.23664126,
        0.26327164,  0.19678683,  0.35822972,  0.1801091 ,  0.16700287,
        0.11663264, -0.06049385,  0.32435937,  0.01853535,  0.17809397,
        0.23245753,  0.1560194 ,  0.05948834,  0.29124497,  0.22004698,
       -0.04410663,  0.14236225,  0.12377987,  0.20637208,  0.20046297,
       -0.00397816,  0.12553085,  0.09607942,  0.30506073,  0.00214134,
        0.05366545,  0.12568052,  0.1880509 ,  0.16519193,  0.22024035,
        0.17814763,  0.09776498,  0.18493481,  0.18445454,  0.13797395,
        0.26405709,  0.06221952,  0.26442711, -0.02111403,  0.16552624,
        0.06590987,  0.11033476,  0.15571371,  0.03521425,  0.14