
# Intro to (data) pipelines

We have already learnt how to import libraries:

```python
import libname # or
import libname as lib
```
Then you can call its functions through the module name (or its alias):

```python
lib.func()  # to access the specific function you want. 
```

If you want to import just one function:

```python
from libname import func
```
* DO NOT USE:
```python
from libname import * # pollutes your namespace and can silently overwrite names you already use
```

In [1]:
# good 
import pandas as pd
# bad:
# from pandas import *
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, FunctionTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import FunctionTransformer

## Read data from CSV and basic pandas commands

In [2]:
# Load the Auto MPG table from the UCI repository.
# Column names are passed explicitly through `names`, fields are separated by
# runs of whitespace, and '?' entries are read as missing values (NaN).
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data',
                   sep=r'\s+',  # documented replacement for the deprecated delim_whitespace=True
                   names=['mpg',           # continuous
                          'cylinders',     # multi-valued discrete
                          'displacement',  # continuous
                          'horsepower',    # continuous
                          'weight',        # continuous
                          'acceleration',  # continuous
                          'model_year',    # multi-valued discrete
                          'origin',        # multi-valued discrete
                          'name',          # string (unique for each instance)
                          ],
                   na_values='?',
                   )

# The column that is passed as the target to train_test_split further down.
target = data['weight']

In [3]:
# Select a single column with plain indexing ...
data['mpg']
# ... or, equivalently, with label-based .loc (all rows, one column)
data.loc[:, 'mpg']
# Select several columns at once by passing a list of column names
data[['mpg','horsepower']]

data.loc[:, ['mpg','horsepower']]

# Select rows with a boolean mask: here a random mask in which each row
# is True with probability 0.05
mask = np.random.choice([True, False], size=len(data), p=[0.05, 0.95])
data.loc[mask, ['mpg','horsepower']]

# Mask built from a condition: column means over the rows with mpg > 20
mask = data['mpg'] > 20
data.loc[mask, ['mpg','horsepower']].mean()
# More useful in practice: show every row that has at least one missing value
mask = data.isna().any(axis=1)
data.loc[mask, :]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
32,25.0,4,98.0,,2046.0,19.0,71,1,ford pinto
126,21.0,6,200.0,,2875.0,17.0,74,1,ford maverick
330,40.9,4,85.0,,1835.0,17.3,80,2,renault lecar deluxe
336,23.6,4,140.0,,2905.0,14.3,80,1,ford mustang cobra
354,34.5,4,100.0,,2320.0,15.8,81,2,renault 18i
374,23.0,4,151.0,,3035.0,20.5,82,1,amc concord dl


In [4]:
data

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52.0,2130.0,24.6,82,2,vw pickup
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger


In [5]:
# Vectorization in python (do not write for loops)
# https://www.oreilly.com/library/view/python-for-data/9781449323592/ch04.html

array = np.random.randint(1, 10, size = 10000)

In [6]:
dum_data = pd.DataFrame([[-1, 2, 'a'], [-0.5, 6, 'a'], [0, 10, 'b'], [1, 18, 'b']], columns = ['a','b','c'])
y = np.array([1,2,3,4])

In [8]:
# Standardize (z-score) by hand: subtract each column's mean, then divide by
# each column's standard deviation. ddof=0 divides by N rather than N-1.

X = dum_data[['a','b']].copy()
X = X - X.mean()
X = X/X.std(ddof=0)

In [9]:
X

Unnamed: 0,a,b
0,-1.183216,-1.183216
1,-0.507093,-0.507093
2,0.169031,0.169031
3,1.521278,1.521278


In [10]:
def standartize(X):
    """Return the z-scores of X.

    X is centred on ``X.mean()`` and divided by ``X.std(ddof=0)``; for a
    pandas DataFrame both statistics are computed per column.
    """
    centred = X - X.mean()
    spread = X.std(ddof=0)
    return centred / spread

X = standartize(dum_data[['a','b']].copy())
print(X)

          a         b
0 -1.183216 -1.183216
1 -0.507093 -0.507093
2  0.169031  0.169031
3  1.521278  1.521278


In [11]:
X = dum_data[['a','b']].copy()

In [12]:
skscaler = StandardScaler()

skscaler.fit(X)
skscaler.transform(X)

skscaler.fit_transform(X)

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

In [13]:
from sklearn.preprocessing import PolynomialFeatures

X = dum_data[['a','b']].copy()

poly = PolynomialFeatures(include_bias = False)
poly.fit_transform(X)

array([[-1.00e+00,  2.00e+00,  1.00e+00, -2.00e+00,  4.00e+00],
       [-5.00e-01,  6.00e+00,  2.50e-01, -3.00e+00,  3.60e+01],
       [ 0.00e+00,  1.00e+01,  0.00e+00,  0.00e+00,  1.00e+02],
       [ 1.00e+00,  1.80e+01,  1.00e+00,  1.80e+01,  3.24e+02]])

In [14]:
#%% Lets create transformers ourselves
# or what to do if we need something that is not in sklearn
from sklearn.utils import check_array
from sklearn.base import TransformerMixin, BaseEstimator

# classes and interfaces for more info:
# https://scikit-learn.org/stable/developers/develop.html
# https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing
# Custom transformers

class CustomStandardScaler(TransformerMixin, BaseEstimator):
    """Hand-written z-score scaler exposing the scikit-learn fit/transform API.

    ``fit`` learns the per-column mean and variance of the training data;
    ``transform`` subtracts the mean and divides by the standard deviation.
    Columns with zero spread are divided by 1 instead of 0, so they come out
    as zeros rather than NaN/inf.

    Attributes set by ``fit``:
        means: per-column mean of the training data.
        vars:  per-column variance of the training data.
        scale: per-column divisor used by ``transform``.
    """

    def fit(self, X, y=None):
        """Learn column means and scales from X.

        ``y`` is accepted for API compatibility and is not used.
        Returns ``self`` so that ``fit(...).transform(...)`` can be chained.
        """
        self.means = np.mean(X, axis=0)
        self.vars = np.var(X, axis=0)
        std = np.sqrt(self.vars)
        # (std == 0) counts as 1 exactly where a column is constant and as 0
        # elsewhere, so adding it turns a zero divisor into 1 and leaves every
        # other entry (and the container type/labels) untouched.
        self.scale = std + (std == 0)

        return self

    def transform(self, X):
        """Return X centred by the fitted means and divided by the fitted scales."""
        return (X - self.means) / self.scale

In [15]:
skscaler = StandardScaler()
skscaler.fit_transform(X)

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

In [19]:
custmsc = CustomStandardScaler()
custmsc.fit_transform(X)

Unnamed: 0,a,b
0,-1.183216,-1.183216
1,-0.507093,-0.507093
2,0.169031,0.169031
3,1.521278,1.521278


In [20]:
class ColSelector(TransformerMixin, BaseEstimator):
    """Select columns from a pandas DataFrame.

    Parameters
    ----------
    columns : column label or list of column labels
        Passed to ``X.loc[:, columns]`` in ``transform``. A list keeps the
        result two-dimensional (a DataFrame).
    """

    def __init__(self, columns):
        # Stored unchanged under the argument's own name so that
        # BaseEstimator.get_params() / clone() / repr can read it back.
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return only the configured columns of X (all rows)."""
        return X.loc[:, self.columns]

class CustomTranformer(TransformerMixin, BaseEstimator):
    """Skeleton for a custom transformer; as written it passes X through.

    Replace the bodies of ``fit`` (learn state from the training data) and
    ``transform`` (apply that state) with the logic you need.
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self``.

        Returning ``self`` is required so that ``fit(...).transform(...)``,
        which ``fit_transform`` relies on, can be chained.
        """
        return self

    def transform(self, X):
        """Return X unchanged."""
        return X

In [21]:
# 
from sklearn.dummy import DummyRegressor, DummyClassifier

poly = PolynomialFeatures(include_bias = False)
skscaler = StandardScaler()

X_poly = poly.fit_transform(X)
X_scaled = skscaler.fit_transform(X_poly)
X_scaled

array([[-1.18321596, -1.18321596,  0.98019606, -0.61159284, -0.89625816],
       [-0.50709255, -0.50709255, -0.70014004, -0.72808671, -0.6401844 ],
       [ 0.16903085,  0.16903085, -1.26025208, -0.37860509, -0.12803688],
       [ 1.52127766,  1.52127766,  0.98019606,  1.71828464,  1.66447944]])

In [26]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LinearRegression

# Chain the steps into one estimator: each (name, object) pair is a step,
# applied in the order listed; the last step is the model.
pipe = Pipeline([ ('poly', PolynomialFeatures(include_bias = False) ) ,
                  ('scaler', StandardScaler() ),
                 # ('custom', CustomTranformer() ),   # optional extra step, currently disabled
                  ('model', DummyRegressor() ),
                ]
                )

# presumably disabled because the last step is a regressor, not a transformer
# pipe.fit_transform(X)

# Fit all steps on (X, y), then predict with the whole chain.
pipe.fit(X, y)
pipe.predict(X)

array([2.5, 2.5, 2.5, 2.5])

In [27]:
dum_data['c']

0    a
1    a
2    b
3    b
Name: c, dtype: object

In [28]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column 'c' into a dense array.
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4 -- on newer versions pass sparse_output=False instead.
onehot = OneHotEncoder(sparse = False, handle_unknown='ignore')
onehot.fit_transform(dum_data[['c']])

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.]])

In [29]:
# Two parallel branches whose outputs FeatureUnion places side by side:
#   'numerical'   -> select columns a, b -> polynomial features -> scaling
#   'categorical' -> select column c     -> one-hot encoding
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4 -- on newer versions pass sparse_output=False instead.
baby_pipe = FeatureUnion([
                        ('numerical',
                        Pipeline([
                            ('select_num', FunctionTransformer(func = lambda X: X.loc[:, ['a','b']])),
                            ('poly',       PolynomialFeatures(include_bias = False)),
                            ('scaler',     StandardScaler()),
                                ])
                        ),
                         ('categorical', 
                          Pipeline([
                              ('pass_cat', FunctionTransformer(func = lambda X: X.loc[:, ['c']]) ),
                              ('onehot',   OneHotEncoder(sparse = False, handle_unknown='ignore') )
                        ] ) 
                         ),
                    ])

In [30]:
super_pipe = Pipeline([ ('baby_pipe', baby_pipe),
                        # ('model', LinearRegression() ),
                         ('model', DummyRegressor() )
                        ])

super_pipe.fit(dum_data, y)
super_pipe.predict(dum_data)

array([2.5, 2.5, 2.5, 2.5])

In [32]:
from sklearn.model_selection import train_test_split

# Split features and target into train and test parts; random_state fixes the shuffle.
# NOTE(review): `data` still contains the 'weight' column, which is also the
# target, so the features leak the answer -- consider dropping it from `data`
# before fitting a real model.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)


In [33]:
X_train

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
245,36.1,4,98.0,66.0,1800.0,14.4,78,1,ford fiesta
110,22.0,4,108.0,94.0,2379.0,16.5,73,3,datsun 610
16,18.0,6,199.0,97.0,2774.0,15.5,70,1,amc hornet
66,17.0,8,304.0,150.0,3672.0,11.5,72,1,amc ambassador sst
153,18.0,6,250.0,105.0,3459.0,16.0,75,1,chevrolet nova
...,...,...,...,...,...,...,...,...,...
71,19.0,3,70.0,97.0,2330.0,13.5,72,3,mazda rx2 coupe
106,12.0,8,350.0,180.0,4499.0,12.5,73,1,oldsmobile vista cruiser
270,21.1,4,134.0,95.0,2515.0,14.8,78,3,toyota celica gt liftback
348,37.7,4,89.0,62.0,2050.0,17.3,81,3,toyota tercel


In [34]:
X_test

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
198,33.0,4,91.0,53.0,1795.0,17.4,76,3,honda civic
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger
33,19.0,6,232.0,100.0,2634.0,13.0,71,1,amc gremlin
208,13.0,8,318.0,150.0,3940.0,13.2,76,1,plymouth volare premier v8
93,14.0,8,318.0,150.0,4237.0,14.5,73,1,plymouth fury gran sedan
...,...,...,...,...,...,...,...,...,...
378,38.0,4,105.0,63.0,2125.0,14.7,82,1,plymouth horizon miser
371,29.0,4,135.0,84.0,2525.0,16.0,82,1,dodge aries se
280,21.5,6,231.0,115.0,3245.0,15.4,79,1,pontiac lemans v6
323,27.9,4,156.0,105.0,2800.0,14.4,80,1,dodge colt


In [35]:
y_train

245    1800.0
110    2379.0
16     2774.0
66     3672.0
153    3459.0
        ...  
71     2330.0
106    4499.0
270    2515.0
348    2050.0
102    1950.0
Name: weight, Length: 298, dtype: float64