In [46]:
import pandas as pd

data = {"Name":['Anna','Bob','Charlie','Diana','Eric'],
       "Age": [20,34,23,None,33],
       "Gender": ['f','m','m','f',"m"],
       "Job":["Programmer","Writer","Cook","programmer","Teacher"]}

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Gender,Job
0,Anna,20.0,f,Programmer
1,Bob,34.0,m,Writer
2,Charlie,23.0,m,Cook
3,Diana,,f,programmer
4,Eric,33.0,m,Teacher


## Preprocessing Pipeline:

### Drop irrelevant column
### Impute Missing values
### Turn Gender to Binary / Numeric
### One Hot Encode Jobs

In [47]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Drop Name Feature
df = df.drop(["Name"],axis=1)

# Impute Ages
imputer = SimpleImputer(strategy="mean")
df['Age'] = imputer.fit_transform(df[['Age']])

#Numeric Gender
gender_dict = {"m":0,"f":1}
df['Gender'] = [gender_dict[g] for g in df['Gender']]

# OneHotEncoded Jobs
encoder = OneHotEncoder()
matrix = encoder.fit_transform(df[['Job']]).toarray()

column_names = ["Programmer","Cook","Writer","Teacher"]

for i in range(len(matrix.T)-1):
    df[column_names[i]] = matrix.T[i]
    
df = df.drop(['Job'],axis=1)

In [48]:
df

Unnamed: 0,Age,Gender,Programmer,Cook,Writer,Teacher
0,20.0,1,0.0,1.0,0.0,0.0
1,34.0,0,0.0,0.0,0.0,1.0
2,23.0,0,1.0,0.0,0.0,0.0
3,27.5,1,0.0,0.0,0.0,0.0
4,33.0,0,0.0,0.0,1.0,0.0


In [49]:
from sklearn.base import BaseEstimator, TransformerMixin

class NameDropper(BaseEstimator,TransformerMixin):
    
    def fit(self,X,y=None):
        return self
    
    def transform(self,X):
        return X.drop(['Name'],axis=1)

class AgeImputer(BaseEstimator,TransformerMixin):
    
    def fit(self,X,y=None):
        return self
    
    def transform(self,X):
        imputer = SimpleImputer(strategy="mean")
        X['Age'] = imputer.fit_transform(X[['Age']])
        return X
    
class FeatureEncoder(BaseEstimator,TransformerMixin):
    
    def fit(self,X,y=None):
        return self
    
    def transform(self,X):
        gender_dict = {"m":0,"f":1}
        X['Gender'] = [gender_dict[g] for g in X['Gender']]

        # OneHotEncoded Jobs
        encoder = OneHotEncoder()
        matrix = encoder.fit_transform(X[['Job']]).toarray()

        column_names = ["Programmer","Writer","Cook","Teacher"]

        for i in range(len(matrix.T)-1):
            X[column_names[i]] = matrix.T[i]

        return X.drop(['Job'],axis=1)

In [50]:

data = {"Name":['Anna','Bob','Charlie','Diana','Eric'],
       "Age": [20,34,23,None,33],
       "Gender": ['f','m','m','f',"m"],
       "Job":["Programmer","Writer","Cook","programmer","Teacher"]}

df2 = pd.DataFrame(data)

#### Manual Method

##### dropper = NameDropper()
##### imp = AgeImputer()
##### enc = FeatureEncoder()

##### enc.fit_transform(imp.fit_transform(dropper.fit_transform(df2)))

In [55]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ("dropper",NameDropper()),
    ("imputer",AgeImputer()),
    ("encoder", FeatureEncoder())
])

pipe.fit_transform(df2)

Unnamed: 0,Age,Gender,Programmer,Writer,Cook,Teacher
0,20.0,1,0.0,1.0,0.0,0.0
1,34.0,0,0.0,0.0,0.0,1.0
2,23.0,0,1.0,0.0,0.0,0.0
3,27.5,1,0.0,0.0,0.0,0.0
4,33.0,0,0.0,0.0,1.0,0.0


### Another Example on actual Dataset

In [61]:
import pandas as pd
import numpy as np

train_df = pd.read_csv('data/california_housing_train.csv')
test_df = pd.read_csv('data/california_housing_test.csv')

train_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [62]:
test_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0


In [68]:
x_train ,y_train = train_df.to_numpy()[:,:-1],train_df.to_numpy()[:,-1]
x_test ,y_test = test_df.to_numpy()[:,:-1],test_df.to_numpy()[:,-1]

print(x_train)
x_train.shape,y_train.shape,x_test.shape,y_test.shape

[[-114.31     34.19     15.     ... 1015.      472.        1.4936]
 [-114.47     34.4      19.     ... 1129.      463.        1.82  ]
 [-114.56     33.69     17.     ...  333.      117.        1.6509]
 ...
 [-124.3      41.84     17.     ... 1244.      456.        3.0313]
 [-124.3      41.8      19.     ... 1298.      478.        1.9797]
 [-124.35     40.54     52.     ...  806.      270.        3.0147]]


((17000, 8), (17000,), (3000, 8), (3000,))

In [74]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from copy import deepcopy


std_scaler = StandardScaler().fit(x_train[:,:2])
min_max_scaler = MinMaxScaler().fit(x_train[:,2:])

def preprocessor(X):
    A = np.copy(X)
    A[:,:2] = std_scaler.transform(X[:,:2])
    A[:,2:] = min_max_scaler.transform(X[:,2:])
    return A

In [75]:
preprocessor(x_test)

array([[-1.24077729e+00,  8.16354338e-01,  5.09803922e-01, ...,
         4.29944785e-02,  9.94902154e-02,  4.21276948e-01],
       [ 6.29446690e-01, -6.38768279e-01,  8.23529412e-01, ...,
         2.25903192e-02,  4.53872718e-02,  2.13728087e-01],
       [ 8.73822623e-01, -8.63353120e-01,  5.09803922e-01, ...,
         4.15090109e-02,  8.12366387e-02,  3.65063930e-01],
       ...,
       [-6.87702626e-02,  3.15717296e-01,  1.76470588e-01, ...,
         1.93391070e-02,  3.60138135e-02,  1.23418987e-01],
       [ 1.21794383e+00, -7.13629892e-01,  7.64705882e-01, ...,
         1.20519073e-03,  2.13780628e-03,  1.91093916e-01],
       [-3.38594150e-02, -5.63906665e-01,  8.03921569e-01, ...,
         2.10207685e-02,  4.25916790e-02,  5.55916470e-01]])

In [76]:
x_test #this is not changed

array([[-122.05  ,   37.37  ,   27.    , ..., 1537.    ,  606.    ,
           6.6085],
       [-118.3   ,   34.26  ,   43.    , ...,  809.    ,  277.    ,
           3.599 ],
       [-117.81  ,   33.78  ,   27.    , ..., 1484.    ,  495.    ,
           5.7934],
       ...,
       [-119.7   ,   36.3   ,   10.    , ...,  693.    ,  220.    ,
           2.2895],
       [-117.12  ,   34.1   ,   40.    , ...,   46.    ,   14.    ,
           3.2708],
       [-119.63  ,   34.42  ,   42.    , ...,  753.    ,  260.    ,
           8.5608]])

In [77]:
preprocessor_transformer = FunctionTransformer(preprocessor)
preprocessor_transformer

FunctionTransformer(func=<function preprocessor at 0x00000152FC212670>)

In [78]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

p1 = Pipeline([('Scaler',preprocessor_transformer),
                ('Linear Regression', LinearRegression())])
p1

Pipeline(steps=[('Scaler',
                 FunctionTransformer(func=<function preprocessor at 0x00000152FC212670>)),
                ('Linear Regression', LinearRegression())])

In [79]:
from sklearn.metrics import mean_absolute_error

def fit_and_print(p,x_train = x_train , y_train = y_train , x_test = x_test,y_test = y_test):
    p.fit(x_train,y_train)
    train_preds = p.predict(x_train)
    test_preds = p.predict(x_test)
    print('Training error: ' + str(mean_absolute_error(train_preds, y_train)))
    print('Test error: '+str(mean_absolute_error(test_preds, y_test)))

In [80]:
fit_and_print(p1)

Training error: 50795.85711786371
Test error: 50352.228257942894


In [81]:
from sklearn.neighbors import KNeighborsRegressor as KNR

p2 = Pipeline([('Scaler',preprocessor_transformer),
                ('Linear Regression', KNR(n_neighbors=7))])
fit_and_print(p2)

Training error: 30045.80900840336
Test error: 35865.41276190476
