# Professional Preprocessing with Pipelines in Python

### 1. Basic pipelines:

In [58]:
import pandas as pd

In [59]:
data = {"Name": ["Anna", "Bob", "Charlie", "Diana", "Eric"],
        "Age": [20, 34, 23, None, 33],
        "Gender": ["f", "m", "m", "f", "m"],
        "Job": ["Programmer", "Writer", "Cook", "Programmer", "Teacher" ]}

In [60]:
df = pd.DataFrame(data)

In [61]:
df

Unnamed: 0,Name,Age,Gender,Job
0,Anna,20.0,f,Programmer
1,Bob,34.0,m,Writer
2,Charlie,23.0,m,Cook
3,Diana,,f,Programmer
4,Eric,33.0,m,Teacher


Pre-processing Pipeline:

* Drop Name Feature
* Impute Ages
* Turn Gender into Binary/Numeric
* One Hot Encode Jobs

In [62]:

 # By hand - without pipelines:
    
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
    
    


In [63]:
# Drop Name Feature:

df = df.drop(["Name"], axis=1)
    
    
    
    

In [64]:
df

Unnamed: 0,Age,Gender,Job
0,20.0,f,Programmer
1,34.0,m,Writer
2,23.0,m,Cook
3,,f,Programmer
4,33.0,m,Teacher


In [65]:
# Impute the ages:

imputer = SimpleImputer(strategy="mean")
df["Age"] = imputer.fit_transform(df[["Age"]])

In [66]:
df

Unnamed: 0,Age,Gender,Job
0,20.0,f,Programmer
1,34.0,m,Writer
2,23.0,m,Cook
3,27.5,f,Programmer
4,33.0,m,Teacher


In [67]:
# Numeric Gender
gender_dct = {"m": 0, "f": 1} # define dictionary that says m will be translated to 0 and f to 1. 


In [68]:
df['Gender'] = [gender_dct[g] for g in df['Gender']] # list comprehension

In [76]:
# One Hot Encode Job
encoder = OneHotEncoder()
matrix = encoder.fit_transform(df[['Job']]).toarray() # turn matrix into an array 



In [77]:
column_names = ["Programmer", "Writer", "Cook", "Teacher"]

for i in range (len(matrix.T)):
    df[column_names[i]] = matrix.T[i]
    
    
df = df.drop(['Job'], axis=1)

In [78]:
df

Unnamed: 0,Age,Gender,Programmer,Writer,Cook,Teacher
0,20.0,1,0.0,1.0,0.0,0.0
1,34.0,0,0.0,0.0,0.0,1.0
2,23.0,0,1.0,0.0,0.0,0.0
3,27.5,1,0.0,1.0,0.0,0.0
4,33.0,0,0.0,0.0,1.0,0.0


### 2. Using Professional Pipelines:

* Define Estimators - pipelines contain estimators. Estimators contain the function fit, transform, fit.transform e.g., knn classifier
* create classes

CREATE ESTIMATORS for the following:

* Drop Name Feature
* Impute Ages
* Turn Gender into Binary/Numeric
* One Hot Encode Jobs

In [79]:
from sklearn.base import BaseEstimator, TransformerMixin

In [84]:
#create a class:

class NameDropper(BaseEstimator, TransformerMixin):
     def fit(self, X, y=None):
         return self
     
     
     def transform(self, X):
         return X.drop(['Name'], axis=1)
     
     
     
     
     
class AgeImputer(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
         return self
     
    def transform(self, X):
        imputer = SimpleImputer(strategy="mean")
        X['Age'] = imputer.fit_transform(X[['Age']])
        return X
    
    
    
    

class FeatureEncoder(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
         return self
     
    def transform(self, X):
        gender_dct = {"m": 0, "f": 1} 
        X['Gender'] = [gender_dct[g] for g in X['Gender']]
        
        
        encoder = OneHotEncoder()
        matrix = encoder.fit_transform(X[['Job']]).toarray()
        
        column_names = ["Programmer", "Writer", "Cook", "Teacher"]

        for i in range (len(matrix.T)):
            X[column_names[i]] = matrix.T[i]
            
            
        return X.drop(['Job'], axis=1)
        
        
        
         
         
     
         

In [85]:
data = {"Name": ["Anna", "Bob", "Charlie", "Diana", "Eric"],
        "Age": [20, 34, 23, None, 33],
        "Gender": ["f", "m", "m", "f", "m"],
        "Job": ["Programmer", "Writer", "Cook", "Programmer", "Teacher" ]}

df2 = pd.DataFrame(data)

In [86]:
df2

Unnamed: 0,Name,Age,Gender,Job
0,Anna,20.0,f,Programmer
1,Bob,34.0,m,Writer
2,Charlie,23.0,m,Cook
3,Diana,,f,Programmer
4,Eric,33.0,m,Teacher


In [83]:
# dropper = NameDropper()
# dropper.fit_transform(df2) # removes name 

Unnamed: 0,Age,Gender,Job
0,20.0,f,Programmer
1,34.0,m,Writer
2,23.0,m,Cook
3,,f,Programmer
4,33.0,m,Teacher


In [89]:
dropper = NameDropper()
imp = AgeImputer()
enc = FeatureEncoder()

# Manually:
enc.fit_transform(imp.fit_transform(dropper.fit_transform(df2)))

Unnamed: 0,Age,Gender,Programmer,Writer,Cook,Teacher
0,20.0,1,0.0,1.0,0.0,0.0
1,34.0,0,0.0,0.0,0.0,1.0
2,23.0,0,1.0,0.0,0.0,0.0
3,27.5,1,0.0,1.0,0.0,0.0
4,33.0,0,0.0,0.0,1.0,0.0


### Using Sklearn Pipelines for more efficiency

In [90]:
from sklearn.pipeline import Pipeline

In [91]:
# pass a list of tuples. Start with first element (any name), then the class:
pipe = Pipeline([
    ("dropper", NameDropper()),
    ("imputer", AgeImputer()),
    ("encoder", FeatureEncoder())
    
]) 


In [92]:
df2

Unnamed: 0,Name,Age,Gender,Job
0,Anna,20.0,f,Programmer
1,Bob,34.0,m,Writer
2,Charlie,23.0,m,Cook
3,Diana,,f,Programmer
4,Eric,33.0,m,Teacher


In [93]:
pipe.fit_transform(df2)

Unnamed: 0,Age,Gender,Programmer,Writer,Cook,Teacher
0,20.0,1,0.0,1.0,0.0,0.0
1,34.0,0,0.0,0.0,0.0,1.0
2,23.0,0,1.0,0.0,0.0,0.0
3,27.5,1,0.0,1.0,0.0,0.0
4,33.0,0,0.0,0.0,1.0,0.0
