<a href="https://colab.research.google.com/github/AzadMehedi/Data-Pipeline/blob/main/Professional_Preprocessing_with_Pipelines_in_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [57]:
import pandas as pd
import numpy as np

In [58]:
# creating dataset

data = {'Name':['Anna','Bob','Charlie','Diana','Eric'],
      'Age':[20, 34, 23, None, 33],
      'Gender':['f', 'm', 'm', 'f', 'm'],
      'Job':['Programmer','Writer','Cook','Programmer','Teacher']}

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Gender,Job
0,Anna,20.0,f,Programmer
1,Bob,34.0,m,Writer
2,Charlie,23.0,m,Cook
3,Diana,,f,Programmer
4,Eric,33.0,m,Teacher


# Preprocessing Pipeline
for prediction we can remove the irrelavent columns
- Name Feature
- Impute Ages (Imputing means is the process of handling missing values)
- Turn Gender into Binary or Numeric
-One Hot Encoding   (One Hot Encoding is used for turn the categorical into numerical)

In [59]:

# If we don't want to use pipeline then we have to do these steps in manually or using functions

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Drop Name Feature
df = df.drop(['Name'], axis=1)

# Impute Ages
imputer = SimpleImputer(strategy='mean')
df['Age'] = imputer.fit_transform(df[['Age']])

# Numeric Gender
gender_dict = {'m':0, 'f':1}
df['Gender'] = [gender_dict[g] for g in df['Gender']]

# One Hot Encoder
encoder = OneHotEncoder()
matrix = encoder.fit_transform(df[['Job']]).toarray()

column_names = ['Programmer','Writter','Cook','Teacher']

for i in range(len(matrix.T)):
  df[column_names[i]] = matrix.T[i]

df = df.drop(['Job'], axis=1) 
df

Unnamed: 0,Age,Gender,Programmer,Writter,Cook,Teacher
0,20.0,1,0.0,1.0,0.0,0.0
1,34.0,0,0.0,0.0,0.0,1.0
2,23.0,0,1.0,0.0,0.0,0.0
3,27.5,1,0.0,1.0,0.0,0.0
4,33.0,0,0.0,0.0,1.0,0.0


# Another Way

In [60]:
# creating dataset

data = {'Name':['Anna','Bob','Charlie','Diana','Eric'],
      'Age':[20, 34, 23, None, 33],
      'Gender':['f', 'm', 'm', 'f', 'm'],
      'Job':['Programmer','Writer','Cook','Programmer','Teacher']}

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Gender,Job
0,Anna,20.0,f,Programmer
1,Bob,34.0,m,Writer
2,Charlie,23.0,m,Cook
3,Diana,,f,Programmer
4,Eric,33.0,m,Teacher


In [61]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Define the categories
Job = ['Programmer','Writer','Cook','Teacher']

# Define the data to be encoded
data = [['Programmer'], ['Writer'], ['Cook'], ['Programmer'], ['Teacher']]

# Create an instance of the OneHotEncoder
encoder = OneHotEncoder(categories=[Job])  # here 'categories' is a parameter of the OneHotEncoder class.

# Fit and transform the data using the encoder
one_hot_matrix = encoder.fit_transform(data)

# Convert one hot matrix to array and create DataFrame
df1 = pd.DataFrame(one_hot_matrix.toarray(), columns=Job)

# Concat the DataFrames
df = pd.concat([df, df1], axis=1)

# Drop original Job column
df = df.drop(['Job'], axis=1)

df = pd.DataFrame(df)
df



Unnamed: 0,Name,Age,Gender,Programmer,Writer,Cook,Teacher
0,Anna,20.0,f,1.0,0.0,0.0,0.0
1,Bob,34.0,m,0.0,1.0,0.0,0.0
2,Charlie,23.0,m,0.0,0.0,1.0,0.0
3,Diana,,f,1.0,0.0,0.0,0.0
4,Eric,33.0,m,0.0,0.0,0.0,1.0


# Another Way

In [62]:
# creating dataset

data = {'Name':['Anna','Bob','Charlie','Diana','Eric'],
      'Age':[20, 34, 23, None, 33],
      'Gender':['f', 'm', 'm', 'f', 'm'],
      'Job':['Programmer','Writer','Cook','Programmer','Teacher']}

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Gender,Job
0,Anna,20.0,f,Programmer
1,Bob,34.0,m,Writer
2,Charlie,23.0,m,Cook
3,Diana,,f,Programmer
4,Eric,33.0,m,Teacher


In [63]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Define the categories
Job = ['Programmer', 'Writer', 'Cook', 'Teacher']

# Define the data to be encoded
data = [['Programmer'], ['Writer'], ['Cook'], ['Programmer'], ['Teacher']]

# Create an instance of the OneHotEncoder
encoder = OneHotEncoder(categories=[Job])

# Fit and transform the data using the encoder
one_hot_matrix = encoder.fit_transform(data)

# Convert one hot matrix to array
one_hot_array = one_hot_matrix.toarray()

# Create DataFrame with original data and encoded columns
df = pd.DataFrame({
    'Name': ['Anna', 'Bob', 'Charlie', 'Diana', 'Eric'],
    'Age': [20, 34, 23, None, 33],
    'Gender': ['f', 'm', 'm', 'f', 'm'],
    'Programmer': one_hot_array[:, 0],
    'Writer': one_hot_array[:, 1],
    'Cook': one_hot_array[:, 2],
    'Teacher': one_hot_array[:, 3]
})

# Drop the original Job column
# df = df.drop('Job', axis=1)

df = pd.DataFrame(df)
df

Unnamed: 0,Name,Age,Gender,Programmer,Writer,Cook,Teacher
0,Anna,20.0,f,1.0,0.0,0.0,0.0
1,Bob,34.0,m,0.0,1.0,0.0,0.0
2,Charlie,23.0,m,0.0,0.0,1.0,0.0
3,Diana,,f,1.0,0.0,0.0,0.0
4,Eric,33.0,m,0.0,0.0,0.0,1.0


# Now creating Custom Pipeline
# Steps we are following
- Name Feature
- Impute Ages (Imputing means is the process of handling missing values)
- Turn Gender into categorical to Numeric
-One Hot Encoding   (One Hot Encoding is used for turn the categorical into numerical)

In [68]:
# creating dataset

data = {'Name':['Anna','Bob','Charlie','Diana','Eric'],
      'Age':[20, 34, None, None, 33],
      'Gender':['f', 'm', 'm', 'f', 'm'],
      'Job':['Programmer','Writer','Cook','Programmer','Teacher']}

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Gender,Job
0,Anna,20.0,f,Programmer
1,Bob,34.0,m,Writer
2,Charlie,,m,Cook
3,Diana,,f,Programmer
4,Eric,33.0,m,Teacher


In [71]:
from sklearn.base import BaseEstimator, TransformerMixin

class NameDroper(BaseEstimator, TransformerMixin):
  
  def fit(self, X, y=None):
    return self

  def transform(self, X):
    return X.drop(['Name'], axis=1)  

class AgeImputer(BaseEstimator, TransformerMixin):

  def fit(self, X, y=None):
    return self

  def transform(self, X):
    imputer = SimpleImputer(strategy='mean')
    X['Age'] = imputer.fit_transform(X[['Age']])
    return X

class FeatureEncoder(BaseEstimator, TransformerMixin):
  
  def fit(self, X, y=None):
    return self

  def transform(self, X):
    gender_dict = {'m':0, 'f':1}
    X['Gender'] = [gender_dict[g] for g in X['Gender']]

    # One Hot Encoding
    encoder = OneHotEncoder()
    matrix = encoder.fit_transform(X[['Job']]).toarray()
    
    column_names = ['Programmer','Writer','Cook','Teacher']

    for i in range(len(matrix.T)):
      X[column_names[i]] = matrix.T[i]
    return X.drop(['Job'], axis=1)


droper = NameDroper()
imputer = AgeImputer()
encoder = FeatureEncoder()

encoder.fit_transform(imputer.fit_transform(droper.fit_transform(df)))


Unnamed: 0,Age,Gender,Programmer,Writer,Cook,Teacher
0,20.0,1,0.0,1.0,0.0,0.0
1,34.0,0,0.0,0.0,0.0,1.0
2,29.0,0,1.0,0.0,0.0,0.0
3,29.0,1,0.0,1.0,0.0,0.0
4,33.0,0,0.0,0.0,1.0,0.0


- the y=None parameter is included in the fit and transform methods of custom transformers for consistency with the scikit-learn API, but it is typically not used.

 in case of you get error like
 -  SpecificationError: nested renamer is not supported or
 - 'RegexFlag' object has no attribute 'transform'

In [70]:
# creating dataset

data = {'Name':['Anna','Bob','Charlie','Diana','Eric'],
      'Age':[20, 34, None, None, 33],
      'Gender':['f', 'm', 'm', 'f', 'm'],
      'Job':['Programmer','Writer','Cook','Programmer','Teacher']}

df = pd.DataFrame(data)

from sklearn.base import BaseEstimator, TransformerMixin

class NameDroper(BaseEstimator, TransformerMixin):
  
  def fit(self, X, y=None):
    return self

  def transform(self, X):
    return X.drop(['Name'], axis=1)  

class AgeImputer(BaseEstimator, TransformerMixin):

  def fit(self, X, y=None):
    return self

  def transform(self, X):
    imputer = SimpleImputer(strategy='mean')
    X['Age'] = imputer.fit_transform(X[['Age']])
    return X

class FeatureEncoder(BaseEstimator, TransformerMixin):
  
  def fit(self, X, y=None):
    return self

  def transform(self, X):
    gender_dict = {'m':0, 'f':1}
    X['Gender'] = [gender_dict[g] for g in X['Gender']]

    # One Hot Encoding
    encoder = OneHotEncoder()
    matrix = encoder.fit_transform(X[['Job']]).toarray()
    
    column_names = ['Programmer','Writer','Cook','Teacher']

    for i in range(len(matrix.T)):
      X[column_names[i]] = matrix.T[i]

    # Rename columns
    rename_dict = {}
    for col in column_names:
      rename_dict[col] = col.lower().replace(' ', '_')
    
    X.rename(columns=rename_dict, inplace=True)

    return X.drop(['Job'], axis=1)


droper = NameDroper()
imputer = AgeImputer()
encoder = FeatureEncoder()

df_prepared = encoder.fit_transform(imputer.fit_transform(droper.fit_transform(df)))
df_prepared


Unnamed: 0,Age,Gender,programmer,writer,cook,teacher
0,20.0,1,0.0,1.0,0.0,0.0
1,34.0,0,0.0,0.0,0.0,1.0
2,29.0,0,1.0,0.0,0.0,0.0
3,29.0,1,0.0,1.0,0.0,0.0
4,33.0,0,0.0,0.0,1.0,0.0
