<a href="https://colab.research.google.com/github/ArmanVasilovich/self_learning/blob/main/sklearn_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sklearn preprocessing

In [None]:
import pandas as pd

In [None]:
# Toy dataset: five people; Diana's Age is missing so there is something to impute.
data = dict(
    Name=['Anna', 'Bob', 'Charlie', 'Diana', 'Eric'],
    Age=[20, 34, 23, None, 33],
    Gender=['f', 'm', 'm', 'f', 'm'],
    Job=['Programmer', 'Writer', 'Cook', 'Programmer', 'Teacher'],
)
df = pd.DataFrame.from_dict(data)

In [None]:
# Display the raw frame; row 3 (Diana) has no Age.
df

Unnamed: 0,Name,Age,Gender,Job
0,Anna,20.0,f,Programmer
1,Bob,34.0,m,Writer
2,Charlie,23.0,m,Cook
3,Diana,,f,Programmer
4,Eric,33.0,m,Teacher


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Drop Name Feature
df = df.drop(columns=['Name'])

# Impute Ages: missing values are replaced by the mean of the observed ages.
imputer = SimpleImputer(strategy='mean')
# The imputer returns an (n, 1) array; ravel() flattens it to a plain 1-D column.
df['Age'] = imputer.fit_transform(df[['Age']]).ravel()

# Numeric Gender (an unknown code raises KeyError instead of passing silently)
gender_dct = {'m': 0, 'f': 1}
df['Gender'] = [gender_dct[g] for g in df['Gender']]

# OneHotEncode Jobs
encoder = OneHotEncoder()
matrix = encoder.fit_transform(df[['Job']]).toarray()

# The encoder emits its columns in the order of encoder.categories_ (sorted),
# not in order of first appearance. Hard-coding the names mislabeled the columns
# (e.g. a Programmer row showed 1.0 under 'Writer'), so read them from the encoder.
column_names = list(encoder.categories_[0])

for name, values in zip(column_names, matrix.T):
  df[name] = values

df = df.drop(columns=['Job'])

In [None]:
# Display the frame after preprocessing (Name and Job removed, one indicator column per job).
df

Unnamed: 0,Age,Gender,Programmer,Writer,Cook,Teacher
0,20.0,1,0.0,1.0,0.0,0.0
1,34.0,0,0.0,0.0,0.0,1.0
2,23.0,0,1.0,0.0,0.0,0.0
3,27.5,1,0.0,1.0,0.0,0.0
4,33.0,0,0.0,0.0,1.0,0.0


## Custom transformers and Pipeline

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class NameDropper(BaseEstimator, TransformerMixin):
  """Stateless transformer that removes the ``Name`` column from a DataFrame."""

  def fit(self, X, y=None):
    """Nothing is learned; returns the transformer itself so calls can be chained."""
    return self

  def transform(self, X):
    """Return a new DataFrame equal to ``X`` without its ``Name`` column.

    Raises ``KeyError`` if ``X`` has no ``Name`` column.
    """
    return X.drop(columns=['Name'])


class AgeImputer(BaseEstimator, TransformerMixin):
  """Fill missing values in the ``Age`` column with the mean age.

  The mean is learned in ``fit`` and reused by ``transform``, so data
  transformed later is filled with the mean of the data the transformer was
  fitted on rather than with its own mean.
  """

  def fit(self, X, y=None):
    """Learn the mean of ``X['Age']``.

    Parameters:
      X: DataFrame containing an ``Age`` column.
      y: Ignored; accepted for scikit-learn API compatibility.

    Returns:
      The fitted transformer (``self``).
    """
    self.imputer_ = SimpleImputer(strategy='mean').fit(X[['Age']])
    return self

  def transform(self, X):
    """Return a copy of ``X`` whose missing ``Age`` values are filled in.

    The input DataFrame is left untouched. If ``fit`` has not been called,
    the mean is computed from ``X`` itself so that calling ``transform`` on a
    fresh instance keeps working.
    """
    imputer = getattr(self, 'imputer_', None)
    if imputer is None:
      # Backward-compatible path for unfitted instances: impute from X's own mean.
      imputer = SimpleImputer(strategy='mean').fit(X[['Age']])

    result = X.copy()
    # The imputer returns an (n, 1) array; ravel() turns it into a 1-D column.
    result['Age'] = imputer.transform(result[['Age']]).ravel()
    return result

class FeatureEncoder(BaseEstimator, TransformerMixin):
  """Encode ``Gender`` as 0/1 and replace ``Job`` with one-hot indicator columns.

  The set of job categories is learned in ``fit``. ``transform`` then always
  produces the same indicator columns, each named after the category it
  actually represents.
  """

  def fit(self, X, y=None):
    """Learn the job categories present in ``X['Job']``.

    Parameters:
      X: DataFrame containing a ``Job`` column.
      y: Ignored; accepted for scikit-learn API compatibility.

    Returns:
      The fitted transformer (``self``).
    """
    # handle_unknown='ignore': a job not seen during fit yields an all-zero row
    # at transform time instead of raising.
    self.encoder_ = OneHotEncoder(handle_unknown='ignore').fit(X[['Job']])
    return self

  def transform(self, X):
    """Return an encoded copy of ``X``; the input DataFrame is left untouched.

    ``Gender`` is mapped with ``{'m': 0, 'f': 1}`` (any other value raises
    ``KeyError``). ``Job`` is replaced by one float indicator column per
    learned category. If ``fit`` has not been called, the categories are
    taken from ``X`` itself so that calling ``transform`` on a fresh instance
    keeps working.
    """
    gender_dct = {'m': 0, 'f': 1}

    encoder = getattr(self, 'encoder_', None)
    if encoder is None:
      # Backward-compatible path for unfitted instances.
      encoder = OneHotEncoder(handle_unknown='ignore').fit(X[['Job']])

    result = X.copy()
    result['Gender'] = [gender_dct[g] for g in result['Gender']]

    matrix = encoder.transform(result[['Job']]).toarray()

    # Column i of the matrix corresponds to encoder.categories_[0][i]; taking
    # the names from the encoder keeps labels and data aligned for any number
    # of categories.
    for name, values in zip(encoder.categories_[0], matrix.T):
      result[name] = values

    return result.drop(columns=['Job'])

In [None]:
# Second toy dataset: two missing ages and only three distinct jobs (no 'Cook').
data = dict(
    Name=['Fiona', 'Gerald', 'Hans', 'Isabella', 'Jacob'],
    Age=[20, 34, None, None, 33],
    Gender=['f', 'm', 'm', 'f', 'm'],
    Job=['Writer', 'Programmer', 'Programmer', 'Programmer', 'Teacher'],
)

df2 = pd.DataFrame.from_dict(data)

In [None]:
# Display the second raw frame; Age is missing for rows 2 and 3.
df2

Unnamed: 0,Name,Age,Gender,Job
0,Fiona,20.0,f,Writer
1,Gerald,34.0,m,Programmer
2,Hans,,m,Programmer
3,Isabella,,f,Programmer
4,Jacob,33.0,m,Teacher


In [None]:
dropper = NameDropper()
imp = AgeImputer()
enc = FeatureEncoder()

# Run the three stages by hand, one after another. Every stage goes through
# fit_transform so that no transformer is asked to transform before it was fitted.
df2_without_name = dropper.fit_transform(df2)
df2_imputed = imp.fit_transform(df2_without_name)
enc.fit_transform(df2_imputed)

Unnamed: 0,Age,Gender,Programmer,Writer,Cook
0,20.0,1,0.0,0.0,1.0
1,34.0,0,1.0,0.0,0.0
2,29.0,0,1.0,0.0,0.0
3,29.0,1,1.0,0.0,0.0
4,33.0,0,0.0,1.0,0.0


In [None]:
from sklearn.pipeline import Pipeline

# Same three stages as the manual chain, declared once as named pipeline steps.
preprocessing_steps = [
    ('dropper', NameDropper()),
    ('imputer', AgeImputer()),
    ('encoder', FeatureEncoder()),
]
pipe = Pipeline(steps=preprocessing_steps)


# Display df2 again before it is passed through the pipeline.
df2

Unnamed: 0,Name,Age,Gender,Job
0,Fiona,20.0,f,Writer
1,Gerald,34.0,m,Programmer
2,Hans,,m,Programmer
3,Isabella,,f,Programmer
4,Jacob,33.0,m,Teacher


In [None]:
# Fit every step on df2 and return the fully transformed frame in a single call.
pipe.fit_transform(df2)

Unnamed: 0,Age,Gender,Programmer,Writer,Cook
0,20.0,1,0.0,0.0,1.0
1,34.0,0,1.0,0.0,0.0
2,29.0,0,1.0,0.0,0.0
3,29.0,1,1.0,0.0,0.0
4,33.0,0,0.0,1.0,0.0
