Ref: https://neptune.ai/blog/the-best-ml-framework-extensions-for-scikit-learn

In [1]:
import os
import sys
# Put the notebook's parent directory on the import path (once), presumably so
# the project package imported further down (`app...`) can be resolved.
module_path = os.path.abspath('..')
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [18]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.utils.validation import check_is_fitted

from app.src.datasource import load_data


In [3]:
# Small hand-made frame (10 rows) used to try out DataFrameMapper:
# two text columns (Name, Uni) and two integer columns (Age, Phone).
data = pd.DataFrame(
    {
        "Name": [
            "Ken", "Jeff", "John", "Mike", "Andrew",
            "Ann", "Sylvia", "Dorothy", "Emily", "Loyford",
        ],
        "Age": [31, 52, 56, 12, 45, 50, 78, 85, 46, 135],
        "Phone": [52, 79, 80, 75, 43, 125, 74, 44, 85, 45],
        "Uni": [
            "One", "Two", "Three", "One", "Two",
            "Three", "One", "Two", "Three", "One",
        ],
    }
)

In [4]:
# Display the toy frame (10 rows) to confirm it was built as intended.
data

Unnamed: 0,Name,Age,Phone,Uni
0,Ken,31,52,One
1,Jeff,52,79,Two
2,John,56,80,Three
3,Mike,12,75,One
4,Andrew,45,43,Two
5,Ann,50,125,Three
6,Sylvia,78,74,One
7,Dorothy,85,44,Two
8,Emily,46,85,Three
9,Loyford,135,45,One


In [5]:
from sklearn_pandas import DataFrameMapper

# Column-wise preprocessing for the toy frame. df_out=True requests a DataFrame
# result instead of a bare array.
mapper = DataFrameMapper(
    [
        # Binarised into the indicator columns Uni_One / Uni_Three / Uni_Two.
        (["Uni"], LabelBinarizer()),
        # Age is standardised by StandardScaler.
        (["Age"], StandardScaler()),
        # None = no transformer: values pass through unchanged, but because the
        # two columns are selected together they come out named
        # Name_Phone_0 / Name_Phone_1.
        (["Name", "Phone"], None),
    ],
    df_out=True,
)

In [6]:
# Fit the mapper on the toy frame and apply it in one step; the result is
# displayed as a DataFrame (the mapper was built with df_out=True).
mapper.fit_transform(data)

Unnamed: 0,Uni_One,Uni_Three,Uni_Two,Age,Name_Phone_0,Name_Phone_1
0,1,0,0,-0.872025,Ken,52
1,0,0,1,-0.218006,Jeff,79
2,0,1,0,-0.093431,John,80
3,1,0,0,-1.463755,Mike,75
4,0,0,1,-0.436012,Andrew,43
5,0,1,0,-0.280294,Ann,125
6,1,0,0,0.591731,Sylvia,74
7,0,0,1,0.809737,Dorothy,44
8,0,1,0,-0.404869,Emily,85
9,1,0,0,2.366924,Loyford,45


In [7]:
# Column names of the mapper's transformed output.
mapper.transformed_names_

['Uni_One', 'Uni_Three', 'Uni_Two', 'Age', 'Name_Phone_0', 'Name_Phone_1']

In [8]:
# Check the df_out flag the mapper was constructed with.
mapper.df_out

True

In [9]:
# Load the dataset through the project helper; it yields two objects and only
# the first one is kept here.
# NOTE(review): the preview of X_train shows a 'Survived' column — confirm the
# target is really meant to stay inside this frame.
X_train, _ = load_data()

In [10]:
# Preview the first rows to see which columns are available for mapping.
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [20]:
# Mapper for the loaded passenger data: keep Pclass as-is and encode Sex as a
# single numeric column. df_out is not set, so the result is a NumPy array.
# input_df=True: per the sklearn-pandas docs the transformers should then
# receive pandas objects rather than NumPy arrays — confirm.
# NOTE(review): the transformed output contains only Pclass and Sex even though
# Age, SibSp, Parch, Cabin and Embarked are not listed in drop_cols, so
# unselected columns appear to be dropped anyway; drop_cols looks redundant
# here — confirm before removing it.
maptitanic = DataFrameMapper(
    [
        (["Pclass"], None),
        # One-hot alternative considered for this column:
        #   OneHotEncoder(sparse=False, handle_unknown="error")
        (["Sex"], OrdinalEncoder()),
    ],
    drop_cols=["PassengerId", "Survived", "Name", "Ticket", "Fare"],
    input_df=True,
)

In [21]:
# Fit and apply the Titanic mapper; it was built without df_out=True, so the
# result is displayed as a plain NumPy array.
maptitanic.fit_transform(X_train)

array([[3., 1.],
       [1., 0.],
       [3., 0.],
       ...,
       [3., 0.],
       [1., 1.],
       [3., 1.]])

In [22]:
# Names for the columns of the mapper's transformed output.
maptitanic.transformed_names_

['Pclass', 'Sex']