<a href="https://colab.research.google.com/github/AdrianduPlessis/CheatSheets/blob/master/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook that shows quick and easy preprocessing for a baseline model

## Imports

In [0]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

## Data

In [0]:
# Tiny demo frame: one text column and one numeric column, each with a gap.
data = [
  ["Alvin", 23],
  [np.nan, 21],
  ["Riken", np.nan],
]
columns = ["name", "age"]
df = pd.DataFrame(data, columns=columns)

# *** Note: remove the target column from the DataFrame before this step ***

## Split numeric and categorical

In [0]:
# Column labels grouped by dtype: numeric columns vs. everything else.
numeric = df.select_dtypes(include=["number"]).columns
categorical = df.select_dtypes(exclude=["number"]).columns

## Create Categorical and Numeric preprocessing pipelines

In [0]:
# Categorical columns: fill gaps with the most frequent value of the column.
c_steps = [("c_imputer", SimpleImputer(strategy="most_frequent"))]
# Numeric columns: fill gaps with the column mean (SimpleImputer's default strategy).
n_steps = [("n_imputer", SimpleImputer(strategy="mean"))]

c_pipeline = Pipeline(steps=c_steps)
n_pipeline = Pipeline(steps=n_steps)

## Fit pipelines to corresponding datatypes

In [0]:
# Fit each pipeline on its own column group and write the imputed values
# back into the matching columns of df (numeric group first, then categorical).
for cols, pipeline in ((numeric, n_pipeline), (categorical, c_pipeline)):
  df[cols] = pipeline.fit_transform(df[cols])

# Use the first function below if a one-hot encoder is not a step in your categorical preprocessing

In [0]:
def simple_preprocess(df):
  """Impute missing values and ordinal-encode the non-numeric columns.

  Numeric columns are imputed with ``SimpleImputer()`` (default strategy).
  Non-numeric columns are imputed with the most frequent value and then
  passed through ``OrdinalEncoder``, so each category becomes a number.

  NOTE: a later cell in this notebook defines another function with the same
  name; on a top-to-bottom run that later definition replaces this one.

  Parameters
  ----------
  df : pandas.DataFrame
      Feature frame to preprocess.

  Returns
  -------
  pandas.DataFrame
      A copy of ``df`` with the same column labels and order, holding the
      imputed / encoded values. The frame passed in is not modified.
  """
  # Work on a copy so the caller's frame keeps its raw values.
  df = df.copy()

  numeric = df.select_dtypes(include="number").columns
  categorical = df.select_dtypes(exclude="number").columns

  # The estimators cannot be fitted on a frame with zero columns, so each
  # group is only processed when it actually contains columns.
  if len(numeric) > 0:
    n_steps = [('n_imputer', SimpleImputer())]
    n_pipeline = Pipeline(n_steps)
    df[numeric] = n_pipeline.fit_transform(df[numeric])

  if len(categorical) > 0:
    c_steps = [('c_imputer', SimpleImputer(strategy="most_frequent")),
               ('ordinal', OrdinalEncoder())]
    c_pipeline = Pipeline(c_steps)
    df[categorical] = c_pipeline.fit_transform(df[categorical])

  return df

In [0]:
def simple_preprocess(df):
  """Impute missing values and one-hot encode the non-numeric columns.

  Numeric columns are imputed with ``SimpleImputer()`` (default strategy) and
  otherwise keep their values. Non-numeric columns are imputed with the most
  frequent value and then one-hot encoded; each resulting column is labelled
  ``"<source column>_<category>"``.

  Parameters
  ----------
  df : pandas.DataFrame
      Feature frame to preprocess. It is not modified.

  Returns
  -------
  pandas.DataFrame
      New frame with the numeric columns first, followed by the one-hot
      columns, indexed like ``df``.
  """
  numeric = df.select_dtypes(include="number")
  categorical = df.select_dtypes(exclude="number")

  parts = []

  # Each group is only processed when it has columns: the estimators cannot
  # be fitted on a zero-column frame.
  if numeric.shape[1] > 0:
    # Imputation only. An OrdinalEncoder here would replace every numeric
    # value with the index of that value among the column's sorted unique
    # values, throwing away the actual magnitudes.
    n_steps = [('n_imputer', SimpleImputer())]
    n_pipeline = Pipeline(n_steps)
    numeric_data = n_pipeline.fit_transform(numeric)
    parts.append(
        pd.DataFrame(numeric_data, columns=numeric.columns, index=df.index))

  if categorical.shape[1] > 0:
    c_steps = [('c_imputer', SimpleImputer(strategy="most_frequent")),
               ('onehot', OneHotEncoder())]
    c_pipeline = Pipeline(c_steps)
    categorical_data = c_pipeline.fit_transform(categorical)
    # OneHotEncoder returns a sparse matrix by default; densify it if so.
    if hasattr(categorical_data, "toarray"):
      categorical_data = categorical_data.toarray()

    # categories_ holds one array per input column, so flatten it into one
    # label per output column instead of passing the list of arrays directly
    # (which only works for a single column and produces a MultiIndex).
    encoder = c_pipeline.named_steps['onehot']
    onehot_columns = [
        "{}_{}".format(column, category)
        for column, categories in zip(categorical.columns, encoder.categories_)
        for category in categories
    ]
    parts.append(
        pd.DataFrame(categorical_data, columns=onehot_columns, index=df.index))

  if not parts:
    # No columns at all: return an empty frame that still carries the index.
    return pd.DataFrame(index=df.index)

  return pd.concat(parts, axis=1)

In [0]:
# Run the preprocessing on the demo frame and display the result.
# NOTE(review): simple_preprocess is defined twice above, so on a fresh
# top-to-bottom run this calls the second (one-hot) definition. The saved
# output below has the original `name`/`age` columns, which looks like it came
# from the first (ordinal) definition -- re-run to refresh it.
simple_preprocess(df)

Unnamed: 0,name,age
0,0.0,23.0
1,0.0,21.0
2,1.0,22.0
