In [1]:
import pandas as pd
import numpy as np

## Ordinal data transformation

In [2]:
X = pd.DataFrame(data = 
             [['M', 'O-', 'medium','mid-lev'],
              ['M', 'O-', 'high','high-lev'],
              ['F', 'O+', 'high','high-lev'],
              ['F', 'AB', 'low','low-lev'],
              ['F', 'B+', 'medium','mid-lev']],
            columns=['sex', 'blood_type', 'edu_level','job_level'])
X.head()

Unnamed: 0,sex,blood_type,edu_level,job_level
0,M,O-,medium,mid-lev
1,M,O-,high,high-lev
2,F,O+,high,high-lev
3,F,AB,low,low-lev
4,F,B+,medium,mid-lev


In [3]:
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder(categories=[['low','medium', 'high'],['low-lev','mid-lev', 'high-lev']], dtype=np.int8)

In [4]:
X[['edu_level','job_level']] = pd.DataFrame(encoder.fit_transform(X[['edu_level','job_level']]), dtype=np.int8)

In [5]:
X.head()

Unnamed: 0,sex,blood_type,edu_level,job_level
0,M,O-,1,1
1,M,O-,2,2
2,F,O+,2,2
3,F,AB,0,0
4,F,B+,1,1


## One hot encoding for non ordinal data with out varriable trap

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [8]:
categorical_feature_mask=X.dtypes == object
categorical_masked_cols = X.columns[categorical_feature_mask].tolist()

In [10]:
ct = ColumnTransformer([('encoder', 
                         OneHotEncoder(categories=[['M','F'],['O-','O+','AB','B+']],drop='first'), 
                         categorical_masked_cols)], remainder='passthrough')
cat_data = np.array(ct.fit_transform(X[categorical_masked_cols]), dtype=np.int)
encoded_df = pd.DataFrame(data=cat_data,columns=['F','O+','AB','B+']).join(X[['edu_level','job_level']])
encoded_df.head()

Unnamed: 0,F,O+,AB,B+,edu_level,job_level
0,0,0,0,0,1,1
1,0,0,0,0,2,2
2,1,1,0,0,2,2
3,1,0,1,0,0,0
4,1,0,0,1,1,1


## Data Bining

In [12]:
from sklearn.preprocessing import KBinsDiscretizer

In [11]:
df = pd.DataFrame({'Age':[*range(9)]})

In [13]:
disc = KBinsDiscretizer(n_bins=3, encode='ordinal', 
                        strategy='quantile')
df1 = pd.DataFrame(data=disc.fit_transform(df),dtype=np.int)
df1.head(10)

Unnamed: 0,0
0,0
1,0
2,0
3,1
4,1
5,1
6,2
7,2
8,2


## Threshold based bining

In [15]:
from sklearn.preprocessing import Binarizer

In [16]:
df = pd.DataFrame({'vals':np.random.uniform(low=.1, high=.8, size=5)})
df.head()

Unnamed: 0,vals
0,0.234219
1,0.58452
2,0.175725
3,0.26278
4,0.316


In [17]:
binr = Binarizer(threshold=.5,copy=True)
df['binary_val']=binr.fit_transform(df)
df.head()

Unnamed: 0,vals,binary_val
0,0.234219,0.0
1,0.58452,1.0
2,0.175725,0.0
3,0.26278,0.0
4,0.316,0.0
