# Binarization

## Binarization is the process of converting the numeric features of any entity into vectors of binary values (0 or 1), typically by comparing each value against a threshold, to make classifier algorithms more efficient.

## In a simple example, converting a gray-scale image, whose pixel intensities span 0-255, into a black-and-white image whose pixels are each either 0 or 1 is binarization.

In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer

from sklearn.tree import DecisionTreeClassifier


In [2]:
# Read only the four numeric columns this demo works with.
df = pd.read_csv(
    "train.csv",
    usecols=["Age", "Fare", "SibSp", "Parch"],
)
df

Unnamed: 0,Age,SibSp,Parch,Fare
0,22.0,1,0,7.2500
1,38.0,1,0,71.2833
2,26.0,0,0,7.9250
3,35.0,1,0,53.1000
4,35.0,0,0,8.0500
...,...,...,...,...
886,27.0,0,0,13.0000
887,19.0,0,0,30.0000
888,,1,2,23.4500
889,26.0,0,0,30.0000


In [3]:
# Remove every row that contains at least one missing value, modifying df itself.
df.dropna(axis=0, how="any", inplace=True)

In [4]:
# Preview the first five rows now that incomplete rows are gone.
df.head(n=5)

Unnamed: 0,Age,SibSp,Parch,Fare
0,22.0,1,0,7.25
1,38.0,1,0,71.2833
2,26.0,0,0,7.925
3,35.0,1,0,53.1
4,35.0,0,0,8.05


# Combine SibSp and Parch into a single `family` feature

In [5]:
# New feature: the row-wise sum of the SibSp and Parch counts.
df["family"] = df["SibSp"] + df["Parch"]

In [6]:
# Check that the family column now appears alongside its source columns.
df.head(n=5)

Unnamed: 0,Age,SibSp,Parch,Fare,family
0,22.0,1,0,7.25,1
1,38.0,1,0,71.2833,1
2,26.0,0,0,7.925,0
3,35.0,1,0,53.1,1
4,35.0,0,0,8.05,0


In [7]:
# Discard the two source columns and Fare in place, leaving Age and family.
df.drop(["SibSp", "Parch", "Fare"], axis="columns", inplace=True)

In [8]:
# Preview the two remaining columns that will be fed to the transformer.
df.head(n=5)

Unnamed: 0,Age,family
0,22.0,1
1,38.0,1
2,26.0,0
3,35.0,1
4,35.0,0


# Apply Binarization

In [9]:
from sklearn.preprocessing import Binarizer

# Binarizer signature, with its default arguments:
#
#     Binarizer(
#         threshold=0.0,  # values strictly greater than this become 1, all others 0
#         copy=True,      # binarize a copy instead of overwriting the input
#     )

In [10]:
# Binarize only the "family" column (default threshold 0: any value above 0
# becomes 1) and pass every other column through unchanged.
# Binarizer is left at its default copy=True: in-place binarization brings no
# benefit for a DataFrame column selection and could overwrite values in `df`,
# which is displayed later as the untouched original for comparison.
trf = ColumnTransformer(
    transformers=[
        ("bin", Binarizer(), ["family"]),
    ],
    remainder="passthrough",
)

In [11]:
# Fit the column transformer on df and transform it in a single step.
new = trf.fit_transform(X=df)

In [12]:
# Wrap the transformed values in a DataFrame. The binarized "family" column
# comes first, followed by the passthrough "Age" column. No index is supplied,
# so the rows are renumbered from 0 rather than keeping df's labels.
n = pd.DataFrame(data=new, columns=["FAMILY", "AGE"])
n.tail(n=5)

Unnamed: 0,FAMILY,AGE
709,1.0,39.0
710,0.0,27.0
711,0.0,19.0
712,0.0,26.0
713,0.0,32.0


In [13]:
# Show the original trailing rows to compare against the binarized values.
df.tail(n=5)

Unnamed: 0,Age,family
885,39.0,5
886,27.0,0
887,19.0,0
889,26.0,0
890,32.0,0


# Example 2

In [43]:
# Toy frame with a single "Grade" column holding the integers 1 through 10.
data = pd.DataFrame({"Grade": list(range(1, 11))})
data

Unnamed: 0,Grade
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8
8,9
9,10


# Set the threshold

In [50]:
# Binarize the grades: values strictly greater than 5 become 1, the rest 0.
# Only the "Grade" column is passed in, so re-running this cell after the
# "Grade Threshold" column has been added still yields a single-column result
# instead of binarizing the derived column as well.
bin_ = Binarizer(threshold=5)
new = bin_.fit_transform(data[["Grade"]])

In [48]:
# Store the binarized grades as a new column beside the original values.
data["Grade Threshold"] = new 

In [49]:
# Show the result: grades above 5 are flagged 1, grades of 5 or below are 0.
data

Unnamed: 0,Grade,Grade Threshold
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
5,6,1
6,7,1
7,8,1
8,9,1
9,10,1
