## Binarization
In k means we used to convert continuous values into discrete values, but in Binarization we will convert continus values into binary values

In [66]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import Binarizer         # to use Binarization
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')   

In [67]:
df = pd.read_csv('train.csv')[['Age', 'Fare', 'SibSp', 'Parch', 'Survived']]
df.dropna(inplace=True)
df.head(3)

Unnamed: 0,Age,Fare,SibSp,Parch,Survived
0,22.0,7.25,1,0,0
1,38.0,71.2833,1,0,1
2,26.0,7.925,0,0,1


In [68]:
# merging two columns
df['family'] = df['SibSp'] + df['Parch']
print(df.head(3))

# dimensionality reduction - dropping two columns
df.drop(columns=['SibSp', 'Parch'], inplace=True)
print(df.head(3))

x = df.drop(columns='Survived')
y = df['Survived']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

x_train.head(3)

    Age     Fare  SibSp  Parch  Survived  family
0  22.0   7.2500      1      0         0       1
1  38.0  71.2833      1      0         1       1
2  26.0   7.9250      0      0         1       0
    Age     Fare  Survived  family
0  22.0   7.2500         0       1
1  38.0  71.2833         1       1
2  26.0   7.9250         1       0


Unnamed: 0,Age,Fare,family
328,31.0,20.525,2
73,26.0,14.4542,1
253,30.0,16.1,1


### Without Binarization

In [69]:
# without binarization

clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

print(f"Accuracy : {np.round(accuracy_score(y_test, y_pred),2)*100} %")
print(f"Cross val : {np.round(np.mean(cross_val_score(clf, x_train, y_train, cv=5, scoring='accuracy')),2)*100} %")
print()


Accuracy : 62.0 %
Cross val : 63.0 %



### With Binarization

In [None]:
# with binarization
# to use this you have to import Binarizer class
# from skelarn.preprocessing import Binarizer

trf = ColumnTransformer([
    ('bin', Binarizer(copy=False), ['family'])
], remainder='passthrough')

x_train_trf = trf.fit_transform(x_train)
x_test_trf = trf.transform(x_test)

x_train_trf = pd.DataFrame(x_train_trf, columns=['family', 'Age', 'Fare'])
print(x_train_trf.head(3))
# 1 in family shows, the passenger is travelling with someone
# 0 is family, shows, the passenger is travelling alone


clf.fit(x_train_trf, y_train)
y_pred2 = clf.predict(x_test_trf)

print()
print(f"Accuracy : {np.round(accuracy_score(y_test, y_pred2),2)*100} %")
print(f"Cross val : {np.round(np.mean(cross_val_score(clf, x_train, y_train, cv=5, scoring='accuracy')),2)*100} %")
print()

   family   Age     Fare
0     1.0  31.0  20.5250
1     1.0  26.0  14.4542
2     1.0  30.0  16.1000

Accuracy : 63.0 %
Cross val : 64.0 %

