In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer

In [2]:
df = pd.read_csv('/content/titanic_dataset.csv', usecols=['Age', 'Fare', 'Survived'])
df.dropna(inplace=True)
df

Unnamed: 0,Survived,Age,Fare
0,0,34.5,7.8292
1,1,47.0,7.0000
2,0,62.0,9.6875
3,0,27.0,8.6625
4,1,22.0,12.2875
...,...,...,...
409,1,3.0,13.7750
411,1,37.0,90.0000
412,1,28.0,7.7750
414,1,39.0,108.9000


In [3]:
x = df.iloc[:,1:]
y = df.iloc[:,0]
x

Unnamed: 0,Age,Fare
0,34.5,7.8292
1,47.0,7.0000
2,62.0,9.6875
3,27.0,8.6625
4,22.0,12.2875
...,...,...
409,3.0,13.7750
411,37.0,90.0000
412,28.0,7.7750
414,39.0,108.9000


In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [5]:
clf=DecisionTreeClassifier()
clf.fit(x_train, y_train)
y_pred=clf.predict(x_test)
accuracy_score(y_test, y_pred)

0.5671641791044776

In [6]:
clf = DecisionTreeClassifier()
np.mean(cross_val_score(clf, x, y, cv=10, scoring='accuracy'))

0.5590909090909092

In [7]:
kbin_age=KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
kbin_fare=KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')

In [8]:
trf=ColumnTransformer([
    ('first', kbin_age,[0]),
    ('second', kbin_fare,[1])
])

In [9]:
x_train_trf = trf.fit_transform(x_train)
x_test_tef = trf.transform(x_test)

In [14]:
trf.named_transformers_['second'].bin_edges_

array([array([  0.    ,   7.75  ,   7.8958,  10.5   ,  13.    ,  16.    ,
               26.    ,  31.5   ,  58.41  ,  92.45  , 512.3292])         ],
      dtype=object)

In [16]:
output =  pd.DataFrame({
    'age' : x_train['Age'],
    'age_trf' : x_train_trf[:,0],
    'fare' : x_train['Fare'],
    'fare_trf' : x_train_trf[:,1]
})
output

Unnamed: 0,age,age_trf,fare,fare_trf
281,0.75,0.0,13.7750,4.0
96,76.00,9.0,78.8500,8.0
341,32.00,6.0,7.5792,0.0
18,27.00,4.0,7.9250,2.0
26,22.00,2.0,61.9792,8.0
...,...,...,...,...
237,20.00,2.0,7.2250,0.0
86,27.00,4.0,7.8792,1.0
134,43.00,7.0,7.8958,2.0
345,16.00,0.0,7.6500,0.0


In [18]:
output['age_labels'] = pd.cut(x=x_train['Age'], bins=trf.named_transformers_['first'].bin_edges_[0].tolist())
output['fare_labels'] = pd.cut(x=x_train['Fare'], bins=trf.named_transformers_['second'].bin_edges_[0].tolist())
output

Unnamed: 0,age,age_trf,fare,fare_trf,age_labels,fare_labels
281,0.75,0.0,13.7750,4.0,"(0.33, 17.0]","(13.0, 16.0]"
96,76.00,9.0,78.8500,8.0,"(50.0, 76.0]","(58.41, 92.45]"
341,32.00,6.0,7.5792,0.0,"(29.0, 32.0]","(0.0, 7.75]"
18,27.00,4.0,7.9250,2.0,"(25.2, 29.0]","(7.896, 10.5]"
26,22.00,2.0,61.9792,8.0,"(20.0, 23.0]","(58.41, 92.45]"
...,...,...,...,...,...,...
237,20.00,2.0,7.2250,0.0,"(17.0, 20.0]","(0.0, 7.75]"
86,27.00,4.0,7.8792,1.0,"(25.2, 29.0]","(7.75, 7.896]"
134,43.00,7.0,7.8958,2.0,"(37.0, 43.8]","(7.75, 7.896]"
345,16.00,0.0,7.6500,0.0,"(0.33, 17.0]","(0.0, 7.75]"


In [19]:
clf=DecisionTreeClassifier()
clf.fit(x_train_trf, y_train)
y_pred2=clf.predict(x_test_tef)
accuracy_score(y_test, y_pred2)

0.5223880597014925

In [20]:
clf = DecisionTreeClassifier()
x_trf = trf.fit_transform(x)
np.mean(cross_val_score(clf, x_trf, y, cv=10, scoring='accuracy'))

0.5590909090909092

## We can use different stratergy in Kbins & Similarly we can try Binarization in many scenarios as well