In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer

In [4]:
df = pd.read_csv("Titanic-Dataset.csv", usecols=['Age', 'Fare', 'Survived'])

In [11]:
df.head()
# df.shape

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [6]:
df.dropna(inplace=True)

In [8]:
df.shape

(714, 3)

In [14]:
X = df.iloc[:,1:]
y = df['Survived']

In [15]:
X

Unnamed: 0,Age,Fare
0,22.0,7.2500
1,38.0,71.2833
2,26.0,7.9250
3,35.0,53.1000
4,35.0,8.0500
...,...,...
885,39.0,29.1250
886,27.0,13.0000
887,19.0,30.0000
889,26.0,30.0000


In [16]:
y

0      0
1      1
2      1
3      1
4      0
      ..
885    0
886    0
887    1
889    1
890    0
Name: Survived, Length: 714, dtype: int64

In [17]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [28]:
X_train.head(1)

Unnamed: 0,Age,Fare
328,31.0,20.525


In [18]:
clf = DecisionTreeClassifier()

In [22]:
clf.fit(X_train,y_train)
y_pred = clf.predict(X_train)
y_pred1 = clf.predict(X_test)

In [24]:
accuracy_score(y_pred,y_train),accuracy_score(y_pred1,y_test)

(0.9754816112084063, 0.6293706293706294)

In [25]:
np.mean(cross_val_score(clf,X,y,cv=10,scoring='accuracy'))

np.float64(0.6260758998435054)

In [61]:
kage = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='kmeans')
kfare = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile', quantile_method='averaged_inverted_cdf')

In [62]:
trf = ColumnTransformer([
    ('first',kage,[0]),
    ('second',kfare,[1])
])

In [63]:
X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.transform(X_test)



In [76]:
X_train_trf

array([[3., 5.],
       [3., 4.],
       [3., 5.],
       ...,
       [5., 9.],
       [4., 5.],
       [4., 2.]], shape=(571, 2))

In [64]:
trf.n_features_in_, trf.feature_names_in_

(2, array(['Age', 'Fare'], dtype=object))

In [65]:
trf.named_transformers_['first'].n_bins_, trf.named_transformers_['first'].bin_edges_

(array([10]),
 array([array([ 0.42      ,  8.95396298, 17.85460643, 24.83453636, 32.41986162,
               40.34522257, 48.11193503, 56.08004386, 64.26754386, 72.58333333,
               80.        ])                                                   ],
       dtype=object))

In [77]:
output = pd.DataFrame({
    'age': X_train['Age'],
    'age_trf': X_train_trf[:,0],
    'fare': X_train['Fare'],
    'fare_trf': X_train_trf[:,1],
})

In [78]:
output.sample(5)

Unnamed: 0,age,age_trf,fare,fare_trf
205,2.0,0.0,10.4625,3.0
401,26.0,3.0,8.05,2.0
70,32.0,3.0,10.5,3.0
34,28.0,3.0,82.1708,9.0
550,17.0,1.0,110.8833,9.0


In [83]:
output['age_range'] = pd.cut(x=X_train['Age'], bins=trf.named_transformers_['first'].bin_edges_[0].tolist())
output['fare_range'] = pd.cut(x=X_train['Fare'], bins=trf.named_transformers_['second'].bin_edges_[0].tolist())

In [85]:
output.head(5)

Unnamed: 0,age,age_trf,fare,fare_trf,age_range,fare_range
328,31.0,3.0,20.525,5.0,"(24.835, 32.42]","(15.75, 26.0]"
73,26.0,3.0,14.4542,4.0,"(24.835, 32.42]","(13.0, 15.75]"
253,30.0,3.0,16.1,5.0,"(24.835, 32.42]","(15.75, 26.0]"
719,33.0,4.0,7.775,1.0,"(32.42, 40.345]","(7.75, 7.896]"
666,25.0,3.0,13.0,4.0,"(24.835, 32.42]","(9.225, 13.0]"


In [86]:
clf = DecisionTreeClassifier()
clf.fit(X_train_trf,y_train)
y_pred2 = clf.predict(X_test_trf)

In [88]:
accuracy_score(y_pred2,y_test)

0.6433566433566433

In [90]:
np.mean(cross_val_score(clf,X,y,cv=10,scoring='accuracy'))

np.float64(0.6303012519561815)