In [80]:
import pandas as pd
import numpy as np 

In [81]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer


In [82]:
df = pd.read_csv('tested.csv',usecols=['Age','Fare','Survived'])
df.dropna(inplace=True)

df.head()

Unnamed: 0,Survived,Age,Fare
0,0,34.5,7.8292
1,1,47.0,7.0
2,0,62.0,9.6875
3,0,27.0,8.6625
4,1,22.0,12.2875


In [83]:
df.isnull().sum()

Survived    0
Age         0
Fare        0
dtype: int64

In [84]:
X=df.drop(columns=['Survived'])
y=df['Survived']

In [85]:
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)


In [86]:
X_train

Unnamed: 0,Age,Fare
281,0.75,13.7750
96,76.00,78.8500
341,32.00,7.5792
18,27.00,7.9250
26,22.00,61.9792
...,...,...
237,20.00,7.2250
86,27.00,7.8792
134,43.00,7.8958
345,16.00,7.6500


In [87]:
clf=DecisionTreeClassifier(random_state=42)
clf.fit(X_train,y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [88]:
y_pred=clf.predict(X_test)


In [89]:
accuracy_score(y_test,y_pred)

0.582089552238806

In [90]:
np.mean(cross_val_score(DecisionTreeClassifier(),x,y,cv=10,scoring='accuracy'))



np.float64(0.5680926916221033)

In [91]:
kbin_age=KBinsDiscretizer(n_bins=15,encode='ordinal',strategy='quantile')
kbnin_fare=KBinsDiscretizer(n_bins=15,encode='ordinal',
                            strategy='quantile')


In [92]:
trf=ColumnTransformer([
    ('first',kbin_age,[0]),
    ('second',kbnin_fare,[1])   
])

In [93]:

X_train_trf=trf.fit_transform(X_train)
X_test_trf=trf.transform(X_test)    



In [94]:
# first bin boundaries calculated by kbindisctizer
trf.named_transformers_['first'].bin_edges_


array([array([ 0.33      , 13.53333333, 18.        , 20.        , 22.        ,
              24.        , 25.2       , 27.73333333, 30.        , 32.        ,
              36.        , 39.        , 43.8       , 47.93333333, 55.        ,
              76.        ])                                                   ],
      dtype=object)

In [95]:
# second bin boundaries calculated by kbindisctizer

trf.named_transformers_['second'].bin_edges_


array([array([  0.        ,   7.42554   ,   7.77805333,   7.8958    ,
                8.6625    ,  12.25276667,  13.        ,  14.89108667,
               21.        ,  26.        ,  27.7208    ,  39.        ,
               58.41      ,  78.67944667, 141.32528   , 512.3292    ])],
      dtype=object)

In [96]:
X_train

Unnamed: 0,Age,Fare
281,0.75,13.7750
96,76.00,78.8500
341,32.00,7.5792
18,27.00,7.9250
26,22.00,61.9792
...,...,...
237,20.00,7.2250
86,27.00,7.8792
134,43.00,7.8958
345,16.00,7.6500


In [97]:
X_train_trf

array([[ 0.,  6.],
       [14., 13.],
       [ 9.,  1.],
       [ 6.,  3.],
       [ 4., 12.],
       [ 2.,  4.],
       [ 3.,  6.],
       [ 5.,  4.],
       [ 3.,  3.],
       [11., 11.],
       [11., 11.],
       [ 8.,  6.],
       [11.,  0.],
       [11.,  9.],
       [13., 11.],
       [ 7.,  8.],
       [10., 10.],
       [ 2., 12.],
       [12., 13.],
       [ 2.,  0.],
       [13., 13.],
       [ 1., 12.],
       [ 9., 10.],
       [12.,  6.],
       [ 7.,  1.],
       [ 2., 11.],
       [ 5.,  1.],
       [ 8.,  9.],
       [ 9.,  5.],
       [ 8.,  1.],
       [ 6.,  4.],
       [ 2.,  6.],
       [ 8., 11.],
       [13., 14.],
       [ 8.,  7.],
       [ 8., 13.],
       [12., 11.],
       [13., 14.],
       [12.,  9.],
       [ 6.,  6.],
       [10.,  0.],
       [ 9.,  3.],
       [14.,  5.],
       [12., 10.],
       [ 7.,  2.],
       [ 9.,  3.],
       [ 9.,  2.],
       [ 2., 12.],
       [ 4.,  4.],
       [ 7., 14.],
       [11., 12.],
       [ 8.,  6.],
       [ 7.,

In [99]:
#converting numpy array to dataframe
output = pd.DataFrame({
    'age':X_train['Age'],
    'age_trf':X_train_trf[:,0],
    'fare':X_train['Fare'],
    'fare_trf':X_train_trf[:,1]
})

In [100]:
output

Unnamed: 0,age,age_trf,fare,fare_trf
281,0.75,0.0,13.7750,6.0
96,76.00,14.0,78.8500,13.0
341,32.00,9.0,7.5792,1.0
18,27.00,6.0,7.9250,3.0
26,22.00,4.0,61.9792,12.0
...,...,...,...,...
237,20.00,3.0,7.2250,0.0
86,27.00,6.0,7.8792,2.0
134,43.00,11.0,7.8958,3.0
345,16.00,1.0,7.6500,1.0


In [101]:
# adding bin labels to the dataframe using pd.cut method
output['age_labels'] = pd.cut(x=X_train['Age'],
                                    bins=trf.named_transformers_['first'].bin_edges_[0].tolist())
output['fare_labels'] = pd.cut(x=X_train['Fare'],
                                    bins=trf.named_transformers_['second'].bin_edges_[0].tolist())

In [102]:
# now train a decision tree classifier on binned data
clf=DecisionTreeClassifier(random_state=42)
clf.fit(X_train_trf,y_train)


0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [103]:
y_pred2 = clf.predict(X_test_trf)


In [104]:
accuracy_score(y_test,y_pred2)


0.5970149253731343

In [105]:
X_trf = trf.fit_transform(x)
np.mean(cross_val_score(DecisionTreeClassifier(),x,y,cv=10,scoring='accuracy'))




np.float64(0.5680926916221033)

In [112]:
def discretize(bins,strategy):
    kbin_age = KBinsDiscretizer(n_bins=bins,encode='ordinal',strategy=strategy)
    kbin_fare = KBinsDiscretizer(n_bins=bins,encode='ordinal',strategy=strategy)
    
    trf = ColumnTransformer([
        ('first',kbin_age,[0]),
        ('second',kbin_fare,[1])
    ])
    
    X_trf = trf.fit_transform(X)
    print(np.mean(cross_val_score(DecisionTreeClassifier(),X,y,cv=10,scoring='accuracy')))
    
    plt.figure(figsize=(14,4))
    plt.subplot(121)
    plt.hist(X['Age'])
    plt.title("Before")

    plt.subplot(122)
    plt.hist(X_trf[:,0],color='red')
    plt.title("After")

    plt.show()
    
    plt.figure(figsize=(14,4))
    plt.subplot(121)
    plt.hist(X['Fare'])
    plt.title("Before")

    plt.subplot(122)
    plt.hist(X_trf[:,1],color='red')
    plt.title("Fare")

    plt.show()