In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')

In [2]:
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer

In [3]:
df = pd.read_csv(r'C:\Users\AmiteshOP\Downloads\train.csv')

In [4]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [39]:
df=df[['Age','Fare','Survived']]
df

Unnamed: 0,Age,Fare,Survived
0,22.0,7.2500,0
1,38.0,71.2833,1
2,26.0,7.9250,1
3,35.0,53.1000,1
4,35.0,8.0500,0
...,...,...,...
885,39.0,29.1250,0
886,27.0,13.0000,0
887,19.0,30.0000,1
889,26.0,30.0000,1


In [6]:
df.dropna(inplace=True)

In [7]:
df.shape

(714, 3)

In [13]:
y=df['Survived']
X=df.drop('Survived',axis=1)

In [14]:
# train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,
                                                 random_state=42)

In [40]:
X_train

Unnamed: 0,Age,Fare
328,31.0,20.5250
73,26.0,14.4542
253,30.0,16.1000
719,33.0,7.7750
666,25.0,13.0000
...,...,...
92,46.0,61.1750
134,25.0,13.0000
337,41.0,134.5000
548,33.0,20.5250


In [17]:
# simple using ml Algo

dt = DecisionTreeClassifier()

dt.fit(X_train,y_train)
y_pred= dt.predict(X_test)

print('DT',accuracy_score(y_test,y_pred))

DT 0.6363636363636364


In [19]:
# cross validation
dt = DecisionTreeClassifier()

np.mean(cross_val_score(dt,X,y,scoring='accuracy',cv=10))

0.6274452269170578

# KbinDiscretizer

In [46]:
kbin_age = KBinsDiscretizer(n_bins=5, encode='ordinal',strategy='quantile')
kbin_fare = KBinsDiscretizer(n_bins=5, encode='ordinal',strategy='quantile')

In [47]:
trf= ColumnTransformer([
    ('first',kbin_age,[0]),
    ('second',kbin_fare,[1])
])

In [48]:
trf

In [49]:
X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.transform(X_test)

In [50]:
trf.named_transformers_['first'].bin_edges_

array([array([ 0.42, 19.  , 25.  , 32.  , 42.  , 80.  ])], dtype=object)

In [51]:
trf.named_transformers_['first'].bin_edges_

array([array([ 0.42, 19.  , 25.  , 32.  , 42.  , 80.  ])], dtype=object)

In [52]:
output= pd.DataFrame({
    'age':X_train['Age'],
    'age_trf':X_train_trf[:,0],
    'fare':X_train['Fare'],
    'fare_trf':X_train_trf[:,1]
})

In [54]:
output['age_labels'] = pd.cut(x=X_train['Age'],
                                    bins=trf.named_transformers_['first'].bin_edges_[0].tolist())
output['fare_labels'] = pd.cut(x=X_train['Fare'],
                                    bins=trf.named_transformers_['second'].bin_edges_[0].tolist())

In [55]:
output

Unnamed: 0,age,age_trf,fare,fare_trf,age_labels,fare_labels
328,31.0,2.0,20.5250,2.0,"(25.0, 32.0]","(13.0, 26.0]"
73,26.0,2.0,14.4542,2.0,"(25.0, 32.0]","(13.0, 26.0]"
253,30.0,2.0,16.1000,2.0,"(25.0, 32.0]","(13.0, 26.0]"
719,33.0,3.0,7.7750,0.0,"(32.0, 42.0]","(0.0, 7.896]"
666,25.0,2.0,13.0000,2.0,"(19.0, 25.0]","(7.896, 13.0]"
...,...,...,...,...,...,...
92,46.0,4.0,61.1750,4.0,"(42.0, 80.0]","(51.479, 512.329]"
134,25.0,2.0,13.0000,2.0,"(19.0, 25.0]","(7.896, 13.0]"
337,41.0,3.0,134.5000,4.0,"(32.0, 42.0]","(51.479, 512.329]"
548,33.0,3.0,20.5250,2.0,"(32.0, 42.0]","(13.0, 26.0]"


In [56]:
output['age_trf'].unique()

array([2., 3., 1., 4., 0.])