# **import Library**

In [16]:
import numpy as np
import pandas as pd

In [17]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score

from sklearn.compose import ColumnTransformer

# **Upload the Data**

In [26]:
df =pd.read_csv('/content/titanic_dataset.csv')[['Age','Fare','SibSp','Parch','Survived']]

# **Check Missing Value**

In [27]:
df.isnull().sum()

Age         177
Fare          0
SibSp         0
Parch         0
Survived      0
dtype: int64

# **Drop Missing Value**

In [28]:
df.dropna(inplace=True)

In [29]:
df.head()

Unnamed: 0,Age,Fare,SibSp,Parch,Survived
0,22.0,7.25,1,0,0
1,38.0,71.2833,1,0,1
2,26.0,7.925,0,0,1
3,35.0,53.1,1,0,1
4,35.0,8.05,0,0,0


In [30]:
df['family'] =df ['SibSp'] + df['Parch']

In [8]:
df.head()

Unnamed: 0,Age,Fare,SibSp,Parch,Survived,family
0,22.0,7.25,1,0,0,1
1,38.0,71.2833,1,0,1,1
2,26.0,7.925,0,0,1,0
3,35.0,53.1,1,0,1,1
4,35.0,8.05,0,0,0,0


# **Drop Columns**

In [31]:
df.drop(columns=['SibSp', 'Parch'], inplace=True)

In [32]:
df.head()

Unnamed: 0,Age,Fare,Survived,family
0,22.0,7.25,0,1
1,38.0,71.2833,1,1
2,26.0,7.925,1,0
3,35.0,53.1,1,1
4,35.0,8.05,0,0


In [33]:
X =df.drop(columns=['Survived'])
y =df['Survived']

# **Train and Test Dataset**

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [35]:
X_train.head()

Unnamed: 0,Age,Fare,family
328,31.0,20.525,2
73,26.0,14.4542,1
253,30.0,16.1,1
719,33.0,7.775,0
666,25.0,13.0,0


# **Without binarization**

In [52]:
# Without binarization

clf = DecisionTreeClassifier()

clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

accuracy_score(y_test,y_pred)

0.6363636363636364

In [61]:
X = X[:143]

In [62]:
np.mean(cross_val_score(DecisionTreeClassifier(),X,y,cv=10,scoring='accuracy'))

0.5514285714285714

In [57]:
print(X.shape)
print(y.shape)

(714, 3)
(143,)


# **Applying Binarization**

# **Binarization is the process of converting raw data into binary values to “efficiently represent raw data as a presence”.**

In [75]:
# Applying Binarization

from sklearn.preprocessing import Binarizer

In [76]:
trf =ColumnTransformer([
    ('bin', Binarizer(copy=False), ['family'])

], remainder='passthrough')

In [77]:
X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.transform(X_test)

In [78]:
pd.DataFrame(X_train_trf, columns=['family', 'Age', 'Fare'])

Unnamed: 0,family,Age,Fare
0,1.0,40.5,14.5000
1,1.0,40.0,9.4750
2,1.0,54.0,77.2875
3,1.0,7.0,39.6875
4,1.0,31.0,18.0000
...,...,...,...
109,1.0,46.0,61.1750
110,0.0,25.0,13.0000
111,0.0,55.0,16.0000
112,1.0,29.0,21.0000


In [81]:
clf = DecisionTreeClassifier()
clf.fit(X_train_trf,y_train)
y_pred2 = clf.predict(X_test_trf)

accuracy_score(y_test,y_pred2)

0.5172413793103449

In [82]:
print(len(y_test))
print(len(y_pred2))

29
29


In [83]:
y_pred2 =clf.predict(X_train_trf)

In [84]:
y_test = y_test[:len(y_pred2)]

In [73]:
y_pred2 =clf.predict(X_train_trf)

In [87]:

X_trf = trf.fit_transform(X)
np.mean(cross_val_score(DecisionTreeClassifier(),X_trf,y,cv=10,scoring='accuracy'))

0.5580952380952382