In [1]:
# import numpy for numerical operations
import numpy as np

# import pandas for data handling and analysis
import pandas as pd

In [2]:
# import function to split data into train and test sets
from sklearn.model_selection import train_test_split

# import function for cross validation scoring
from sklearn.model_selection import cross_val_score

# import DecisionTreeClassifier model
from sklearn.tree import DecisionTreeClassifier

# import accuracy_score to evaluate model performance
from sklearn.metrics import accuracy_score

# import ColumnTransformer to apply different transformers to different columns
from sklearn.compose import ColumnTransformer

In [3]:
# load selected columns from csv file into dataframe
df = pd.read_csv('train.csv')[['Age', 'Fare', 'SibSp', 'Parch', 'Survived']]

In [4]:
# remove rows with missing values
df.dropna(inplace=True)

In [5]:
# display first 5 rows of dataframe
df.head()

Unnamed: 0,Age,Fare,SibSp,Parch,Survived
0,22.0,7.25,1,0,0
1,38.0,71.2833,1,0,1
2,26.0,7.925,0,0,1
3,35.0,53.1,1,0,1
4,35.0,8.05,0,0,0


In [6]:
# add new column Family as sum of SibSp and Parch
df['Family'] = df['SibSp'] + df['Parch']

In [7]:
# display first 5 rows of dataframe
df.head()

Unnamed: 0,Age,Fare,SibSp,Parch,Survived,Family
0,22.0,7.25,1,0,0,1
1,38.0,71.2833,1,0,1,1
2,26.0,7.925,0,0,1,0
3,35.0,53.1,1,0,1,1
4,35.0,8.05,0,0,0,0


In [8]:
# drop SibSp and Parch columns after creating Family
df.drop(columns=['SibSp', 'Parch'], inplace=True)

In [9]:
# display first 5 rows of dataframe
df.head()

Unnamed: 0,Age,Fare,Survived,Family
0,22.0,7.25,0,1
1,38.0,71.2833,1,1
2,26.0,7.925,1,0
3,35.0,53.1,1,1
4,35.0,8.05,0,0


In [10]:
# separate features and target variable
x = df.drop(columns=['Survived'])  # features: Age, Fare, Family
y = df['Survived']                 # target: Survived

In [11]:
# split data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [12]:
# display first 5 rows of training features
x_train.head()

Unnamed: 0,Age,Fare,Family
328,31.0,20.525,2
73,26.0,14.4542,1
253,30.0,16.1,1
719,33.0,7.775,0
666,25.0,13.0,0


## **Without Binarization**

In [13]:
# create DecisionTreeClassifier model
clf = DecisionTreeClassifier()

# train model on training data
clf.fit(x_train, y_train)

# predict target values for test data
y_pred = clf.predict(x_test)

# calculate accuracy of the model on test data
accuracy_score(y_test, y_pred)

0.6153846153846154

In [14]:
# perform 10 fold cross validation using DecisionTreeClassifier
# calculate accuracy for each fold and take mean
np.mean(cross_val_score(DecisionTreeClassifier(), x, y, cv=10, scoring='accuracy'))

np.float64(0.6401017214397495)

## **Applying Binarization**

In [15]:
# import Binarizer to convert continuous values into 0 or 1
from sklearn.preprocessing import Binarizer

In [16]:
# create ColumnTransformer to apply Binarizer on Age column
trf = ColumnTransformer([
    ('bin', Binarizer(copy=False), ['Age'])  # binarize Age to 0/1 based on threshold
], remainder='passthrough')  # keep other columns unchanged

In [17]:
# apply ColumnTransformer on training data
x_train_trf = trf.fit_transform(x_train)

# apply same transformation on test data
x_test_trf = trf.transform(x_test)

In [18]:
# convert transformed array to dataframe with column names
pd.DataFrame(x_train_trf, columns=['Age', 'Fare', 'Family'])

Unnamed: 0,Age,Fare,Family
0,1.0,20.5250,2.0
1,1.0,14.4542,1.0
2,1.0,16.1000,1.0
3,1.0,7.7750,0.0
4,1.0,13.0000,0.0
...,...,...,...
566,1.0,61.1750,1.0
567,1.0,13.0000,0.0
568,1.0,134.5000,0.0
569,1.0,20.5250,2.0


In [19]:
# create DecisionTreeClassifier model
clf = DecisionTreeClassifier()

# train model on transformed training data
clf.fit(x_train_trf, y_train)

# predict target values for transformed test data
y_pred2 = clf.predict(x_test_trf)

# calculate accuracy of model on transformed test data
accuracy_score(y_test, y_pred2)

0.6783216783216783

In [20]:
# transform full feature set using column transformer
x_trf = trf.fit_transform(x)

# perform 10 fold cross validation on transformed data
# calculate average accuracy across all folds
np.mean(cross_val_score(DecisionTreeClassifier(), x_trf, y, cv=10, scoring='accuracy'))

np.float64(0.676643192488263)