In [1]:
# import NumPy and pandas for data handling
import numpy as np
import pandas as pd

# import function to split dataset into training and testing sets
from sklearn.model_selection import train_test_split

# import SimpleImputer to handle missing values
from sklearn.impute import SimpleImputer

# import OneHotEncoder to convert categorical variables to numeric
from sklearn.preprocessing import OneHotEncoder

# import MinMaxScaler to scale features between 0 and 1
from sklearn.preprocessing import MinMaxScaler

# import DecisionTreeClassifier for classification tasks
from sklearn.tree import DecisionTreeClassifier

In [2]:
# load the dataset from 'train.csv' into a DataFrame
df = pd.read_csv('train.csv')

# display the first 5 rows of the dataset
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# drop columns that are not useful for modeling
df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)

# display the first 5 rows of the dataset
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


## **Train Test Split**

In [4]:
# split the dataset into training and testing sets
# features: all columns except 'Survived'
# target: 'Survived' column
# 80% training, 20% testing, with fixed random state for reproducibility
x_train, x_test, y_train, y_test = train_test_split(
    df.drop('Survived', axis=1),
    df['Survived'],
    test_size=0.2,
    random_state=42
)

In [5]:
# display the first 5 rows of the training feature data
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [6]:
# display the first 5 values of the training target (labels)
y_train.head()

Unnamed: 0,Survived
331,0
733,0
382,0
704,0
813,0


## **Apply Imputation**

In [7]:
# count missing (null) values in each column of the dataset
df.isnull().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,177
SibSp,0
Parch,0
Fare,0
Embarked,2


In [8]:
# create SimpleImputer for 'Age' column (default strategy = mean)
si_age = SimpleImputer()

# create SimpleImputer for 'Embarked' column using the most frequent value
si_embarked = SimpleImputer(strategy='most_frequent')

# fit and transform 'Age' & 'Embarked' column in training data
x_train_age = si_age.fit_transform(x_train[['Age']])
x_train_embarked = si_embarked.fit_transform(x_train[['Embarked']])

# transform 'Age' & 'Embarked' column in test data using same imputer from training respectively
x_test_age = si_age.transform(x_test[['Age']])
x_test_embarked = si_embarked.transform(x_test[['Embarked']])

In [9]:
# display the imputed 'Age' values from training data
x_train_age

array([[45.5       ],
       [23.        ],
       [32.        ],
       [26.        ],
       [ 6.        ],
       [24.        ],
       [45.        ],
       [29.        ],
       [29.49884615],
       [29.49884615],
       [42.        ],
       [36.        ],
       [33.        ],
       [17.        ],
       [29.        ],
       [50.        ],
       [35.        ],
       [38.        ],
       [34.        ],
       [17.        ],
       [11.        ],
       [61.        ],
       [30.        ],
       [ 7.        ],
       [63.        ],
       [20.        ],
       [29.49884615],
       [29.        ],
       [36.        ],
       [29.49884615],
       [50.        ],
       [27.        ],
       [30.        ],
       [33.        ],
       [29.49884615],
       [29.49884615],
       [ 2.        ],
       [25.        ],
       [51.        ],
       [25.        ],
       [29.49884615],
       [29.49884615],
       [24.        ],
       [18.        ],
       [29.49884615],
       [25

In [10]:
# display the imputed 'Embarked' values from training data
x_train_embarked

array([['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['Q'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['Q'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['Q'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
      

## **One Hot Encoding**

In [11]:
# create OneHotEncoder objects for 'Sex' and 'Embarked' columns
ohe_sex = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe_embarked = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# fit and transform the 'Sex' column from training data
x_train_sex = ohe_sex.fit_transform(x_train[['Sex']])

# fit and transform the imputed 'Embarked' column from training data
x_train_embarked = ohe_embarked.fit_transform(x_train_embarked)

# transform the 'Sex' column in test data using the same encoder
x_test_sex = ohe_sex.transform(x_test[['Sex']])

# transform the imputed 'Embarked' column in test data using the same encoder
x_test_embarked = ohe_embarked.transform(x_test_embarked)

In [12]:
# display the one-hot encoded values for the 'Sex' column in the training set
x_train_sex

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]])

In [13]:
# display the one-hot encoded values for the 'Embarked' column in the training set
x_train_embarked

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

## **Joining Remaining Columns**

In [14]:
# drop the already processed columns ('Age', 'Sex', 'Embarked') from training features
x_train_rem = x_train.drop(columns=['Age', 'Sex', 'Embarked'])

# do the same for test features
x_test_rem = x_test.drop(columns=['Age', 'Sex', 'Embarked'])

In [15]:
# combine all processed training features: remaining columns + age + sex + embarked
x_train_transformed = np.concatenate((x_train_rem, x_train_age, x_train_sex, x_train_embarked), axis=1)

# combine all processed test features similarly
x_test_transformed = np.concatenate((x_test_rem, x_test_age, x_test_sex, x_test_embarked), axis=1)

In [16]:
# display the full transformed training feature matrix
x_train_transformed

array([[1., 0., 0., ..., 0., 0., 1.],
       [2., 0., 0., ..., 0., 0., 1.],
       [3., 0., 0., ..., 0., 0., 1.],
       ...,
       [3., 2., 0., ..., 0., 0., 1.],
       [1., 1., 2., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 0., 1.]])

In [17]:
# check the shape of transformed training and test feature matrices
x_train_transformed.shape, x_test_transformed.shape

((712, 10), (179, 10))

## **Model Training**

In [18]:
# create a Decision Tree classifier
clf = DecisionTreeClassifier()

# train the model on the transformed training data
clf.fit(x_train_transformed, y_train)

## **Model Testing**

In [19]:
# use the trained Decision Tree model to make predictions on the test data
y_pred = clf.predict(x_test_transformed)

# display the predicted values (0 = not survived, 1 = survived)
y_pred

array([0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1])

## **Model Performance**

In [20]:
# import the accuracy_score function from scikit-learn
from sklearn.metrics import accuracy_score

# calculate and print the accuracy of model predictions on test data
accuracy_score(y_test, y_pred)

0.7821229050279329

## **Pickling**

In [21]:
# import 'os' module to handle folder and file operations
import os

# import 'pickle' module to save and load Python objects like models and encoders
import pickle

# create the 'models' folder if it doesn't already exist
os.makedirs('models', exist_ok=True)

# save the fitted OneHotEncoder for the 'Sex' column to a file
pickle.dump(ohe_sex, open('models/ohe_sex.pkl', 'wb'))

# save the fitted OneHotEncoder for the 'Embarked' column to a file
pickle.dump(ohe_embarked, open('models/ohe_embarked.pkl', 'wb'))

# save the trained Decision Tree model to a file
pickle.dump(clf, open('models/clf.pkl', 'wb'))
