In [95]:
#import libraries

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [96]:
#loading dataset
titanic_data = sns.load_dataset('titanic')
titanic_data.head(5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [97]:
#handling null values
titanic_data['age'].fillna(titanic_data['age'].median(), inplace=True)
titanic_data['embarked'].fillna(titanic_data['embarked'].mode()[0], inplace=True)
titanic_data.drop('deck', axis=1, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_data['age'].fillna(titanic_data['age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_data['embarked'].fillna(titanic_data['embarked'].mode()[0], inplace=True)


In [98]:
titanic_data['family_size'] = titanic_data['sibsp'] + titanic_data['parch'] + 1
titanic_data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone,family_size
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False,2
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False,2
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True,1
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False,2
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,Southampton,no,True,1
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True,1
888,0,3,female,28.0,1,2,23.4500,S,Third,woman,False,Southampton,no,False,4
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True,1


In [99]:
titanic_data['is_alone'] = 1
titanic_data['is_alone'].loc[titanic_data['family_size'] > 1 ] = 0

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  titanic_data['is_alone'].loc[titanic_data['family_size'] > 1 ] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-cop

In [100]:
titanic_data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone,family_size,is_alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False,2,0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False,2,0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True,1,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False,2,0
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True,1,1


In [101]:
titanic_data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone,family_size,is_alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False,2,0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False,2,0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True,1,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False,2,0
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True,1,1


In [102]:
#Basic encoding for 'Sex' as 0 for male and 1 for female
titanic_data['sex'] = titanic_data['sex'].map({'male':0, 'female':1})
titanic_data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone,family_size,is_alone
0,0,3,0,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False,2,0
1,1,1,1,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False,2,0
2,1,3,1,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True,1,1
3,1,1,1,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False,2,0
4,0,3,0,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.0,0,0,13.0000,S,Second,man,True,Southampton,no,True,1,1
887,1,1,1,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True,1,1
888,0,3,1,28.0,1,2,23.4500,S,Third,woman,False,Southampton,no,False,4,0
889,1,1,0,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True,1,1


In [103]:
# One-hot encoding
titanic_data = pd.get_dummies(titanic_data, columns=['embarked'], drop_first=True)
titanic_data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,class,who,adult_male,embark_town,alive,alone,family_size,is_alone,embarked_Q,embarked_S
0,0,3,0,22.0,1,0,7.2500,Third,man,True,Southampton,no,False,2,0,False,True
1,1,1,1,38.0,1,0,71.2833,First,woman,False,Cherbourg,yes,False,2,0,False,False
2,1,3,1,26.0,0,0,7.9250,Third,woman,False,Southampton,yes,True,1,1,False,True
3,1,1,1,35.0,1,0,53.1000,First,woman,False,Southampton,yes,False,2,0,False,True
4,0,3,0,35.0,0,0,8.0500,Third,man,True,Southampton,no,True,1,1,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.0,0,0,13.0000,Second,man,True,Southampton,no,True,1,1,False,True
887,1,1,1,19.0,0,0,30.0000,First,woman,False,Southampton,yes,True,1,1,False,True
888,0,3,1,28.0,1,2,23.4500,Third,woman,False,Southampton,no,False,4,0,False,True
889,1,1,0,26.0,0,0,30.0000,First,man,True,Cherbourg,yes,True,1,1,False,False


In [104]:
#Target variables
y = titanic_data['survived']

#Features
X = titanic_data.drop('survived', axis=1)

In [105]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [106]:
#Split the dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [107]:
# One-hot encode remaining categorical columns in X
X = pd.get_dummies(X, columns=['class', 'who', 'adult_male', 'embark_town', 'alive', 'alone'], drop_first=True)

# Split the dataset into training and testing again after encoding X
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Initialize the classifier and train the model
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

In [108]:
#Predict on the test data
y_pred = dt_classifier.predict(X_test)

In [109]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the decision tree classifier: {accuracy: .2f}")

Accuracy of the decision tree classifier:  1.00
