In [14]:
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

### Preprocessing

In [15]:

df = pd.read_csv("../datasets/titanic.csv")
# preprocessing: task 4
# 1. identifying categorical and numerical features
features = ['Sex', 'Embarked', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
categorical_features= ['Sex', 'Embarked']
numerical_features=['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

#2. preprocessing for numerical data, median to impute missing value and standard scaling for bringing mean to 0 and std var to 1
numerical_transformer= Pipeline(steps=
                                [('imputer' , SimpleImputer(strategy='median')),
                                ('scaler', StandardScaler())] )

# 3. preprocessing for categorical data
categorical_transformer=Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('OneHot', OneHotEncoder(handle_unknown='ignore'))
])

# 4. Bundle Processing
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ]
)


In [16]:
# seperating target variable 'y' and features 'x'
x= df[features]
y = df["Survived"]
#splitting into test and train
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size= 0.2)
# Preprocess the training and test data
X_train_preprocessed = preprocessor.fit_transform(x_train)
X_test_preprocessed = preprocessor.transform(x_test)


### Implementing Decision Tree

In [20]:

# training the model 
# consider techbiques such as cross validation for finding the optimal max_depth to avoid underfitting or overfitting
classifier = tree.DecisionTreeClassifier(max_depth= 4,random_state=42)
classifier.fit(X_train_preprocessed, y_train)

In [21]:
#visualing the tree

plt.figure(figsize=(20, 10), facecolor='k')

# Get feature names after preprocessing
feature_names_cat = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out()
feature_names = list(feature_names_cat) + numerical_features

# Ensure class names
class_names = [str(cls) for cls in y.unique()]

# Plot the decision tree
tree.plot_tree(classifier, feature_names=feature_names, class_names=class_names, rounded=True, filled=True, fontsize=14)

plt.show()

KeyError: 'onehot'

<Figure size 2000x1000 with 0 Axes>