In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load seaborn's example "titanic" dataset into a DataFrame used by every later cell.
titanic = sns.load_dataset(name="titanic")

In [None]:
# Preview the first five rows of the raw table.
titanic.head(n=5)

In [None]:
# Column dtypes and non-null counts (info() prints its report directly).
titanic.info()

# Summary statistics of the numeric columns. The method has to be *called*,
# and it has to be the cell's last expression for the notebook to render it.
titanic.describe()

In [None]:
# Missing values per column, before any imputation.
titanic.isna().sum()

In [None]:
# Columns fed to the classifiers.
features = [
    "pclass",
    "sex",
    "fare",
    "embarked",
    "age",
]

# Kept as a one-element list, so selecting it from the frame yields a
# single-column DataFrame rather than a Series.
target = ["survived"]

In [None]:
# Missing data: fill the gaps in the two model features that contain nulls.
from sklearn.impute import SimpleImputer

imp_median = SimpleImputer(strategy="median")        # for "age"
imp_freq = SimpleImputer(strategy="most_frequent")   # for "embarked"

# NOTE(review): both imputers are fit on the whole table, i.e. before the
# train/test split further down, so rows that end up in the test set
# contribute to the fill values. Consider fitting on the training rows only.
for column, imputer in (("age", imp_median), ("embarked", imp_freq)):
    # Double brackets keep the selection 2-D, which is what fit_transform expects.
    titanic[[column]] = imputer.fit_transform(titanic[[column]])


In [None]:
# Re-check the per-column missing-value counts after imputation.
titanic.isna().sum()

In [None]:
#Encoding
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

titanic["sex"] = le.fit_transform(titanic["sex"])
titanic["embarked"] = le.fit_transform(titanic["embarked"])

In [None]:
# Preview the table again now that "sex" and "embarked" hold integer codes.
titanic.head(n=5)

In [None]:
# Design matrix: every row, only the selected feature columns.
X = titanic.loc[:, features]
# Target: `target` is a list, so this is a single-column DataFrame.
y = titanic.loc[:, target]

In [None]:
# Quick look at the first five target values.
y.head(n=5)

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# First five rows of the held-out feature matrix.
X_test.head(n=5)

In [None]:
#Decision Tree Model - no pruning
from sklearn.tree import DecisionTreeClassifier

# Baseline tree with no depth or size limits.
# random_state is pinned (same seed as the train/test split and the
# post-pruning trees) so that re-running the notebook gives the same tree
# and the same accuracy.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Predict on the held-out rows with whatever tree `model` currently refers to,
# then report the fraction of correct predictions.
y_pred = model.predict(X_test)

print("accuracy: ", accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
#Plotting: draw the top levels of the tree currently bound to `model`.

from sklearn.tree import plot_tree

# Explicit figure/axes instead of relying on pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(18, 10))
plot_tree(
    model,
    # Plain list rather than a pandas Index: some scikit-learn releases
    # validate this parameter strictly and reject an Index.
    feature_names=list(X.columns),
    class_names=["Died", "Survived"],
    filled=True,
    max_depth=4,  # limits only what is drawn; the fitted tree is unchanged
    ax=ax,
)

fig.tight_layout()
plt.show()

# Decision Tree with Pre-Pruning

In [None]:
# Pre-pruning sweep: fit one tree per depth limit and print its accuracy on
# the held-out split.
# NOTE(review): the depths are compared on the test split itself; a validation
# split or cross-validation would give a less optimistic comparison.
max_depths = [2,3,4,5,6,7,8,9,10]

for depth in max_depths:
    # Fixed seed so the printed accuracies are identical on every run and
    # differences between rows come from `max_depth` alone.
    model = DecisionTreeClassifier(max_depth=depth, random_state=42)
    model.fit(X_train, y_train)

    acc = model.score(X_test, y_test)
    print(f"for depth = {depth}, accuracy = {acc}")

In [None]:
# Pre-pruning sweep over `min_samples_split`, with the depth limit held at 4.
# NOTE(review): the candidates are compared on the test split itself; a
# validation split or cross-validation would give a less optimistic comparison.
min_samples_splits = [10,15,20,25,30,35,40]

for split in min_samples_splits:
    # Fixed seed so the printed accuracies are identical on every run.
    model = DecisionTreeClassifier(
        max_depth=4,
        min_samples_split=split,
        random_state=42,
    )
    model.fit(X_train, y_train)

    acc = model.score(X_test, y_test)
    # The value being varied here is min_samples_split, not the depth.
    print(f"for min_samples_split = {split}, accuracy = {acc}")

In [None]:
# Visualise the pre-pruned tree with a depth limit of 4.
max_depths = [2,3,4,5,6,7,8,9,10]  # kept: the candidate depths of the sweep
plot_depth = 4

# Only the depth-4 tree is drawn, so fit just that one instead of fitting a
# tree for every candidate depth and discarding all but one.
# `model` therefore ends up being the tree that is shown in the figure.
model = DecisionTreeClassifier(max_depth=plot_depth, random_state=42)
model.fit(X_train, y_train)

fig, ax = plt.subplots(figsize=(18, 10))
plot_tree(
    model,
    # Plain list rather than a pandas Index: some scikit-learn releases
    # validate this parameter strictly and reject an Index.
    feature_names=list(X.columns),
    class_names=["Died", "Survived"],
    filled=True,
    ax=ax,
)

fig.tight_layout()
plt.show()

In [None]:
# Visualise the pre-pruned tree with max_depth = 4 and min_samples_split = 10.
min_samples_splits = [10,15,20,25,30,35,40]  # kept: the candidates of the sweep
plot_split = 10

# Only the min_samples_split = 10 tree is drawn, so fit just that one instead
# of fitting a tree for every candidate and discarding all but one.
# `model` therefore ends up being the tree that is shown in the figure.
model = DecisionTreeClassifier(
    max_depth=4,
    min_samples_split=plot_split,
    random_state=42,
)
model.fit(X_train, y_train)

fig, ax = plt.subplots(figsize=(18, 10))
plot_tree(
    model,
    # Plain list rather than a pandas Index: some scikit-learn releases
    # validate this parameter strictly and reject an Index.
    feature_names=list(X.columns),
    class_names=["Died", "Survived"],
    filled=True,
    ax=ax,
)

fig.tight_layout()
plt.show()

# Decision Tree with Post-Pruning

In [None]:
# Unrestricted, seeded tree; its pruning path is computed in the next cell.
full_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
full_tree

In [None]:
# Cost-complexity pruning path computed from the training split; its
# `ccp_alphas` entry holds the candidate pruning strengths tried below.
path = full_tree.cost_complexity_pruning_path(X=X_train, y=y_train)
ccp_alphas = path["ccp_alphas"]
print(ccp_alphas)

In [None]:
# Fit one seeded tree per candidate alpha and keep it together with its alpha.
trees = []
for alpha in ccp_alphas:
    # fit() returns the estimator itself, so construction and fitting chain.
    model = DecisionTreeClassifier(ccp_alpha=alpha, random_state=42).fit(X_train, y_train)
    trees.append((model, alpha))
    

In [None]:
# Pick the pruning strength by 5-fold cross-validation on the TRAINING split,
# then score the chosen tree once on the held-out test split.
# Choosing alpha by test accuracy would tune the model on the very data used
# to report its accuracy, which makes that number optimistic.
from sklearn.model_selection import cross_val_score

best_cv_acc = -1.0   # best mean cross-validated accuracy seen so far
best_alpha = 0
best_model = None

# Flatten the target to 1-D for the cross-validation splitter
# (y_train is selected with a list of columns, i.e. it is a one-column frame).
y_train_1d = y_train.to_numpy().ravel()

for model, alpha in trees:
    # cross_val_score works on clones of the estimator, so the fitted trees
    # stored in `trees` are left untouched.
    cv_acc = cross_val_score(model, X_train, y_train_1d, cv=5).mean()
    if cv_acc > best_cv_acc:
        best_cv_acc = cv_acc
        best_alpha = alpha
        best_model = model

# Test accuracy of the selected tree; stays 0 when `trees` is empty.
best_acc = best_model.score(X_test, y_test) if best_model is not None else 0

In [None]:
# Report the best accuracy and the alpha that produced it, one value per line.
print(best_acc, best_alpha, sep="\n")