In [64]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.tree import plot_tree
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import graphviz

%matplotlib inline
# Apply seaborn's theme and make 10x7 the default size for every figure.
figure_rc = {"figure.figsize": (10, 7)}
sns.set(rc=figure_rc)


In [43]:
# Location of the Titanic training CSV.  Set the TITANIC_CSV environment
# variable to run the notebook on another machine; the default is the
# original author's local path, so existing setups keep working unchanged.
TITANIC_CSV = os.environ.get(
    "TITANIC_CSV",
    r"D:\Projects\PythonProjects\DS-Rush\data\titanic\train.csv",
)
titanic_data = pd.read_csv(TITANIC_CSV)

# Features: everything except the target and the columns that are not used
# as predictors.  Target: the `Survived` column.
X = titanic_data.drop(
    ["PassengerId", "Survived", "Name", "Ticket", "Cabin"], axis=1)
y = titanic_data.Survived

# One-hot encode the remaining non-numeric columns.
X = pd.get_dummies(X)

# Split BEFORE imputing, so the statistic used to fill missing ages is
# learned from the training rows only and nothing leaks from the hold-out set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Fill missing ages with the training-set median.  Reassigning (instead of
# `inplace=True`) keeps the cell idempotent when it is re-run.
age_median = X_train["Age"].median()
X_train = X_train.fillna({"Age": age_median})
X_test = X_test.fillna({"Age": age_median})
X = X.fillna({"Age": age_median})


In [44]:
# Metrics collected for every cross-validation fold.
scoring = [
    "precision_macro",
    "recall_macro",
    "f1_macro",
    "accuracy",
]

# Baseline model: a depth-1 tree (a single split) chosen by entropy.
clf = tree.DecisionTreeClassifier(max_depth=1, criterion="entropy")


In [45]:
# 3-fold cross-validation of the baseline tree on the training split,
# reporting every metric listed in `scoring`.
scores = cross_validate(
    estimator=clf,
    X=X_train,
    y=y_train,
    cv=3,
    scoring=scoring,
)
scores


{'fit_time': array([0.0019989 , 0.00199938, 0.00299692]),
 'score_time': array([0.00399804, 0.00499749, 0.00299788]),
 'test_precision_macro': array([0.78537839, 0.71472185, 0.80409024]),
 'test_recall_macro': array([0.78210811, 0.7052973 , 0.79086748]),
 'test_f1_macro': array([0.78364862, 0.7089439 , 0.79624531]),
 'test_accuracy': array([0.79899497, 0.73366834, 0.81313131])}

In [51]:
# Exhaustive grid of tree-size hyper-parameters to evaluate.
params = {
    "max_depth": range(1, 11),
    "min_samples_split": range(2, 11),
    # NOTE(review): this range starts at 2, while the later grids in this
    # notebook start at 1 -- confirm which lower bound is intended.
    "min_samples_leaf": range(2, 11)
}

# NOTE: from this cell on, `X` and `y` hold the iris data, not the Titanic
# features/target defined earlier in the notebook.
iris = load_iris()
X = iris.data
y = iris.target

# Seed the tree: scikit-learn permutes features randomly at every split, so
# an unseeded tree can make the "best" model differ from run to run.
clf = tree.DecisionTreeClassifier(random_state=42)

# Cross-validated search over every combination in `params`.
search = GridSearchCV(clf, param_grid=params)
search.fit(X=X, y=y)

In [53]:
# Keep the estimator that the search selected as best.
best_tree = search.best_estimator_

In [56]:
# Candidate values that the randomized search samples from.
params = {
    "max_depth": range(1, 11),
    "min_samples_split": range(2, 11),
    "min_samples_leaf": range(1, 11)
}

# NOTE: `X` and `y` are (re)bound to the iris data here, not the Titanic
# features/target defined earlier in the notebook.
iris = load_iris()
X = iris.data
y = iris.target

# Seed the tree so that fitting the same candidate twice gives the same model.
clf = tree.DecisionTreeClassifier(random_state=42)

# Seed the search as well: without `random_state` a different set of
# parameter combinations is sampled on every run, so the selected model
# (and everything derived from it) is not reproducible.
search = RandomizedSearchCV(clf, param_distributions=params, random_state=42)
search.fit(X=X, y=y)

In [58]:
# Keep the estimator that the randomized search selected as best and
# display it (the bare last expression renders its repr).
best_tree = search.best_estimator_
best_tree

In [None]:
# `train` and `test` are not defined anywhere in this notebook, so this cell
# used to raise NameError on a fresh "Restart & Run All".  The binary
# predictions and the 2x2 confusion matrix further down show that the
# following cells work on the Titanic split, so build the two frames from
# that split (feature columns plus a "y" target column).
# `assign` returns a new frame, so re-running this cell is safe.
train = X_train.assign(y=y_train)
test = X_test.assign(y=y_test)

# Separate target and features again, as the rest of the notebook expects.
y_train = train["y"]
X_train = train.drop("y", axis=1)

y_test = test["y"]
X_test = test.drop("y", axis=1)

In [None]:
# Exhaustive grid of tree-size hyper-parameters to evaluate.
params = {
    "max_depth": range(1, 11),
    "min_samples_split": range(2, 11),
    "min_samples_leaf": range(1, 11)
}

# Seed the tree: scikit-learn permutes features randomly at every split, so
# an unseeded tree can make the selected model -- and the predictions and
# confusion matrix computed from it below -- differ from run to run.
clf = tree.DecisionTreeClassifier(random_state=42)

# Cross-validated search on the training split only; the test split stays
# untouched until the final evaluation.
search = GridSearchCV(clf, param_grid=params)
search.fit(X=X_train, y=y_train)
best_tree = search.best_estimator_


In [62]:

# Predict a label for every row of the held-out split with the selected
# tree, then display the resulting array.
predictions = best_tree.predict(X_test)
predictions

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 0], dtype=int64)

In [65]:
# Compare the held-out labels with the predictions.  In scikit-learn's
# convention rows are the true classes and columns the predicted classes.
confusion_matrix(y_test, predictions)

array([[152,  23],
       [ 37,  83]], dtype=int64)