In [None]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [None]:
# The iris data set ships with sklearn, so no download is needed.
iris = load_iris()

# Hold out part of the data for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=3)

# A decision tree is a supervised learning algorithm that builds a tree of
# rules learned from the data set. The root of the tree holds the entire
# population, and every leaf carries a class label. Each internal node
# represents a test on an attribute, and each branch leaving it is one
# outcome of that test.
#
# Here we classify 3 different types of iris plants from a handful of
# features. As an illustration, the split at the top could be on sepal width:
# if one type of iris tended to have a sepal width below 3cm and the other two
# above 3cm, that single test would separate the data into two sub-sections.
# The process repeats on each sub-section with increasingly specific feature
# splits until we reach the leaf nodes, which carry the class labels.
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Score on both splits so the gap between them (overfitting) is visible.
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)

print("accuracy of decision tree (train): {}".format(train_accuracy))
print("accuracy of decision tree (test): {}".format(test_accuracy))

In [None]:
# We have seen that the previous model was overfitting. This was because the
# model essentially memorized the training data by growing until the leaf
# nodes at the bottom were pure. That leads to very narrow classification
# ranges for unseen values.

# To combat the overfitting, we limit the tree depth to 3.
# Other parameters can be used to restrict the tree as well:
#   - min_samples_leaf: the minimum number of samples that must end up in a
#     leaf node (min_samples_split is the related parameter for the minimum
#     number of samples a node must have in order to be split).
#   - max_leaf_nodes: the maximum number of leaf nodes the tree can have.
clf2 = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)

# Evaluate the depth-limited model (clf2), not the unrestricted tree from the
# previous cell; otherwise the numbers would not reflect the max_depth change.
print("accuracy of decision tree (train): {}".format(clf2.score(X_train, y_train)))
print("accuracy of decision tree (test): {}".format(clf2.score(X_test, y_test)))

# Feature importance describes how much each feature contributes to the
# tree's classification decisions. A value closer to 1 signifies a higher
# importance, while a value closer to zero signifies a lower importance.
print("feature importances: {}".format(clf2.feature_importances_))