# Decision Tree

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Exploratory Data Analysis and Visualization

In [None]:
# Read the penguin measurements and discard incomplete rows in one chained step.
df = pd.read_csv("../../common-data/penguins_size.csv").dropna()
df.head()

In [None]:
# List the distinct labels in the `species` column (used as the target below).
df['species'].unique()

In [None]:
# Culmen length against culmen depth, coloured by species.
sns.scatterplot(data=df, x='culmen_length_mm', y='culmen_depth_mm', hue='species')

In [None]:
# Grid of pairwise plots over the DataFrame's columns, coloured by species.
sns.pairplot(df, hue='species')

In [None]:
# Box plots of culmen length per species, one panel per value of `sex`.
sns.catplot(data=df, x='species', y='culmen_length_mm', kind='box', col='sex')

## Train and Test Splits

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target: the species label.
y = df['species']
# Features: every other column, with non-numeric columns one-hot encoded.
# drop_first=True removes one indicator per encoded column.
X = pd.get_dummies(df.drop(columns='species'), drop_first=True)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=99)

## Creating the model

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Baseline tree with default hyperparameters.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split; without a seed, ties between equally good splits can be broken
# differently from run to run, changing the tree and its feature importances.
model = DecisionTreeClassifier(random_state=99)

model.fit(X_train, y_train)

## Predictions on the test data

In [None]:
# Predict a species label for every row of the held-out test features.
y_pred = model.predict(X_test)
y_pred

## Model Performance

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

### Confusion matrix

In [None]:
# Confusion matrix on the test set: rows are true labels, columns are predictions.
cnf_matrix = confusion_matrix(y_test, y_pred)
cnf_matrix

In [None]:
# Label the axes with the species names instead of the default 0..n-1 indices.
# model.classes_ is in sorted label order, which is the order confusion_matrix
# uses by default. NOTE(review): this assumes every class in model.classes_
# appears in y_test or y_pred, so the matrix has one row/column per class.
disp = ConfusionMatrixDisplay(confusion_matrix=cnf_matrix, display_labels=model.classes_)
disp.plot()

### Classification report

In [None]:
# Per-class precision, recall and F1 score on the held-out test set.
print(classification_report(y_test, y_pred))

### Feature importances

In [None]:
# Raw importance scores of the fitted tree, one value per feature column.
model.feature_importances_

In [None]:
# Importance scores labelled by feature name, sorted from least to most important.
(
    pd.DataFrame(
        data=model.feature_importances_,
        index=X.columns,
        columns=['Feature Importance'],
    )
    .sort_values(by='Feature Importance')
)

In [None]:
# Distribution of body mass for each species.
sns.boxplot(data=df, x='species', y='body_mass_g')

## Visualize the Tree

In [None]:
from sklearn.tree import plot_tree

In [None]:
# Large, high-resolution canvas so the node text of the full tree stays legible.
plt.figure(figsize=(12, 8), dpi=200)
# Pass the feature names as a plain list: some scikit-learn releases validate
# `feature_names` as a list and reject a pandas Index.
# The trailing semicolon suppresses the notebook's echo of the return value.
plot_tree(model, feature_names=list(X.columns), filled=True);

## Reporting Model Results

In [None]:
def report_model(model):
    """Print a classification report for ``model`` and draw its decision tree.

    Relies on the notebook-level ``X``, ``X_test`` and ``y_test`` defined in
    earlier cells.

    Parameters
    ----------
    model
        A fitted tree estimator that supports ``predict`` and is accepted by
        ``plot_tree``.

    Returns
    -------
    None
        The report is printed and the tree is drawn on a new figure.
    """
    model_preds = model.predict(X_test)
    print(classification_report(y_test, model_preds))
    print('\n')
    plt.figure(figsize=(12, 8))
    # Pass the feature names as a plain list: some scikit-learn releases
    # validate `feature_names` as a list and reject a pandas Index.
    plot_tree(model, feature_names=list(X.columns), filled=True)

### Experiment with Max Depth

In [None]:
# Limit the tree to two levels of splits.
# random_state is fixed so repeated runs build the same tree; scikit-learn
# permutes candidate features at each split, so ties can otherwise resolve
# differently between runs.
pruned_tree = DecisionTreeClassifier(max_depth=2, random_state=99)
pruned_tree.fit(X_train, y_train)

report_model(pruned_tree)

### Experiment with Max Leaf Nodes

In [None]:
# Allow at most three leaf nodes.
# random_state is fixed so repeated runs build the same tree; scikit-learn
# permutes candidate features at each split, so ties can otherwise resolve
# differently between runs.
max_leaf_tree = DecisionTreeClassifier(max_leaf_nodes=3, random_state=99)
max_leaf_tree.fit(X_train, y_train)

report_model(max_leaf_tree)

### Experiment with Criterion

In [None]:
# Use entropy instead of the default split criterion.
# random_state is fixed so repeated runs build the same tree; scikit-learn
# permutes candidate features at each split, so ties can otherwise resolve
# differently between runs.
entropy_tree = DecisionTreeClassifier(criterion='entropy', random_state=99)
entropy_tree.fit(X_train, y_train)

report_model(entropy_tree)