# A quick introduction to Decision Trees using Python

### Import Packages

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
from sklearn.metrics import mean_squared_error

### Read and Plot Data

In [None]:
# Remote CSV holding the sample x/y data (the URL embeds a fixed revision hash).
data_url = 'https://bitbucket.org/vishal_derive/vcu-data-mining/raw/5b4bf03e5a3894c25ee67075f16850853f38ca3e/data/linreg_sample.csv'
df = pd.read_csv(data_url)

# Quick look at the first rows before plotting.
print(df.head())

# Scatter plot of the raw data on a 12x9 inch figure.
sns.set(style='darkgrid')
fig = plt.figure()
fig.set_size_inches(12, 9)
plt.scatter(df['x'], df['y'], s=100, color='lightcoral')
plt.title('Sample Data for Simple Linear Regression', fontsize=14, weight='semibold')
plt.xlabel('x', fontsize=14)
plt.ylabel('y', fontsize=14)
plt.show()

## Decision Tree

Step 1: Create a decision tree object.

In [None]:
# NOTE: despite its name, max_d is used as the cap on the number of leaf nodes
# (max_leaf_nodes), not as a depth limit.
max_d = 3

# Fixed random_state keeps the fitted tree reproducible across runs.
dtree = tree.DecisionTreeRegressor(
    random_state=314,
    max_leaf_nodes=max_d,
)

Step 2: Fit the model.

In [None]:
# scikit-learn expects a 2-D feature matrix. Convert the Series to a NumPy array
# before adding the column axis: multi-dimensional indexing directly on a Series
# (df.x[:, np.newaxis]) raises in pandas >= 2.0.
dtree.fit(df.x.to_numpy()[:, np.newaxis], df.y)

Step 3: Make predictions.

In [None]:
# Predict on the same x values, reshaped to an (n_samples, 1) array.
# The Series is converted with .to_numpy() first because df.x[:, np.newaxis]
# (multi-dimensional indexing on a Series) raises in pandas >= 2.0.
df['y_hat'] = dtree.predict(df.x.to_numpy()[:, np.newaxis])

Plot the original data points and the predictions.

In [None]:
sns.set(style='darkgrid')
plt.figure(figsize=(12, 9))

# Observed data (large coral markers) with the tree's predictions drawn on top.
plt.scatter(df.x, df.y, color='lightcoral', s=100, alpha=.8)
# max_d is passed to the regressor as max_leaf_nodes, so the legend reports it
# as a leaf count rather than a depth.
plt.scatter(df.x, df.y_hat, color='forestgreen', label=f'Decision Tree (max leaf nodes={max_d})')

# Shade three x-intervals. The boundaries 0.48 and 0.8 are hard-coded --
# presumably the fitted tree's split points; verify against the tree plot
# if the data or the model settings change.
plt.axvspan(0, .48, alpha=0.1, color='red')
plt.axvspan(.48, .8, alpha=0.1, color='yellow')
plt.axvspan(0.8, 1, alpha=0.1, color='orange')

plt.xlabel('x', fontsize=14)
plt.ylabel('y', fontsize=14)
plt.xlim((0, 1))
plt.ylim((-2, 2))
plt.legend(loc='best', fontsize=14)
plt.title('Decision Tree', fontsize=14, weight='semibold')
plt.show()

In [None]:
# Side-by-side view of the observed and predicted values for the first 10 rows.
df.loc[:, ['y', 'y_hat']].head(10)

In [None]:
# Draw the structure of the fitted tree on its own figure, with shaded nodes.
fig, ax = plt.subplots()
tree.plot_tree(dtree, filled=True, ax=ax)
plt.show()

## Decision Tree model using Wine Quality data

In [None]:
# Load the wine-quality data; the first CSV column is used as the row index.
df_wine = pd.read_csv(r'../data/winequality.csv', index_col=0)

# Features: every float64 column. The target is dropped explicitly so it can
# never leak into the feature set if `quality` happens to be stored as a float
# (errors='ignore' makes this a no-op when it is not among the float columns).
num_cols = df_wine.columns[df_wine.dtypes == 'float64'].drop('quality', errors='ignore')

# Depth-limited regression tree; fixed random_state for reproducibility.
# NOTE: this rebinds `dtree`, replacing the single-feature tree fitted earlier.
dtree = tree.DecisionTreeRegressor(max_depth=5, random_state=314)

# fit() returns the estimator itself, so dtree_wine and dtree are the same object.
dtree_wine = dtree.fit(df_wine[num_cols], df_wine.quality)

# In-sample predictions (the model is evaluated on the rows it was trained on).
df_wine['preds'] = dtree_wine.predict(df_wine[num_cols])

Mean Squared Error (MSE) on the training data

In [None]:
# MSE between the observed quality and the tree's predictions for the same rows.
# mean_squared_error is imported in the notebook's import cell at the top.
mean_squared_error(df_wine.quality, df_wine.preds)

What happens if we build a model with the default settings...

In [None]:
# Same features and target as before, but the tree is built with default
# settings (only random_state is given).
dtree = tree.DecisionTreeRegressor(random_state=314)

X_wine = df_wine[num_cols]
y_wine = df_wine.quality

# fit() returns the estimator itself, so dtree_wine is the same object as dtree.
dtree_wine = dtree.fit(X_wine, y_wine)

# Overwrite the earlier predictions with those of the new model.
df_wine['preds'] = dtree_wine.predict(X_wine)

# MSE on the rows the model was trained on.
mean_squared_error(y_wine, df_wine['preds'])

In [None]:
# Observed quality next to the model's predictions for the first 20 rows.
df_wine.loc[:, ['quality', 'preds']].head(20)

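Compare this training MSE with the one from the depth-limited tree above: a tree grown with the default settings can fit the training data far more closely than it will fit new data. This is called over-fitting. A decision tree is built by a _greedy_ algorithm and is prone to over-fitting unless its growth is limited (e.g. with `max_depth` or `max_leaf_nodes`).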