In [143]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns

## Question 1

* Import the data and examine the features.
* We will be using all of them to predict `color` (white or red), but the colors feature will need to be integer encoded.


In [144]:
data = pd.read_csv("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML241EN-SkillsNetwork/labs/datasets/Wine_Quality_Data.csv", sep=',')

In [145]:
data.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


In [146]:
data['color'] = data.color.replace('white',0).replace('red',1).astype(np.int)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  data['color'] = data.color.replace('white',0).replace('red',1).astype(np.int)


## Question 2

* Use `StratifiedShuffleSplit` to split data into train and test sets that are stratified by wine quality. If possible, preserve the indices of the split for question 5 below.
* Check the percent composition of each quality level for both the train and test data sets.


In [147]:
from sklearn.model_selection import StratifiedShuffleSplit

In [148]:
X= data.drop("color", axis="columns")
y = data.color

In [149]:
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.2,
                             random_state=0)
train_index,test_index = next(sss.split(X, y))

In [150]:
X_train = X.loc[train_index]
X_test = X.loc[test_index]

y_train = y[train_index]
y_test = y[test_index]

percent composition of each quality level for both the train and test data sets.

In [151]:
y_train.value_counts(normalize=True) * 100

0    75.389648
1    24.610352
Name: color, dtype: float64

In [152]:
y_test.value_counts(normalize=True) * 100

0    75.384615
1    24.615385
Name: color, dtype: float64

## Question 3

* Fit a decision tree classifier with no set limits on maximum depth, features, or leaves.
* Determine how many nodes are present and what the depth of this (very large) tree is.
* Using this tree, measure the prediction error in the train and test data sets. What do you think is going on here based on the differences in prediction error?


In [153]:
from sklearn.tree import DecisionTreeClassifier

# create a regressor object
model = DecisionTreeClassifier()

# fit the regressor with X and Y data
model.fit(X_train, y_train)

In [154]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def measure_error(y_true, y_pred, label):
    return pd.Series({'accuracy':accuracy_score(y_true, y_pred),
                      'precision': precision_score(y_true, y_pred),
                      'recall': recall_score(y_true, y_pred),
                      'f1': f1_score(y_true, y_pred)},
                      name=label)

In [155]:
print(measure_error(y_test, model.predict(X_test), "Results"))

accuracy     0.983077
precision    0.971519
recall       0.959375
f1           0.965409
Name: Results, dtype: float64


In [156]:
model.tree_.node_count, model.tree_.max_depth

(141, 14)

## Question 4

* Using grid search with cross validation, find a decision tree that performs well on the test data set. Use a different variable name for this decision tree model than in question 3 so that both can be used in question 6.
* Determine the number of nodes and the depth of this tree.
* Measure the errors on the training and test sets as before and compare them to those from the tree in question 3.


In [157]:
from sklearn.model_selection import GridSearchCV

In [167]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Define the decision tree classifier
dt_classifier = DecisionTreeClassifier()

# Define the hyperparameter grid to search
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a grid search with cross-validation
grid_search = GridSearchCV(dt_classifier, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best decision tree classifier from the grid search
best_dt_classifier = grid_search.best_estimator_

# Evaluate the best model on the test data
test_accuracy = best_dt_classifier.score(X_test, y_test)

print(f"Best Decision Tree Classifier: {best_dt_classifier}")
print(f"Test Accuracy: {test_accuracy}")


ValueError: ignored

In [159]:
measure_error(y_test, model.predict(X_test),label="plain_decision_tree")

measure_error(y_test, best_dt_classifier.predict(X_test), label="grid_search_decision_tree")

decision_tree_results = pd.DataFrame({"grid_search_decision_tree":measure_error(y_test, best_dt_classifier.predict(X_test), label="grid_search_decision_tree"),"plain_decision_tree": measure_error(y_test, model.predict(X_test),label="plain_decision_tree")})

## Question 5

* Re-split the data into `X` and `y` parts, this time with `residual_sugar` being the predicted (`y`) data. *Note:* if the indices were preserved from the `StratifiedShuffleSplit` output in question 2, they can be used again to split the data.
* Using grid search with cross validation, find a decision tree **regression** model that performs well on the test data set.
* Measure the errors on the training and test sets using mean squared error.
* Make a plot of actual *vs* predicted residual sugar.


In [160]:
X= data.drop("residual_sugar", axis="columns")
y = data.residual_sugar

In [161]:
X_train = X.loc[train_index]
X_test = X.loc[test_index]

y_train = y[train_index]
y_test = y[test_index]

X_train

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color
1593,6.8,0.62,0.08,0.068,28.0,38.0,0.99651,3.42,0.82,9.50,6,1
5630,6.4,0.29,0.57,0.060,15.0,120.0,0.99240,3.06,0.41,9.50,5,0
2569,5.1,0.42,0.01,0.017,25.0,102.0,0.98940,3.38,0.36,12.30,7,0
6319,6.6,0.27,0.32,0.044,18.0,93.0,0.99044,3.11,0.56,12.25,5,0
798,9.4,0.50,0.34,0.082,5.0,14.0,0.99870,3.29,0.52,10.70,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2371,6.6,0.21,0.60,0.135,61.0,144.0,0.99270,3.12,0.39,9.30,7,0
2323,7.8,0.30,0.40,0.028,23.0,122.0,0.99140,3.14,0.39,10.90,7,0
1494,6.4,0.31,0.09,0.066,15.0,28.0,0.99459,3.42,0.70,10.00,7,1
2590,8.2,0.26,0.44,0.046,7.0,69.0,0.99440,3.14,0.62,10.20,4,0


In [166]:

# Define the decision tree classifier
dt_classifier = DecisionTreeClassifier()

# Define the hyperparameter grid to search
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a grid search with cross-validation
grid_search = GridSearchCV(dt_classifier, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best decision tree classifier from the grid search
best_dt_classifier = grid_search.best_estimator_

# Evaluate the best model on the test data
test_accuracy = best_dt_classifier.score(X_test, y_test)

print(f"Best Decision Tree Classifier: {best_dt_classifier}")
print(f"Test Accuracy: {test_accuracy}")


ValueError: ignored

In [None]:
sns.set_context('notebook')
sns.set_style('white')
fig = plt.figure(figsize=(6,6))
ax = plt.axes()

ph_test_predict = pd.DataFrame({'test':y_test.values,
                                'predict': y_test_pred_gr_sugar}).set_index('test').sort_index()

ph_test_predict.plot(marker='o', ls='', ax=ax)
ax.set(xlabel='Test', ylabel='Predict', xlim=(0,35), ylim=(0,35));
### END SOLUTION