### 7.1) Import modules

In [None]:
from sklearn.datasets import load_diabetes, load_breast_cancer
from sklearn.model_selection import train_test_split, KFold, learning_curve
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
import pickle

### 7.2) Load and split Diabetes dataset for regression

In [None]:
# Fetch the diabetes regression data as a (features, target) pair.
X, y = load_diabetes(return_X_y=True)

# Keep 80% of the samples for training and hold out 20% for testing;
# the fixed random_state makes the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

### 7.3) Inspect data statistics

In [None]:
# Summarise the regression target and report the size of the hold-out set.
target_summary = (
    ('Target Mean: ', y.mean()),
    ('Target SD: ', y.std()),
    ('Counts: ', len(y)),
    ('Test data count: ', len(y_test)),
)

for label, value in target_summary:
    print(label, value)

### 7.4) Define mean absolute percentage error function

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Args:
        y_true: Array-like of observed target values (list, tuple or
            numpy array).
        y_pred: Array-like of predicted values, broadcastable against
            ``y_true``.

    Returns:
        The mean of ``|y_true - y_pred| / |y_true|`` multiplied by 100.

    Note:
        Every error is divided by the matching ``y_true`` entry, so a zero
        target yields ``inf`` or ``nan`` in the result.
    """
    # Coerce to float arrays so plain Python sequences are accepted too.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mape = np.abs((y_true - y_pred) / y_true).mean() * 100

    return mape

### 7.5) Load and evaluate regression model

In [None]:
# Load the previously pickled regression tree.
# NOTE: unpickling can execute arbitrary code, so only load model files
# that come from a trusted source.
# The context manager closes the file handle as soon as the model is read.
with open('Desicion_tree_regression_for_diabetes_dataset.p', 'rb') as model_file:
    dt_reg = pickle.load(model_file)

# Predict on the hold-out split.
# NOTE(review): presumably the model was trained on the matching training
# split; confirm, otherwise these scores are not a fair estimate.
y_pred = dt_reg.predict(X_test)

# Evaluation metrics
dt_reg_mse = metrics.mean_squared_error(y_test, y_pred)
dt_reg_mae = metrics.mean_absolute_error(y_test, y_pred)
dt_reg_mdae = metrics.median_absolute_error(y_test, y_pred)
dt_reg_mape = mean_absolute_percentage_error(y_test, y_pred)
dt_reg_r2 = metrics.r2_score(y_test, y_pred)

print('Mean Square Error: ', dt_reg_mse)
print('Mean Absolute Error: ', dt_reg_mae)
print('Median Absolute Error: ', dt_reg_mdae)
print('Mean Absolute Percentage Error: ', dt_reg_mape)
print('Coefficient of Determination (R^2): ', dt_reg_r2)

### 7.6) Perform k-fold cross-validation

In [None]:
# Manual 5-fold cross-validation of a depth-3 regression tree.
# Shuffling is disabled, so the folds follow the original sample order.
# Note: the loop rebinds X_train/X_test/y_train/y_test, dt_reg and y_pred.
k_fold = KFold(n_splits=5, shuffle=False)

mse_list = []

for train_indices, test_indices in k_fold.split(X):

    # Slice out the features and targets belonging to the current fold.
    X_train, y_train = X[train_indices], y[train_indices]
    X_test, y_test = X[test_indices], y[test_indices]

    # Fit a fresh tree on this fold's training part.
    dt_reg = DecisionTreeRegressor(max_depth=3, random_state=0).fit(X_train, y_train)

    # Score the tree on the part it has not seen.
    y_pred = dt_reg.predict(X_test)
    dt_reg_mse = metrics.mean_squared_error(y_test, y_pred)

    mse_list.append(dt_reg_mse)
    print('Mean Square Error: ', dt_reg_mse)

# Average the per-fold errors into a single cross-validated score.
print('Overall MSE from 5-fold cross-validation: ', sum(mse_list) / len(mse_list))

### 7.7) Load and split Wisconsin breast cancer classification dataset

In [None]:
# Fetch the breast cancer classification data as a (features, labels) pair.
X, y = load_breast_cancer(return_X_y=True)

# Keep 80% of the samples for training and hold out 20% for testing;
# the fixed random_state makes the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

### 7.8) Load and evaluate classification model

In [None]:
# Load the previously pickled classification tree.
# NOTE: unpickling can execute arbitrary code, so only load model files
# that come from a trusted source.
# The context manager closes the file handle as soon as the model is read.
with open('Desicion_tree_classification_for_breast_cancer_dataset.p', 'rb') as model_file:
    dt_clf = pickle.load(model_file)

# Predict class labels for the hold-out split.
y_pred = dt_clf.predict(X_test)

# Classification metrics computed from true vs. predicted labels.
dt_clf_accuracy = metrics.accuracy_score(y_test, y_pred)
dt_clf_precision = metrics.precision_score(y_test, y_pred)
dt_clf_recall = metrics.recall_score(y_test, y_pred)
dt_clf_f1 = metrics.f1_score(y_test, y_pred)

print('Accuracy: ', dt_clf_accuracy)
print('Precision: ', dt_clf_precision)
print('Recall: ', dt_clf_recall)
print('F1 score: ', dt_clf_f1)

### 7.9) Analyse confusion matrix of classification model

In [None]:
# Tabulate true labels against predicted labels for the hold-out split.
dt_clf_confusion_matrix = metrics.confusion_matrix(
    y_true=y_test,
    y_pred=y_pred,
)

print(dt_clf_confusion_matrix)

### 7.10) Plot confusion matrix

In [None]:
# Draw the confusion matrix as a grey-scale image with the count in each cell.
plt.figure(figsize=(5, 4))
plt.imshow(dt_clf_confusion_matrix, cmap='gray', vmin=0)
plt.ylabel('True Class')
plt.xlabel('Predicted Class')
plt.xticks([0,1])
plt.yticks([0,1])
plt.colorbar(label='Instances')

n_rows, n_cols = dt_clf_confusion_matrix.shape

for row in range(n_rows):

    for col in range(n_cols):

        count = dt_clf_confusion_matrix[row, col]

        # imshow draws matrix[row, col] at x=col, y=row, so the label must be
        # placed at (col, row); using (row, col) would swap the two
        # off-diagonal counts.
        # Black text on the bright (high-count) cells, white on the dark ones.
        plt.text(col, row, str(count),
                 color='k' if count > 35 else 'w',
                 fontsize=16, ha='center', va='center')

plt.show()

### 7.11) Plot learning curve

In [None]:
# Score the classifier on ten training-set sizes (10% to 100%) with 5-fold CV.
train_sizes, train_scores, test_scores = learning_curve(
    dt_clf,
    X,
    y,
    cv=5,
    train_sizes=np.linspace(0.1, 1.0, 10),
)

plt.figure(figsize=(7,5))

# One entry per curve: (scores per size and fold, line colour, legend label).
curves = (
    (train_scores, 'r', 'Train'),
    (test_scores, 'g', 'Test'),
)

for scores, colour, legend_label in curves:

    # Aggregate across the folds (axis 1) for each training-set size.
    fold_mean = scores.mean(axis=1)
    fold_std = scores.std(axis=1)

    plt.plot(train_sizes, fold_mean, color=colour, label=legend_label)

    # Shaded band of one standard deviation either side of the mean.
    plt.fill_between(train_sizes, fold_mean - fold_std, fold_mean + fold_std,
                     color=colour, alpha=0.3)

plt.title('Learning Curve')
plt.xlabel('Training Samples')
plt.ylabel('Accuracy Score')
plt.legend(loc='lower right')

plt.show()