<div align="center">
<h3> CS 178: Machine Learning & Data Mining </h3>
<h2> An Investigation of Classification Methods for Diabetes 130-US Hospitals </h2>
</div>

In [1]:
import numpy as np
import pandas as pd

from typing import List, Tuple
import matplotlib.pyplot as plt
import math

from sklearn.datasets import fetch_openml
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Single seed reused for NumPy's global RNG here and passed as `random_state` / `seed`
# to the split and the estimators in later cells.
seed=1234
np.random.seed(seed)

# Import the UCI repository client used to download the diabetes dataset
from ucimlrepo import fetch_ucirepo, list_available_datasets

# Fetch UCI dataset id 296 (the metadata printed below names it
# "Diabetes 130-US Hospitals for Years 1999-2008").
# NOTE: despite the `_df` suffix this is the repository result object, not a DataFrame;
# the tabular data lives under `.data.features` / `.data.targets`.
diabetes_df = fetch_ucirepo(id=296)

# Split the fetched data into the feature table and the target table
diabetes_X = diabetes_df.data.features
diabetes_y = diabetes_df.data.targets

# Dataset-level metadata (name, URLs, abstract, ...)
print(diabetes_df.metadata)

# Per-variable information
print(diabetes_df.variables)

{'uci_id': 296, 'name': 'Diabetes 130-US Hospitals for Years 1999-2008', 'repository_url': 'https://archive.ics.uci.edu/dataset/296/diabetes+130-us+hospitals+for+years+1999-2008', 'data_url': 'https://archive.ics.uci.edu/static/public/296/data.csv', 'abstract': 'The dataset represents ten years (1999-2008) of clinical care at 130 US hospitals and integrated delivery networks. Each row concerns hospital records of patients diagnosed with diabetes, who underwent laboratory, medications, and stayed up to 14 days. The goal is to determine the early readmission of the patient within 30 days of discharge.\nThe problem is important for the following reasons. Despite high-quality evidence showing improved clinical outcomes for diabetic patients who receive various preventive and therapeutic interventions, many patients do not receive them. This can be partially attributed to arbitrary diabetes management in hospital environments, which fail to attend to glycemic control. Failure to provide pro

  df = pd.read_csv(data_url)


We compare several classification models on the Diabetes 130-US Hospitals dataset.

In [2]:
# Per-column tally of missing (NaN) entries in the raw feature table
print(diabetes_X.isna().sum())

race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum                   0
A1Cresult                       0
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide                   0
glipizide                       0
glyburide     

We see that the `weight`, `payer_code`, and `medical_specialty` columns have a large number of null values, while the remaining features are mostly complete. We will drop these three columns and then drop any remaining rows that contain null values.

In [3]:
all_columns = np.asarray(diabetes_X.columns)
# Columns removed outright because most of their values are missing.
# diag_1, diag_2 and diag_3 are hard to represent numerically but are NOT dropped here;
# they stay in the frame and are label-encoded in the next cell.
rid_columns = ['weight', 'payer_code', 'medical_specialty']
# errors='ignore' mirrors the membership filter this replaces: a listed column that is
# absent from the frame is skipped rather than raising.
diabetes_X_drop = diabetes_X.drop(columns=rid_columns, errors='ignore')

We can convert any non-numeric columns to numeric values.

In [4]:
encoder = LabelEncoder()

# Diagnosis codes are categorical strings: give every distinct code an integer id.
# NOTE(review): diag_1..diag_3 contain NaNs (see the null counts printed earlier). Label
# encoding presumably assigns missing values an id of their own, so a later dropna()
# would no longer remove those rows -- confirm this is intended.
diags = ['diag_1', 'diag_2', 'diag_3']
for diag in diags:
    diabetes_X_drop[diag] = encoder.fit_transform(diabetes_X_drop[diag])

# The encoder is re-fitted per column, so ids are only meaningful within one column.
diabetes_X_drop['gender'] = encoder.fit_transform(diabetes_X_drop['gender'])

drugs = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone']

for drug in drugs:
    # Binary drug-use flag: 0 when the value is "no", 1 for every other value.
    # (Alternative not used here: label-encode the raw values with `encoder`.)
    # The comparison is case-insensitive on purpose: the sibling columns `change` and
    # `diabetesMed` below spell the negative value 'No', so an exact match against 'NO'
    # would flag every row as 1 and make all drug columns constant.
    diabetes_X_drop[drug] = (diabetes_X_drop[drug].astype(str).str.upper() != 'NO').astype(int)

# Convert age brackets to the lower bound of the bracket
age_mapping = {'[0-10)': 0, '[10-20)': 10, '[20-30)': 20, '[30-40)': 30, '[40-50)': 40, '[50-60)': 50, '[60-70)': 60, '[70-80)': 70, '[80-90)': 80, '[90-100)': 90}
diabetes_X_drop['age'] = diabetes_X_drop['age'].map(age_mapping)

# Convert change values ('No' -> 0, 'Ch' -> 1); any other value becomes NaN
diabetes_X_drop['change'] = diabetes_X_drop['change'].map({'No': 0, 'Ch': 1})

# Convert diabetesMed values ('No' -> 0, 'Yes' -> 1); any other value becomes NaN
diabetes_X_drop['diabetesMed'] = diabetes_X_drop['diabetesMed'].map({'No': 0, 'Yes': 1})

# One-hot encode the remaining categorical columns. The three calls are kept separate
# (rather than one call with three columns) so the resulting column order is unchanged.

# Create multiple columns for races indicating binary values for yes or no
diabetes_X_drop = pd.get_dummies(diabetes_X_drop, columns=['race'], prefix='race')

# Create multiple columns for A1Cresult
diabetes_X_drop = pd.get_dummies(diabetes_X_drop, columns=['A1Cresult'], prefix='A1Cresult')

# Create multiple columns for max_glu_serum
diabetes_X_drop = pd.get_dummies(diabetes_X_drop, columns=['max_glu_serum'], prefix='max_glu_serum')

In [5]:
# Handling of remaining NaN values (options: dropna(), replace(), or interpolate()):
# here every row that still contains at least one missing value is removed.
diabetes_clean_X = diabetes_X_drop.dropna()

# dropna() keeps the surviving rows' original index labels, so the targets must be
# selected by label (.loc). Positional .iloc is only equivalent while the index happens
# to be 0..n-1 and would silently pick wrong rows otherwise.
diabetes_clean_y = diabetes_y.loc[diabetes_clean_X.index]

In [6]:
# Dimensions of the cleaned feature and target tables
print(diabetes_clean_X.shape, diabetes_clean_y.shape, sep='\n')

# Every cleaned row is used for the experiments. To speed up experimentation, slice
# both tables here instead (e.g. the first 30,000 rows with .iloc[:30000]).
diabetes_sub_X, diabetes_sub_y = diabetes_clean_X, diabetes_clean_y

# Shuffled 75% / 25% train / test partition, reproducible through `seed`
diabetes_X_tr, diabetes_X_te, diabetes_y_tr, diabetes_y_te = train_test_split(
    diabetes_sub_X,
    diabetes_sub_y,
    test_size=0.25,
    shuffle=True,
    random_state=seed,
)

# Dimensions of the tables that were actually split
print(diabetes_sub_X.shape, diabetes_sub_y.shape, sep='\n')

(101766, 54)
(101766, 1)
(101766, 54)
(101766, 1)


### k-Nearest Neighbors (kNN)

Before Scaling:

In [None]:
# Train / test error of kNN on the raw (unscaled) features for several values of k.
figure, axes = plt.subplots(1, figsize=(6, 6))
k_values = [1, 5, 10, 20, 50, 100]

tr_error_rates = []
te_error_rates = []

# Plain arrays for scikit-learn; targets are flattened from a one-column table to 1-D
diabetes_X_tr_arr = diabetes_X_tr.values
diabetes_y_tr_arr = diabetes_y_tr.values.ravel()
diabetes_X_te_arr = diabetes_X_te.values
diabetes_y_te_arr = diabetes_y_te.values.ravel()

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(diabetes_X_tr_arr, diabetes_y_tr_arr)
    # error rate = 1 - accuracy
    tr_error_rates.append(1 - accuracy_score(diabetes_y_tr_arr, knn.predict(diabetes_X_tr_arr)))
    te_error_rates.append(1 - accuracy_score(diabetes_y_te_arr, knn.predict(diabetes_X_te_arr)))
axes.semilogx(k_values, tr_error_rates, label='Train')
axes.semilogx(k_values, te_error_rates, label='Test')
axes.set_xlabel('k')
axes.set_ylabel('Error Rate')
axes.set_title('Error Rates VS k')
axes.legend(fontsize=12)
# Label the log-scaled axis with the actual k values that were evaluated
axes.set_xticks(k_values)
axes.set_xticklabels(k_values)

# Render the figure. An argument-less plt.plot() adds nothing to the axes and only
# echoes an empty list as the cell output.
plt.show()

After Scaling:

In [None]:
# Train / test error of kNN on standardized features for several values of k.
figure, axes = plt.subplots(1, figsize=(6, 6))
# Defined locally so this cell does not depend on another cell having set k_values
k_values = [1, 5, 10, 20, 50, 100]

scaler = StandardScaler()

tr_error_rates = []
te_error_rates = []

# Learn the per-feature mean / std on the TRAINING split only and apply the same
# transform to the test split. Re-fitting the scaler on the test data would standardize
# the two splits with different statistics and leak test information into preprocessing.
diabetes_X_tr_arr = scaler.fit_transform(diabetes_X_tr.values)
diabetes_y_tr_arr = diabetes_y_tr.values.ravel()
diabetes_X_te_arr = scaler.transform(diabetes_X_te.values)
diabetes_y_te_arr = diabetes_y_te.values.ravel()

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(diabetes_X_tr_arr, diabetes_y_tr_arr)
    # error rate = 1 - accuracy
    tr_error_rates.append(1 - accuracy_score(diabetes_y_tr_arr, knn.predict(diabetes_X_tr_arr)))
    te_error_rates.append(1 - accuracy_score(diabetes_y_te_arr, knn.predict(diabetes_X_te_arr)))
axes.semilogx(k_values, tr_error_rates, label='Train')
axes.semilogx(k_values, te_error_rates, label='Test')
axes.set_xlabel('k')
axes.set_ylabel('Error Rate')
axes.set_title('Error Rates VS k')
axes.legend(fontsize=12)
axes.set_xticks(k_values)
axes.set_xticklabels(k_values)

# Render the figure (an argument-less plt.plot() would draw nothing)
plt.show()

### Gradient Descent

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Sigmoid function
def sigmoid(z):
    """Numerically stable logistic function 1 / (1 + exp(-z)).

    Parameters
    ----------
    z : scalar or array-like of real numbers.

    Returns
    -------
    A NumPy float scalar for scalar input, otherwise an ndarray with the shape of `z`.

    The naive formula evaluates exp(-z), which overflows for large negative z.
    Here only exp(-|z|) is evaluated, which lies in (0, 1] and cannot overflow:
    for z >= 0 the result is 1 / (1 + e^-z), for z < 0 it is e^z / (1 + e^z).
    """
    z = np.asarray(z, dtype=float)
    decay = np.exp(-np.abs(z))
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with an empty tuple turns a 0-d array into a scalar and leaves
    # higher-dimensional arrays unchanged.
    return result[()]

# Cost function
def compute_cost(X, y, theta):
    """Mean binary cross-entropy of a logistic model with weights `theta`.

    Parameters
    ----------
    X : 2-D array, one row per sample.
    y : 1-D array-like of 0/1 labels, one per row of `X`.
    theta : 1-D weight vector, one entry per column of `X`.

    Returns
    -------
    Scalar cost  -1/m * sum(y*log(s) + (1-y)*log(1-s))  with s = sigmoid(X @ theta).

    The cost is evaluated directly from the logits z = X @ theta through the identity
        -[y*log(s) + (1-y)*log(1-s)] = log(1 + e^z) - y*z
    so it stays finite even where the sigmoid would saturate to exactly 0 or 1
    (where the textbook form would evaluate log(0)).
    """
    m = len(y)
    y = np.asarray(y)
    z = np.dot(X, theta)
    # logaddexp(0, z) == log(e^0 + e^z) == log(1 + e^z), computed without overflow
    cost = np.sum(np.logaddexp(0, z) - y * z) / m
    return cost

# Gradient descent function
def gradient_descent(X, y, theta, learning_rate, num_iterations):
    """Batch gradient descent for binary logistic regression.

    Parameters
    ----------
    X : 2-D array, one row per sample.
    y : 1-D array of 0/1 labels.
    theta : initial 1-D weight vector; it is copied, the caller's array is not modified.
    learning_rate : step size applied to the mean gradient.
    num_iterations : number of full-batch updates.

    Returns
    -------
    (theta, cost_history): the final weights and an array holding the cost after
    each update.

    NOTE: a later cell of this notebook defines another function named
    `gradient_descent` with a different signature, which replaces this one once that
    cell has run.
    """
    m = len(y)
    # Work on a private float copy: the in-place update below would otherwise overwrite
    # the caller's array, and would fail outright on an integer-typed initial vector.
    theta = np.array(theta, dtype=float)
    cost_history = np.zeros(num_iterations)

    for i in range(num_iterations):
        h = sigmoid(np.dot(X, theta))
        # Gradient of the mean cross-entropy with respect to theta
        gradient = np.dot(X.T, (h - y)) / m
        theta -= learning_rate * gradient
        cost_history[i] = compute_cost(X, y, theta)

    return theta, cost_history

# One-vs-Rest (OvR) training for multiclass
def one_vs_rest(X, y, num_classes, learning_rate, num_iterations):
    """Fit one binary logistic model per class and stack the weight vectors.

    Class `c` (for c in 0..num_classes-1) is trained on the labels `y == c` versus
    everything else, starting from an all-zero weight vector.

    Returns an array of shape (num_classes, number of columns of X); row `c` holds the
    weights of the "class c vs. rest" model.
    """
    _, n_features = X.shape
    weights_per_class = np.zeros((num_classes, n_features))
    for cls in range(num_classes):
        # 1 where the sample belongs to `cls`, 0 otherwise
        is_cls = (y == cls).astype(int)
        fitted, _ = gradient_descent(X, is_cls, np.zeros(n_features), learning_rate, num_iterations)
        weights_per_class[cls, :] = fitted
    return weights_per_class

# Predict classes for multiclass
def predict_multiclass(X, all_theta):
    """Predict the class index with the highest one-vs-rest score for each row of X.

    Parameters
    ----------
    X : 2-D array, one row per sample.
    all_theta : 2-D array, one row of weights per class.

    Returns
    -------
    1-D integer array of class indices (row indices into `all_theta`).

    The sigmoid is strictly increasing, so the class with the largest probability is
    the class with the largest logit. Taking the argmax over the raw logits avoids
    spurious ties when several probabilities round to exactly 1.0 (or 0.0), which
    would otherwise always resolve to the lowest class index.
    """
    logits = np.dot(X, all_theta.T)
    return np.argmax(logits, axis=1)

# Calculate accuracy
def calculate_accuracy(y_true, y_pred):
    """Fraction of positions where `y_pred` equals `y_true`.

    Both arguments may be lists or arrays; they are flattened to 1-D before comparing.
    Comparing two plain Python lists with `==` would yield a single bool rather than
    an element-wise result, so both inputs are converted to arrays first.

    Raises
    ------
    ValueError if the two inputs do not contain the same number of labels.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.size} and {y_pred.size}"
        )
    return np.mean(y_true == y_pred)

In [None]:
# No penalty
# Encode the string class labels as integers 0..K-1. A single encoder is fitted on the
# training labels and reused for the test labels so both share one label -> id mapping.
# Labels and features are taken from the untouched split tables so this cell does not
# depend on arrays that other cells overwrite.
label_encoder = LabelEncoder()
y_1 = label_encoder.fit_transform(diabetes_y_tr.values.ravel())
y_test_1 = label_encoder.transform(diabetes_y_te.values.ravel())

# Standardize with statistics learned on the training split only, then prepend a
# column of ones that acts as the intercept term.
X_1 = scaler.fit_transform(diabetes_X_tr.values)
X_1 = np.hstack((np.ones((X_1.shape[0], 1)), X_1))
X_test_1 = scaler.transform(diabetes_X_te.values)
X_test_1 = np.hstack((np.ones((X_test_1.shape[0], 1)), X_test_1))

# One one-vs-rest model per distinct training label
num_classes = len(label_encoder.classes_)
learning_rates = [0.01, 0.02, 0.05, 0.1, 0.5]
num_iterations = 500

for rate in learning_rates:
    # Run gradient descent for every class
    all_theta_train = one_vs_rest(X_1, y_1, num_classes, rate, num_iterations)

    # Predict
    y_pred_train = predict_multiclass(X_1, all_theta_train)
    y_pred_test = predict_multiclass(X_test_1, all_theta_train)

    # Calculate accuracy
    train_accuracy = calculate_accuracy(y_1, y_pred_train)
    print(f"Train accuracy w/ learning rate {rate}:", train_accuracy)
    test_accuracy = calculate_accuracy(y_test_1, y_pred_test)
    print(f"Test accuracy w/ learning rate {rate}:", test_accuracy)

### Logistic Regression

In [None]:
# Training accuracy of scikit-learn's LogisticRegression under different solvers.

# liblinear solver with an L1 penalty
liblinear_classifier = LogisticRegression(penalty='l1', solver='liblinear', fit_intercept=True).fit(X_1, y_1)
train_predictions = liblinear_classifier.predict(X_1)
train_accuracy = (train_predictions == y_1).mean()
print('\nTraining accuracy liblinear:', format(100 * train_accuracy, '.2f'))

# lbfgs solver with an L2 penalty
# (author's note: the remaining classifiers were observed to reach only about 60% accuracy)
lbfgs_classifier = LogisticRegression(penalty='l2', solver='lbfgs', fit_intercept=True).fit(X_1, y_1)
train_predictions = lbfgs_classifier.predict(X_1)
train_accuracy = (train_predictions == y_1).mean()
print('\nTraining accuracy lbfgs:', format(100 * train_accuracy, '.2f'))

penalties = ['l2', None]
solvers = ['lbfgs', 'saga', 'sag', 'newton-cg']

# Every solver is tried with and without the L2 penalty (solver-major order)
for solver, penalty in ((s, p) for s in solvers for p in penalties):
    classifier = LogisticRegression(penalty=penalty, solver=solver, fit_intercept=True, max_iter=1000)
    classifier.fit(X_1, y_1)

    train_predictions = classifier.predict(X_1)
    train_accuracy = (train_predictions == y_1).mean()

    print('\nTraining accuracy with penalty {} and solver {}: {:.2f}%'.format(penalty, solver, 100 * train_accuracy))

### Feedforward Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
def scale_data(X_tr: np.array, X_te: np.array) -> tuple[np.array, np.array]:
    """Standardize the training and test features with training-set statistics.

    A StandardScaler is fitted on `X_tr` only and then applied to both `X_tr` and
    `X_te`, so the test data never influences the scaling parameters.

    Returns the pair (scaled training features, scaled test features).
    """
    fitted_scaler = StandardScaler().fit(X_tr)
    return fitted_scaler.transform(X_tr), fitted_scaler.transform(X_te)

# Standardized copies of the train / test feature matrices, taken from the untouched
# split tables; these are the inputs to the MLP, logistic-regression and decision-tree cells.
X_tr_scaled, X_te_scaled = scale_data(diabetes_X_tr.values, diabetes_X_te.values)

In [None]:
def errors_for_train_sizes_mlp(X_tr: np.ndarray, y_tr: np.ndarray, X_te: np.ndarray, y_te: np.ndarray, seed: int, train_sizes: list[int]) -> tuple[list, list]:
    """Train / test error of an MLP as a function of the number of training samples.

    For every `size` in `train_sizes`, an MLPClassifier (two hidden layers of 64 ReLU
    units, SGD solver, initial learning rate 0.001, batch size 256, at most 1000
    iterations) is fitted on the first `size` rows of the training data.

    Parameters
    ----------
    X_tr, y_tr : training features and labels; only the leading `size` rows are used.
    X_te, y_te : test features and labels; always used in full.
    seed : random state passed to the classifier.
    train_sizes : training-subset sizes to evaluate.

    Returns
    -------
    (tr_err_mlp, te_err_mlp): two lists with one error rate (1 - accuracy) per entry of
    `train_sizes`; the training error is measured on the subset the model was fitted on.
    """
    # append error rates to the following lists
    tr_err_mlp = [] # training error rates for MLP
    te_err_mlp = [] # testing error rates for MLP

    for size in train_sizes:
        # Slicing past the end is harmless: a size larger than the data uses all rows
        x_tr_sub = X_tr[:size]
        y_tr_sub = y_tr[:size]
        clf = MLPClassifier(hidden_layer_sizes=[64, 64], activation='relu', solver='sgd', learning_rate_init=0.001, batch_size=256, max_iter=1000, random_state=seed).fit(x_tr_sub, y_tr_sub)
        tr_err_mlp.append(1 - clf.score(x_tr_sub, y_tr_sub))
        te_err_mlp.append(1 - clf.score(X_te, y_te))

    return tr_err_mlp, te_err_mlp # DO NOT CHANGE THIS LINE

def errors_for_train_sizes_lr(X_tr: np.ndarray, y_tr: np.ndarray, X_te: np.ndarray, y_te: np.ndarray, seed: int, train_sizes: list[int]) -> tuple[list, list]:
    """Train / test error of logistic regression as a function of training-set size.

    For every `size` in `train_sizes`, a LogisticRegression with default settings
    (apart from `random_state=seed`) is fitted on the first `size` rows of the
    training data.

    Parameters
    ----------
    X_tr, y_tr : training features and labels; only the leading `size` rows are used.
    X_te, y_te : test features and labels; always used in full.
    seed : random state passed to the classifier.
    train_sizes : training-subset sizes to evaluate.

    Returns
    -------
    (tr_err_lr, te_err_lr): two lists with one error rate (1 - accuracy) per entry of
    `train_sizes`; the training error is measured on the subset the model was fitted on.
    """
    # append error rates to the following lists
    tr_err_lr = [] # training error rates for Logistic Regression
    te_err_lr = [] # testing error rates for Logistic Regression

    for size in train_sizes:
        # Slicing past the end is harmless: a size larger than the data uses all rows
        x_tr_sub = X_tr[:size]
        y_tr_sub = y_tr[:size]
        clf = LogisticRegression(random_state=seed).fit(x_tr_sub, y_tr_sub)
        tr_err_lr.append(1 - clf.score(x_tr_sub, y_tr_sub))
        te_err_lr.append(1 - clf.score(X_te, y_te))

    return tr_err_lr, te_err_lr # DO NOT CHANGE THIS LINE

In [None]:
def plot_errors_for_train_sizes_mlp_lr(tr_err_mlp: list, te_err_mlp: list, tr_err_lr: list, te_err_lr: list, train_sizes: list[int]) -> None:
    """Plot MLP and logistic-regression learning curves on a log-scaled x axis.

    Four curves (training and testing error for each model) are drawn against
    `train_sizes` on the current pyplot axes, with axis labels and a legend.
    """
    labelled_curves = (
        (tr_err_mlp, "MLP Training"),
        (te_err_mlp, "MLP Testing"),
        (tr_err_lr, "LR Training"),
        (te_err_lr, "LR Testing"),
    )
    # semilogx: the training sizes span several orders of magnitude
    for error_rates, curve_label in labelled_curves:
        plt.semilogx(train_sizes, error_rates, label=curve_label)

    plt.xlabel('Num. Training Data Points')
    plt.ylabel('Error Rate')
    plt.legend()

In [None]:
# Training-subset sizes for the learning curves; each model is fitted on the first
# `size` rows of the standardized training data.
train_sizes = [50, 500, 2000, 5000, 10000, 15000, 25000]
# Error rates of the MLP and of logistic regression for every subset size
tr_err_mlp, te_err_mlp = errors_for_train_sizes_mlp(X_tr_scaled, diabetes_y_tr_arr, X_te_scaled, diabetes_y_te_arr, seed, train_sizes)
tr_err_lr, te_err_lr = errors_for_train_sizes_lr(X_tr_scaled, diabetes_y_tr_arr, X_te_scaled, diabetes_y_te_arr, seed, train_sizes)
# All four curves in one figure
plot_errors_for_train_sizes_mlp_lr(tr_err_mlp, te_err_mlp, tr_err_lr, te_err_lr, train_sizes)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [None]:
# Accuracy of a decision tree as its maximum depth grows from 1 to 15
depths = range(1, 16)
tr_scores, te_scores = [], []

for depth in depths:
    clf = DecisionTreeClassifier(random_state=seed, max_depth=depth).fit(X_tr_scaled, diabetes_y_tr_arr)
    tr_scores.append(clf.score(X_tr_scaled, diabetes_y_tr_arr))
    te_scores.append(clf.score(X_te_scaled, diabetes_y_te_arr))

plt.figure(figsize=(10, 6))
plt.plot(depths, tr_scores, label="Training Accuracy")
plt.plot(depths, te_scores, label="Testing Accuracy")
# One tick per evaluated depth
plt.xticks(depths)
plt.xlabel("Max Depth")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

In [None]:
# Accuracy of a decision tree as the minimum number of samples per leaf grows from 1 to 15
leaves = range(1, 16)
tr_scores, te_scores = [], []

for leaf in leaves:
    clf = DecisionTreeClassifier(random_state=seed, min_samples_leaf=leaf).fit(X_tr_scaled, diabetes_y_tr_arr)
    tr_scores.append(clf.score(X_tr_scaled, diabetes_y_tr_arr))
    te_scores.append(clf.score(X_te_scaled, diabetes_y_te_arr))

plt.figure(figsize=(10, 6))
plt.plot(leaves, tr_scores, label="Training Accuracy")
plt.plot(leaves, te_scores, label="Testing Accuracy")
# One tick per evaluated leaf size
plt.xticks(leaves)
plt.xlabel("Min Leaf Samples")
plt.ylabel("Accuracy")
plt.legend()
plt.show()


In [None]:
# Grid over (min_samples_leaf, max_depth); the five best combinations are printed.
# NOTE(review): the ranking key is the TEST accuracy, so the reported best settings are
# selected on the test set -- consider a validation split for model selection.
leaves = range(9, 17)
depths = range(2, 9)

scores = []

for leaf in leaves:
    for depth in depths:
        clf = DecisionTreeClassifier(random_state=seed, max_depth=depth, min_samples_leaf=leaf)
        clf.fit(X_tr_scaled, diabetes_y_tr_arr)

        te_score = clf.score(X_te_scaled, diabetes_y_te_arr)
        tr_score = clf.score(X_tr_scaled, diabetes_y_tr_arr)
        leaf_depth = [leaf, depth]
        # Entry layout: [test accuracy, train accuracy, [leaf, depth]]
        scores.append([te_score, tr_score, leaf_depth])

# Entries compare element by element, so this orders by test accuracy first
scores.sort(reverse=True)
scores = scores[:5]
print(scores)

### Binary Class (Readmitted or not)

In [None]:
# Change training and testing class labels to binary classifications:
# 0 for 'NO', 1 for every other label.
# The labels are derived from the untouched split targets rather than from the
# previous contents of the *_arr variables, so re-running this cell gives the same
# result (re-binarizing already-binary labels would turn every label into 1).
diabetes_y_tr_arr = (diabetes_y_tr.values.ravel() != 'NO').astype(int)
diabetes_y_te_arr = (diabetes_y_te.values.ravel() != 'NO').astype(int)

### kNN (Binary Class)

In [None]:
# Train / test error of kNN on the raw (unscaled) features with the binary labels.
figure, axes = plt.subplots(1, figsize=(6, 6))
k_values = [1, 5, 10, 20, 50, 100]

tr_error_rates = []
te_error_rates = []

# Reset the feature arrays to the raw values (other cells store scaled data under
# the same names)
diabetes_X_tr_arr = diabetes_X_tr.values
diabetes_X_te_arr = diabetes_X_te.values

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(diabetes_X_tr_arr, diabetes_y_tr_arr)
    # error rate = 1 - accuracy
    tr_error_rates.append(1 - accuracy_score(diabetes_y_tr_arr, knn.predict(diabetes_X_tr_arr)))
    te_error_rates.append(1 - accuracy_score(diabetes_y_te_arr, knn.predict(diabetes_X_te_arr)))
axes.semilogx(k_values, tr_error_rates, label='Train')
axes.semilogx(k_values, te_error_rates, label='Test')
axes.set_xlabel('k')
axes.set_ylabel('Error Rate')
axes.set_title('Error Rates VS k')
axes.legend(fontsize=12)
axes.set_xticks(k_values)
axes.set_xticklabels(k_values)

# Render the figure. An argument-less plt.plot() adds nothing to the axes and only
# echoes an empty list as the cell output.
plt.show()

Scaled

In [None]:
# Train / test error of kNN on standardized features with the binary labels.
figure, axes = plt.subplots(1, figsize=(6, 6))
k_values = [1, 5, 10, 20, 50, 100]

tr_error_rates = []
te_error_rates = []

# Fit the scaler on the training split only and reuse its statistics for the test
# split. Re-fitting on the test data would scale the two splits differently and leak
# test information into preprocessing.
diabetes_X_tr_arr_sc = scaler.fit_transform(diabetes_X_tr.values)
diabetes_X_te_arr_sc = scaler.transform(diabetes_X_te.values)

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(diabetes_X_tr_arr_sc, diabetes_y_tr_arr)
    # error rate = 1 - accuracy
    tr_error_rates.append(1 - accuracy_score(diabetes_y_tr_arr, knn.predict(diabetes_X_tr_arr_sc)))
    te_error_rates.append(1 - accuracy_score(diabetes_y_te_arr, knn.predict(diabetes_X_te_arr_sc)))
axes.semilogx(k_values, tr_error_rates, label='Train')
axes.semilogx(k_values, te_error_rates, label='Test')
axes.set_xlabel('k')
axes.set_ylabel('Error Rate')
axes.set_title('Error Rates VS k')
axes.legend(fontsize=12)
axes.set_xticks(k_values)
axes.set_xticklabels(k_values)

# Render the figure (an argument-less plt.plot() would draw nothing)
plt.show()

### Gradient Descent (Binary Class)

In [None]:
# Define the loss function
def compute_loss(y, y_pred):
    """Mean binary cross-entropy between 0/1 labels and predicted probabilities.

    Parameters
    ----------
    y : array-like of 0/1 labels.
    y_pred : array-like of predicted probabilities, same length as `y`.

    Returns
    -------
    Scalar loss  -(1/m) * sum(y*log(p) + (1-y)*log(1-p)).

    Predictions are clipped into [eps, 1 - eps] before taking logarithms: a probability
    of exactly 0 or 1 would otherwise produce log(0) = -inf and, multiplied by a zero
    label term, nan.
    """
    y = np.array(y)
    m = len(y)
    eps = 1e-15
    y_pred = np.clip(y_pred, eps, 1 - eps)
    loss = - (1/m) * np.sum(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))
    return loss

# Define the gradient descent function
def gradient_descent(X, y, learning_rate=0.01, epochs=1000):
    """Batch gradient descent for binary logistic regression with a separate bias.

    Starting from zero weights and zero bias, each epoch computes the predicted
    probabilities, records the cross-entropy loss of those predictions, and then takes
    one gradient step. The loss is printed every 100th epoch (epoch 0 included).

    Returns (weights, bias, losses), where `losses` holds one value per epoch measured
    before that epoch's update.

    NOTE: this definition reuses the name of the earlier `gradient_descent` helper
    (which takes an explicit `theta` and returns two values) and replaces it once this
    cell has run.
    """
    m, n = X.shape
    weights, bias = np.zeros(n), 0
    losses = []

    for epoch in range(epochs):
        # Forward pass: probabilities under the current parameters
        y_pred = sigmoid(np.dot(X, weights) + bias)

        loss = compute_loss(y, y_pred)
        losses.append(loss)

        # Gradients of the mean loss with respect to the weights and the bias
        residual = y_pred - y
        dw = (1/m) * np.dot(X.T, residual)
        db = (1/m) * np.sum(residual)

        weights -= learning_rate * dw
        bias -= learning_rate * db

        if not epoch % 100:
            print(f'Epoch {epoch}, Loss: {loss}')

    return weights, bias, losses

In [None]:
# Binary logistic regression by gradient descent for several learning rates.
# X_1 / X_test_1 are the standardized design matrices built in the multiclass
# gradient-descent section.
learning_rates = [0.01, 0.02, 0.05, 0.1, 0.5]
num_iterations = 500

for rate in learning_rates:
    weights, bias, losses = gradient_descent(X_1, diabetes_y_tr_arr, rate, num_iterations)

    # Scores and probabilities on both splits
    linear_model_tr = np.dot(X_1, weights) + bias
    linear_model_te = np.dot(X_test_1, weights) + bias
    y_pred_tr = sigmoid(linear_model_tr)
    y_pred_te = sigmoid(linear_model_te)

    # Rounding the probability gives the hard 0/1 prediction
    tr_predictions = np.round(y_pred_tr)
    te_predictions = np.round(y_pred_te)

    tr_accuracy = calculate_accuracy(diabetes_y_tr_arr, tr_predictions)
    te_accuracy = calculate_accuracy(diabetes_y_te_arr, te_predictions)
    print(f"Train accuracy w/ learning rate {rate}:", tr_accuracy)
    print(f"Test accuracy w/ learning rate {rate}:", te_accuracy)
    

### Decision Tree (Binary)

In [None]:
# Binary labels: accuracy of a decision tree as its maximum depth grows from 1 to 15
depths = range(1, 16)
tr_scores, te_scores = [], []

for depth in depths:
    clf = DecisionTreeClassifier(random_state=seed, max_depth=depth).fit(X_tr_scaled, diabetes_y_tr_arr)
    tr_scores.append(clf.score(X_tr_scaled, diabetes_y_tr_arr))
    te_scores.append(clf.score(X_te_scaled, diabetes_y_te_arr))

plt.figure(figsize=(10, 6))
plt.plot(depths, tr_scores, label="Training Accuracy")
plt.plot(depths, te_scores, label="Testing Accuracy")
# One tick per evaluated depth
plt.xticks(depths)
plt.xlabel("Max Depth")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

In [None]:
# Binary labels: accuracy of a decision tree as the minimum leaf size grows from 1 to 15
leaves = range(1, 16)
tr_scores, te_scores = [], []

for leaf in leaves:
    clf = DecisionTreeClassifier(random_state=seed, min_samples_leaf=leaf).fit(X_tr_scaled, diabetes_y_tr_arr)
    tr_scores.append(clf.score(X_tr_scaled, diabetes_y_tr_arr))
    te_scores.append(clf.score(X_te_scaled, diabetes_y_te_arr))

plt.figure(figsize=(10, 6))
plt.plot(leaves, tr_scores, label="Training Accuracy")
plt.plot(leaves, te_scores, label="Testing Accuracy")
# One tick per evaluated leaf size
plt.xticks(leaves)
plt.xlabel("Min Leaf Samples")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

In [None]:
# Binary labels: grid over (min_samples_leaf, max_depth); the five best are printed.
# NOTE(review): the ranking key is the TEST accuracy, so the reported best settings are
# selected on the test set -- consider a validation split for model selection.
leaves = range(9, 20)
depths = range(2, 20)

scores = []

for leaf in leaves:
    for depth in depths:
        clf = DecisionTreeClassifier(random_state=seed, max_depth=depth, min_samples_leaf=leaf)
        clf.fit(X_tr_scaled, diabetes_y_tr_arr)

        te_score = clf.score(X_te_scaled, diabetes_y_te_arr)
        tr_score = clf.score(X_tr_scaled, diabetes_y_tr_arr)
        leaf_depth = [leaf, depth]
        # Entry layout: [test accuracy, train accuracy, [leaf, depth]]
        scores.append([te_score, tr_score, leaf_depth])

# Entries compare element by element, so this orders by test accuracy first
scores.sort(reverse=True)
scores = scores[:5]
print(scores)