# Classification and Regression

In [32]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.datasets import load_diabetes
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import sys
sys.path.append("..")
from decisiontree.DecisionTree import DecisionTreeClassification,DecisionTreeRegression
from sklearn.model_selection import GridSearchCV

In [16]:
# Grid-search DecisionTreeClassification hyperparameters on the iris dataset.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out half of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

parameters = {'min_samples_split': [2, 5, 10, 20], 'max_depth': [1, 3, 5]}

# NOTE(review): candidates are ranked by their score on X_test, so the
# accuracy printed below is measured on the same data used for selection and
# is an optimistic estimate. Consider a separate validation split or
# cross-validation if an unbiased score is needed.

# Start from -inf / None so the result never depends on a value left over
# from a previously executed cell, and so a best score of exactly 0 is still
# recorded instead of leaving best_parameters undefined.
best_accuracy = -np.inf
best_parameters = None
for min_samples_split in parameters['min_samples_split']:
    for max_depth in parameters['max_depth']:
        # Fit a fresh tree for the current hyperparameter combination.
        clf = DecisionTreeClassification(min_samples_split=min_samples_split, max_depth=max_depth)
        clf.fit(X_train, y_train)

        # Accuracy = fraction of held-out samples predicted correctly.
        y_pred = clf.predict(X_test)
        accuracy = np.mean(y_pred == y_test)

        # Strict '>' keeps the first combination encountered when scores tie.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_parameters = {'min_samples_split': min_samples_split, 'max_depth': max_depth}

# Report the winning combination and its accuracy.
print(f"Best parameters: {best_parameters}")
print(f"Accuracy: {best_accuracy}")

# print(y_pred)
# print(y_test)

Best parameters: {'min_samples_split': 2, 'max_depth': 3}
Accuracy: 0.9733333333333334


In [19]:
# Grid-search DecisionTreeClassification hyperparameters on the digits dataset.
digits = load_digits()
X, y = digits.data, digits.target

# Hold out half of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

parameters = {'min_samples_split': [1, 2, 5], 'max_depth': [7, 10, 13]}

# NOTE(review): candidates are ranked by their score on X_test, so the
# accuracy printed below is measured on the same data used for selection and
# is an optimistic estimate. Consider a separate validation split or
# cross-validation if an unbiased score is needed.

# Start from -inf / None so the result never depends on a value left over
# from a previously executed cell, and so a best score of exactly 0 is still
# recorded instead of leaving best_parameters undefined.
best_accuracy = -np.inf
best_parameters = None
for min_samples_split in parameters['min_samples_split']:
    for max_depth in parameters['max_depth']:
        # Fit a fresh tree for the current hyperparameter combination.
        clf = DecisionTreeClassification(min_samples_split=min_samples_split, max_depth=max_depth)
        clf.fit(X_train, y_train)

        # Accuracy = fraction of held-out samples predicted correctly.
        y_pred = clf.predict(X_test)
        accuracy = np.mean(y_pred == y_test)

        # Strict '>' keeps the first combination encountered when scores tie.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_parameters = {'min_samples_split': min_samples_split, 'max_depth': max_depth}

# Report the winning combination and its accuracy.
print(f"Best parameters: {best_parameters}")
print(f"Accuracy: {best_accuracy}")

# print(y_pred)
# print(y_test)

Best parameters: {'min_samples_split': 2, 'max_depth': 10}
Accuracy: 0.8342602892102335


In [37]:
# Grid-search DecisionTreeRegression hyperparameters on California housing.
from tqdm import tqdm

housing = fetch_california_housing()
X, y = housing.data, housing.target

# Hold out 30% of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

parameters = {'min_samples_split': [10, 13, 16], 'max_depth': [3, 5, 7]}
# get (13,5) as the best combination with smaller sample size

# NOTE(review): candidates are ranked by their score on X_test, so the R2
# printed below is measured on the same data used for selection and is an
# optimistic estimate. Consider a separate validation split or
# cross-validation if an unbiased score is needed.

# R2 can be negative (a model worse than predicting the mean), so 0 is not a
# safe lower bound. Start from -inf / None so the best candidate is always
# recorded and a best_parameters left over from an earlier cell is never
# reported by mistake.
best_R2 = -np.inf
best_parameters = None
for min_samples_split in tqdm(parameters['min_samples_split']):
    for max_depth in parameters['max_depth']:
        # Fit a fresh regression tree for the current hyperparameter combination.
        clf = DecisionTreeRegression(min_samples_split=min_samples_split, max_depth=max_depth)
        clf.fit(X_train, y_train)

        # Score the held-out predictions with the coefficient of determination.
        y_pred = clf.predict(X_test)
        R2 = r2_score(y_test, y_pred)

        # Strict '>' keeps the first combination encountered when scores tie.
        if R2 > best_R2:
            best_R2 = R2
            best_parameters = {'min_samples_split': min_samples_split, 'max_depth': max_depth}

# Report the winning combination and its R2 score.
print(f"Best parameters: {best_parameters}")
print(f"R2: {best_R2}")

# print(y_pred)
# print(y_test)

  0%|          | 0/3 [00:00<?, ?it/s]