In [2]:
import itertools
import numpy as np

class GridSearchCV:
    """Exhaustive hyper-parameter search scored by shuffled k-fold cross-validation.

    Every combination in the Cartesian product of the ``param_grid`` values is
    used to build an estimator via ``estimator(**params)``. Each candidate is
    scored with the negative mean squared error averaged over ``cv`` folds and
    the combination with the highest mean score is refit on the full data.

    Parameters
    ----------
    estimator : callable
        Called as ``estimator(**params)``; the returned object must provide
        ``fit(data, labels)`` and ``predict(data)``.
    param_grid : dict
        Maps constructor keyword names to sequences of candidate values.
    cv : int, default 5
        Number of cross-validation folds.

    Attributes
    ----------
    best_params : dict
        Set by ``fit``: the combination with the highest mean score.
    best_estimator : object
        Set by ``fit``: an estimator built from ``best_params`` and fitted on
        all of the data passed to ``fit``.
    """

    def __init__(self, estimator, param_grid, cv=5):
        self.estimator = estimator
        self.param_grid = param_grid
        self.cv = cv

    def fit(self, data, labels):
        """Score every parameter combination and refit the best one.

        Parameters
        ----------
        data, labels : numpy.ndarray
            Samples and targets; both are indexed with integer index arrays.
        """
        combinations = list(itertools.product(*self.param_grid.values()))
        results = []
        for combination in combinations:
            params = dict(zip(self.param_grid.keys(), combination))
            estimator = self.estimator(**params)
            scores = self._cross_val_score(estimator, data, labels)
            results.append((scores.mean(), params))

        # Scores are negated errors, so the largest mean is the best candidate.
        self.best_params = max(results, key=lambda x: x[0])[1]
        self.best_estimator = self.estimator(**self.best_params)
        self.best_estimator.fit(data, labels)

    def _cross_val_score(self, estimator, data, labels):
        """Return an array with one negative-MSE score per fold.

        The sample order is reshuffled with ``np.random.shuffle`` on every
        call, so each candidate is evaluated on its own random partition.
        The same ``estimator`` instance is refit for every fold.
        """
        n_samples = len(data)
        indices = np.arange(n_samples)
        np.random.shuffle(indices)
        # ``np.int`` was removed from NumPy; the builtin ``int`` is the
        # equivalent dtype and works on every NumPy version.
        fold_sizes = np.full(self.cv, n_samples // self.cv, dtype=int)
        # Spread the remainder over the first folds so every sample is
        # validated exactly once.
        fold_sizes[:n_samples % self.cv] += 1
        current = 0
        scores = []
        for fold_size in fold_sizes:
            start, stop = current, current + fold_size
            test_indices = indices[start:stop]
            train_indices = np.concatenate((indices[:start], indices[stop:]))
            train_data, train_labels = data[train_indices], labels[train_indices]
            test_data, test_labels = data[test_indices], labels[test_indices]
            estimator.fit(train_data, train_labels)
            predictions = estimator.predict(test_data)
            score = self._score(predictions, test_labels)
            scores.append(score)
            current = stop
        return np.array(scores)

    def _score(self, predictions, labels):
        """Return the negative mean squared error (higher is better)."""
        return -np.mean((predictions - labels) ** 2)

# Example usage: grid-search a regressor on a 10-sample toy dataset.
# NOTE(review): DecisionTreeRegressor is neither defined nor imported in this
# cell; the recorded run of this cell failed with a NameError on that name, so
# an earlier cell has to bring it into scope before this one can run.
data = np.array([[0, 1], [1, 3], [2, 2], [3, 5], [4, 4], [5, 6], [6, 7], [7, 8], [8, 9], [9, 10]])
labels = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
# Every key is forwarded to the estimator's constructor as a keyword argument,
# so the estimator must accept all four names. The grid has 5 * 4 * 3 * 2 = 120
# combinations.
param_grid = {'max_depth': [None, 1, 2, 3, 4], 'min_samples_split': [2, 3, 4, 5], 'min_samples_leaf': [1, 2, 3], 'standard_deviation_reduction': [True, False]}
# cv is left at its default of 5, which gives 2 validation samples per fold here.
grid_search = GridSearchCV(DecisionTreeRegressor, param_grid)
grid_search.fit(data, labels)
print(grid_search.best_params)
# Predictions of the refit best estimator on the same data it was trained on.
print(grid_search.best_estimator.predict(data))


NameError: name 'DecisionTreeRegressor' is not defined

# 2nd 

In [7]:


class GridSearchCV:
    """Exhaustive hyper-parameter search scored by contiguous k-fold cross-validation.

    Every combination in the Cartesian product of the ``param_grid`` values is
    scored with the negative mean squared error averaged over ``cv`` folds.
    Folds are contiguous, unshuffled slices of the data. The combination with
    the highest mean score is refit on the full data.

    Parameters
    ----------
    estimator : callable
        Called as ``estimator(**params)``; the returned object must provide
        ``fit(data, labels)`` and ``predict(data)``.
    param_grid : dict
        Maps constructor keyword names to sequences of candidate values.
    cv : int, default 5
        Number of cross-validation folds.

    Attributes
    ----------
    best_params : dict
        Set by ``fit``: the combination with the highest mean score.
    best_estimator : object
        Set by ``fit``: an estimator built from ``best_params`` and fitted on
        all of the data passed to ``fit``.
    """

    def __init__(self, estimator, param_grid, cv=5):
        self.estimator = estimator
        self.param_grid = param_grid
        self.cv = cv

    def fit(self, data, labels):
        """Score every parameter combination and refit the best one.

        A fresh model is built for every (combination, fold) pair.
        """
        best_score = -float('inf')
        for params in self._get_param_combinations():
            scores = []
            for i in range(self.cv):
                train_data, train_labels, val_data, val_labels = self._get_fold(data, labels, i)
                model = self.estimator(**params)
                model.fit(train_data, train_labels)
                score = self._score(model, val_data, val_labels)
                scores.append(score)
            mean_score = np.mean(scores)
            # Scores are negated errors, so a larger mean is a better candidate.
            if mean_score > best_score:
                best_score = mean_score
                self.best_params = params
                self.best_estimator = self.estimator(**params)
        self.best_estimator.fit(data, labels)

    def predict(self, data):
        """Predict with the refit best estimator; ``fit`` must be called first."""
        return self.best_estimator.predict(data)

    def _score(self, model, data, labels):
        """Return the negative mean squared error of ``model`` on ``data``."""
        preds = model.predict(data)
        return -np.mean((labels - preds)**2)

    def _get_fold(self, data, labels, i):
        """Split ``data``/``labels`` into train and validation parts for fold ``i``.

        Returns ``(train_data, train_labels, val_data, val_labels)``. When the
        sample count is not divisible by ``cv`` the first ``n_samples % cv``
        folds get one extra sample, so every sample is validated exactly once.
        """
        n_samples = data.shape[0]
        base_size, remainder = divmod(n_samples, self.cv)
        # Folds before ``i`` that received an extra sample shift the start.
        start = i * base_size + min(i, remainder)
        end = start + base_size + (1 if i < remainder else 0)
        val_data = data[start:end]
        val_labels = labels[start:end]
        train_data = np.concatenate([data[:start], data[end:]], axis=0)
        train_labels = np.concatenate([labels[:start], labels[end:]], axis=0)
        return train_data, train_labels, val_data, val_labels

    def _get_param_combinations(self):
        """Return one ``{name: value}`` dict per point of the parameter grid.

        Each parameter only takes values from its own candidate list; the
        result is the Cartesian product over all parameters.
        """
        names = list(self.param_grid.keys())
        return [
            dict(zip(names, values))
            for values in itertools.product(*self.param_grid.values())
        ]


Now we can use GridSearchCV to find the best hyperparameters for our DecisionTreeRegressor:

In [8]:
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import itertools
from itertools import product

# Load the Boston housing regression data and hold out 20% of it for testing.
# NOTE(review): load_boston is reportedly removed from recent scikit-learn
# releases — confirm the installed version still provides it.
data, labels = load_boston(return_X_y=True)
# NOTE(review): no random_state is passed, so the split presumably differs
# from run to run — confirm whether reproducibility matters here.
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.2)

# Candidate values per hyper-parameter. Every key is forwarded to the
# estimator's constructor as a keyword argument.
# NOTE(review): the recorded run of this cell failed with a TypeError because
# the estimator's __init__ did not accept 'standard_deviation_reduction'.
param_grid = {
    'max_depth': [2, 4, 6, 8],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 3],
    'standard_deviation_reduction': [True, False]
}

# NOTE(review): DecisionTreeRegressor is not defined or imported in this cell;
# it has to come from an earlier cell.
grid_search = GridSearchCV(DecisionTreeRegressor, param_grid)
grid_search.fit(train_data, train_labels)
print("Best hyperparameters:", grid_search.best_params)
# _score reports the negative mean squared error, so values closer to 0 are better.
print("Train score:", grid_search._score(grid_search.best_estimator, train_data, train_labels))
print("Test score:", grid_search._score(grid_search.best_estimator, test_data, test_labels))


TypeError: __init__() got an unexpected keyword argument 'standard_deviation_reduction'

# 3rd

In [5]:
from itertools import product

class GridSearchCV:
    """Exhaustive hyper-parameter search scored by shuffled k-fold cross-validation.

    Every combination in the Cartesian product of the ``param_grid`` values is
    scored with the mean squared error averaged over ``cv`` folds. The
    combination with the lowest average error is selected and refit on the
    full data.

    Parameters
    ----------
    estimator : callable
        Called as ``estimator(**params)``; the returned object must provide
        ``fit(data, labels)`` and ``predict(data)``.
    param_grid : dict
        Maps constructor keyword names to sequences of candidate values.
    cv : int, default 5
        Number of cross-validation folds.

    Attributes
    ----------
    best_params_ : dict
        Set by ``fit``: the combination with the lowest average error.
    best_score_ : float
        Set by ``fit``: the average cross-validated mean squared error of
        ``best_params_`` (lower is better).
    best_estimator_ : object
        Set by ``fit``: an estimator built from ``best_params_`` and fitted on
        all of the data passed to ``fit``.
    """

    def __init__(self, estimator, param_grid, cv=5):
        self.estimator = estimator
        self.param_grid = param_grid
        self.cv = cv

    def fit(self, data, labels):
        """Score every parameter combination and refit the best one.

        Parameters
        ----------
        data, labels : numpy.ndarray
            Samples and targets; both are indexed with integer index arrays.
        """
        # Shuffle once so every candidate is evaluated on the same folds and
        # their average errors are directly comparable.
        n_samples = len(data)
        indices = np.arange(n_samples)
        np.random.shuffle(indices)

        # Spread the remainder over the first folds so every sample is
        # validated exactly once.
        fold_sizes = np.full(self.cv, n_samples // self.cv, dtype=int)
        fold_sizes[:n_samples % self.cv] += 1
        stops = np.cumsum(fold_sizes)
        starts = stops - fold_sizes

        # The score is an error, so the search minimises it.
        best_score = float('inf')
        best_params = None
        names = list(self.param_grid.keys())
        for params in product(*self.param_grid.values()):
            params_dict = dict(zip(names, params))
            estimator = self.estimator(**params_dict)
            scores = []
            for start, stop in zip(starts, stops):
                val_indices = indices[start:stop]
                train_indices = np.concatenate((indices[:start], indices[stop:]))
                train_data, train_labels = data[train_indices], labels[train_indices]
                val_data, val_labels = data[val_indices], labels[val_indices]
                estimator.fit(train_data, train_labels)
                val_pred = estimator.predict(val_data)
                score = self._mean_squared_error(val_labels, val_pred)
                scores.append(score)
            avg_score = np.mean(scores)
            if avg_score < best_score:
                best_score = avg_score
                best_params = params_dict
        self.best_estimator_ = self.estimator(**best_params)
        # Refit on all of the data so the selected estimator is ready to predict.
        self.best_estimator_.fit(data, labels)
        self.best_params_ = best_params
        self.best_score_ = best_score

    def _mean_squared_error(self, y_true, y_pred):
        """Return the mean squared error between ``y_true`` and ``y_pred``."""
        return np.mean((y_true - y_pred)**2)


Now, let's test our implementation on a sample dataset:

In [6]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Generate a synthetic regression dataset (1000 samples, 10 features).
data, labels = make_regression(n_samples=1000, n_features=10, noise=0.1)

# Hold out 20% of the samples for the final evaluation.
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.2)

# Hyperparameter tuning: every key in param_grid is forwarded to the
# estimator's constructor as a keyword argument.
# NOTE(review): DecisionTreeRegressor is not defined or imported in this cell;
# the recorded run failed with a NameError on that name, so an earlier cell
# has to bring it into scope.
param_grid = {
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'standard_deviation_reduction': [True, False]
}
grid_search = GridSearchCV(DecisionTreeRegressor, param_grid)
grid_search.fit(train_data, train_labels)
print("Best parameters:", grid_search.best_params_)
# Cross-validated score recorded for the selected parameters.
print("Best score:", grid_search.best_score_)

# Fit the selected estimator on the training split, then predict on the
# held-out test split.
tree = grid_search.best_estimator_
tree.fit(train_data, train_labels)
pred = tree.predict(test_data)

# Evaluate with the mean squared error on the held-out data.
mse = np.mean((test_labels - pred)**2)
print("Mean squared error:", mse)


NameError: name 'DecisionTreeRegressor' is not defined

# 4th 