In [5]:
import numpy as np
from scipy.stats import randint
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

In [6]:
# Load the wine dataset and pull out the feature matrix and the class labels.
wine_data = load_wine()
X, y = wine_data.data, wine_data.target

In [7]:
# Hold out 20% of the samples as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Search space for the randomized hyperparameter search.
# randint(low, high) samples integers from low up to, but not including, high.
param_dist = dict(
    criterion=['gini', 'entropy'],
    max_depth=randint(1, 20),
    min_samples_split=randint(2, 20),
    min_samples_leaf=randint(1, 20),
)


In [9]:
# Tune a single decision tree: 100 sampled parameter combinations,
# 5-fold cross-validation, using all available CPU cores.
decision_tree = DecisionTreeClassifier(random_state=42)
random_search = RandomizedSearchCV(
    estimator=decision_tree,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

In [10]:
# Hyperparameter combination selected by the randomized search; reused below
# to configure the trees of the hand-built ensemble.
best_params = random_search.best_params_

In [11]:
# Score the best estimator found by the search on the held-out test set.
best_tree_model = random_search.best_estimator_
y_pred = best_tree_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [12]:
# Report the selected hyperparameters and the single tree's test accuracy.
print(
    f"Best hyperparameters: {best_params}",
    f"Accuracy on the test set: {accuracy * 100:.2f}%",
    sep="\n",
)


Best hyperparameters: {'criterion': 'gini', 'max_depth': 17, 'min_samples_leaf': 1, 'min_samples_split': 17}
Accuracy on the test set: 94.44%


### ShuffleSplit

In [13]:
# Number of random training subsets (and therefore trees) in the ensemble.
n_subsets = 10

# Each split sets aside 20% of the samples it is given; the seed fixes the subsets.
shuffle_split = ShuffleSplit(
    n_splits=n_subsets,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Train one decision tree per ShuffleSplit subset of the training data,
# all configured with the hyperparameters found by the randomized search.
individual_trees = []

# Only the "train" indices of each split are used; the other part is ignored.
for train_index, _ in shuffle_split.split(X_train):
    X_subset, y_subset = X_train[train_index], y_train[train_index]
    # Seed the tree (same seed as the tuned tree) so that refitting after a
    # kernel restart yields the same trees and the same ensemble accuracy;
    # without it, ties between equally good splits are broken at random.
    subset_tree = DecisionTreeClassifier(random_state=42, **best_params)
    subset_tree.fit(X_subset, y_subset)
    individual_trees.append(subset_tree)

In [15]:
# Prediction buffer: one row per test sample, one column per tree.
n_test_samples = X_test.shape[0]
ensemble_predictions = np.zeros((n_test_samples, n_subsets))

In [16]:
# Store each tree's test-set predictions in its own column of the buffer.
for column, fitted_tree in enumerate(individual_trees):
    ensemble_predictions[:, column] = fitted_tree.predict(X_test)

In [17]:
def _majority_label(votes):
    """Return the most frequent label in a 1-D array of per-tree votes.

    The votes are cast to int before counting; on a tie, argmax picks the
    smallest label.
    """
    return np.bincount(votes.astype(int)).argmax()


# Majority vote across the trees (columns) for every test sample (row).
ensemble_predictions_majority = np.apply_along_axis(_majority_label, axis=1, arr=ensemble_predictions)


In [18]:
# Compare the majority-vote labels with the true test labels and report the result.
accuracy_ensemble = accuracy_score(
    y_true=y_test,
    y_pred=ensemble_predictions_majority,
)
print(f"Accuracy of the Random Forest on the test set: {accuracy_ensemble * 100:.2f}%")


Accuracy of the Random Forest on the test set: 97.22%
