# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click 'Run Cell' button.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/ipython-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import time

# Load datasets
red = pd.read_csv('data/winequality-red.csv', sep=';')
white = pd.read_csv('data/winequality-white.csv', sep=';')

# Add wine_color column
red['wine_color'] = 'Red'
white['wine_color'] = 'White'

# Combine datasets
wine = pd.concat([red, white], axis=0)

# Encode categorical variable
le = LabelEncoder()
wine['wine_color'] = le.fit_transform(wine['wine_color'])

# Features and target
X = wine.drop('quality', axis=1)
y = wine['quality']

# Split into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Single Decision Tree
dt = DecisionTreeClassifier(criterion='entropy', random_state=42)
start_time = time.time()
dt.fit(X_train, y_train)
dt_train_time = time.time() - start_time
dt_pred = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_pred)

# Random Forest with 25 trees
rf_25 = RandomForestClassifier(n_estimators=25, criterion='entropy', random_state=42, max_features='sqrt')
start_time = time.time()
rf_25.fit(X_train, y_train)
rf25_train_time = time.time() - start_time
rf25_pred = rf_25.predict(X_test)
rf25_accuracy = accuracy_score(y_test, rf25_pred)

# Random Forest with 50 trees
rf_50 = RandomForestClassifier(n_estimators=50, criterion='entropy', random_state=42, max_features='sqrt')
start_time = time.time()
rf_50.fit(X_train, y_train)
rf50_train_time = time.time() - start_time
rf50_pred = rf_50.predict(X_test)
rf50_accuracy = accuracy_score(y_test, rf50_pred)

print(f"Decision Tree Accuracy: {dt_accuracy:.4f}, Training Time: {dt_train_time:.4f} seconds")
print(f"Random Forest (25 trees) Accuracy: {rf25_accuracy:.4f}, Training Time: {rf25_train_time:.4f} seconds")
print(f"Random Forest (50 trees) Accuracy: {rf50_accuracy:.4f}, Training Time: {rf50_train_time:.4f} seconds")


Decision Tree Accuracy: 0.6108, Training Time: 0.0547 seconds
Random Forest (25 trees) Accuracy: 0.6692, Training Time: 0.3288 seconds
Random Forest (50 trees) Accuracy: 0.6746, Training Time: 0.5650 seconds


In [6]:
import numpy as np

# Function to evaluate individual trees
def evaluate_trees(rf_model, X_test, y_test):
    tree_accuracies = []
    for idx, tree in enumerate(rf_model.estimators_):
        pred = tree.predict(X_test)
        acc = accuracy_score(y_test, pred)
        tree_accuracies.append((idx, acc))
    return tree_accuracies

# Evaluate trees in Random Forest with 50 trees
rf50_accuracies = evaluate_trees(rf_50, X_test, y_test)

# Sort trees by accuracy
rf50_sorted = sorted(rf50_accuracies, key=lambda x: x[1], reverse=True)

# Top 10 trees
top_10 = rf50_sorted[:10]
# Bottom 10 trees
bottom_10 = rf50_sorted[-10:]

print("Top 10 Trees:")
for tree_id, acc in top_10:
    print(f"Tree {tree_id}: Accuracy = {acc:.4f}")

print("\nBottom 10 Trees:")
for tree_id, acc in bottom_10:
    print(f"Tree {tree_id}: Accuracy = {acc:.4f}")


Top 10 Trees:
Tree 29: Accuracy = 0.0131
Tree 44: Accuracy = 0.0123
Tree 10: Accuracy = 0.0115
Tree 34: Accuracy = 0.0115
Tree 8: Accuracy = 0.0108
Tree 12: Accuracy = 0.0108
Tree 35: Accuracy = 0.0092
Tree 7: Accuracy = 0.0085
Tree 13: Accuracy = 0.0085
Tree 16: Accuracy = 0.0085

Bottom 10 Trees:
Tree 11: Accuracy = 0.0046
Tree 19: Accuracy = 0.0046
Tree 23: Accuracy = 0.0046
Tree 48: Accuracy = 0.0046
Tree 22: Accuracy = 0.0038
Tree 27: Accuracy = 0.0038
Tree 47: Accuracy = 0.0038
Tree 32: Accuracy = 0.0031
Tree 36: Accuracy = 0.0031
Tree 30: Accuracy = 0.0023




In [7]:
from sklearn.tree import export_text

tree_structure = export_text(dt, feature_names=list(X.columns))
print(tree_structure)

|--- alcohol <= 10.62
|   |--- volatile acidity <= 0.24
|   |   |--- alcohol <= 8.95
|   |   |   |--- fixed acidity <= 6.85
|   |   |   |   |--- pH <= 3.24
|   |   |   |   |   |--- residual sugar <= 12.70
|   |   |   |   |   |   |--- fixed acidity <= 6.45
|   |   |   |   |   |   |   |--- class: 5
|   |   |   |   |   |   |--- fixed acidity >  6.45
|   |   |   |   |   |   |   |--- class: 6
|   |   |   |   |   |--- residual sugar >  12.70
|   |   |   |   |   |   |--- class: 6
|   |   |   |   |--- pH >  3.24
|   |   |   |   |   |--- sulphates <= 0.41
|   |   |   |   |   |   |--- density <= 1.00
|   |   |   |   |   |   |   |--- class: 6
|   |   |   |   |   |   |--- density >  1.00
|   |   |   |   |   |   |   |--- class: 5
|   |   |   |   |   |--- sulphates >  0.41
|   |   |   |   |   |   |--- free sulfur dioxide <= 22.00
|   |   |   |   |   |   |   |--- class: 7
|   |   |   |   |   |   |--- free sulfur dioxide >  22.00
|   |   |   |   |   |   |   |--- fixed acidity <= 4.35
|   |   |   |   |