In [1]:
import numpy as np

class DecisionStumpClassifier:
    def __init__(self):
        self.feature_index = None
        self.threshold = None
        self.left_prediction = None
        self.right_prediction = None
    
    def fit(self, X, y):
        num_samples, num_features = X.shape
        best_error = float('inf')
        index = 0
        for feature_index in range(num_features):
            temp = X.iloc[:, feature_index]
            if temp.isnull().any():  # Sprawdza, czy w kolumnie występuje wartość null
                continue
            if isinstance(temp.iloc[0], str) or temp.iloc[0] is None:
                continue
            unique_values = np.unique(temp)
            thresholds = (unique_values[:-1] + unique_values[1:]) / 2.0
            for threshold in thresholds:
                prediction = np.where(X.iloc[:, feature_index] <= threshold, 1, 0)
                error = np.sum(prediction != y)

                if error < best_error:
                    best_error = error
                    self.feature_index = feature_index
                    self.threshold = threshold

                    left_indices = X.iloc[:, feature_index] <= threshold
                    right_indices = ~left_indices

                    self.left_prediction = self._majority_vote(y[left_indices])
                    self.right_prediction = self._majority_vote(y[right_indices])
    def _majority_vote(self, y):
        values, counts = np.unique(y, return_counts=True)
        return values[np.argmax(counts)]
    
    def predict(self, X):
        predictions = np.where(X.iloc[:, self.feature_index] <= self.threshold, self.left_prediction, self.right_prediction)
        return predictions

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.tree import DecisionTreeClassifier

from DecisionStumpClassifier import DecisionStumpClassifier 

auto_mpg = pd.read_parquet('./data/auto-mpg.parquet', engine='pyarrow')
autos = pd.read_parquet('./data/autos.parquet', engine='pyarrow')
hungarian_heart_disease = pd.read_parquet('./data/hungarian-heart-disease.parquet', engine='pyarrow')

datasets = {
    'auto-mpg': auto_mpg,
    #'autos': autos,
    'hungarian-heart-disease': hungarian_heart_disease
}

def evaluate_model(classifier, X, y):
    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    scores = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        scores.append(balanced_accuracy_score(y_test, y_pred))
    return np.mean(scores)

results = pd.DataFrame(columns=['DS', 'DT (max_depth=1)', 'DT'])

for name, dataset in datasets.items():
    X = dataset.drop(columns=['class'])
    y = dataset['class']
    results.loc[name] = [
        evaluate_model(DecisionStumpClassifier(), X, y),
        evaluate_model(DecisionTreeClassifier(max_depth=1), X, y),
        evaluate_model(DecisionTreeClassifier(), X, y)
    ]

results

Unnamed: 0,DS,DT (max_depth=1),DT
auto-mpg,0.333333,0.587671,0.721376
hungarian-heart-disease,0.5,0.758539,0.731008
