In [1]:
import numpy as np
import matplotlib.pyplot as plt

# Linear

In [None]:
import numpy as np
import pandas as pd

class LinearRegression:
    def __init__(self, fit_intercept=True):
        self.beta = None
        self.fit_intercept = fit_intercept
        self._is_fit = False

    def _add_intercept(self, X):
        return np.c_[np.ones(X.shape[0]), X] if self.fit_intercept else X

    def fit(self, X, y):
        if X.shape[0] != y.shape[0]:
            raise ValueError("The number of rows in X and y must match.")

        X = self._add_intercept(X)
        self.beta = np.linalg.pinv(X.T @ X) @ X.T @ y
        self._is_fit = True

    def predict(self, X):
        if not self._is_fit:
            raise RuntimeError("You must call `fit` before `predict`.")
        X = self._add_intercept(X)
        return X @ self.beta

    def evaluate(self, X, y):
        y_pred = self.predict(X)
        n, p = X.shape[0], X.shape[1] if self.fit_intercept else X.shape[1] - 1

        mae = np.mean(np.abs(y - y_pred))
        mse = np.mean((y - y_pred) ** 2)
        rmse = np.sqrt(mse)
        r2_square = 1 - (np.sum((y - y_pred) ** 2) / np.sum((y - np.mean(y)) ** 2))
        adj_rsquared = 1 - (1 - r2_square) * (n - 1) / (n - p - 1)

        metrics = pd.DataFrame({
            "Metric": ["MAE", "MSE", "RMSE", "R2 Square", "Adj R Square"],
            "Value": [mae, mse, rmse, r2_square, adj_rsquared]
        })
        return metrics


if __name__ == "__main__":
    np.random.seed(42)
    data_size = 100
    X = np.random.uniform(low=1.0, high=10.0, size=(data_size, 1))
    y = 5 * X[:, 0] + 10 + np.random.normal(0, 2, size=data_size)

    model = LinearRegression(fit_intercept=True)
    model.fit(X, y)

    y_pred = model.predict(X)

    print("Predicted Values:\n", y_pred)

    metrics = model.evaluate(X, y)
    print("\nModel Evaluation Metrics:")
    print(metrics)



Predicted Values:
 [31.94009062 57.33811013 47.69681726 41.81932982 22.30756475 22.30650152
 17.99054428 53.61162967 41.92761467 46.64235269 16.33756616 58.1842585
 52.12464147 24.79019662 23.44511934 23.51474629 28.84132837 38.56169382
 34.47052464 28.2677049  42.40094542 21.57914496 28.30806092 31.57958853
 35.53396407 50.04110481 24.23190319 38.09787959 41.5440952  17.47774758
 42.21104494 22.94697303 18.29769604 57.25749716 57.99569029 51.06471407
 28.85770542 19.7356234  45.59149445 34.83231387 20.80969306 37.2578151
 16.94605392 55.51344806 26.83733127 44.63447509 29.17055792 38.35502656
 39.52942939 23.57866055 58.16992246 49.59839873 56.84373176 54.87458777
 41.78589456 56.06682671 19.33098194 24.06920578 17.42383172 29.77090085
 32.56326353 27.3913807  51.96131759 31.15604076 27.81391292 39.3524818
 21.64219607 50.79139906 18.7164185  58.93261608 49.47109199 24.18966968
 15.67360974 51.37610193 46.58878462 47.56515892 49.428139   18.69411415
 31.23152421 20.53775301 53.4761827

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as SklearnLinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

X, y = make_regression(n_samples=200, n_features=1, noise=10, random_state=42)
y = y.reshape(-1, 1) 

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

custom_model = LinearRegression()
custom_model.fit(X_train, y_train)  
custom_val_metrics = custom_model.evaluate(X_val, y_val)  
custom_test_metrics = custom_model.evaluate(X_test, y_test)  

sklearn_model = SklearnLinearRegression()
sklearn_model.fit(X_train, y_train) 
y_val_pred_sklearn = sklearn_model.predict(X_val)
y_test_pred_sklearn = sklearn_model.predict(X_test)

mae_val_sklearn = mean_absolute_error(y_val, y_val_pred_sklearn)
mse_val_sklearn = mean_squared_error(y_val, y_val_pred_sklearn)
rmse_val_sklearn = np.sqrt(mse_val_sklearn)
r2_val_sklearn = r2_score(y_val, y_val_pred_sklearn)
n, p = X_val.shape[0], X_val.shape[1]
adj_r2_val_sklearn = 1 - (1 - r2_val_sklearn) * (n - 1) / (n - p - 1)

mae_test_sklearn = mean_absolute_error(y_test, y_test_pred_sklearn)
mse_test_sklearn = mean_squared_error(y_test, y_test_pred_sklearn)
rmse_test_sklearn = np.sqrt(mse_test_sklearn)
r2_test_sklearn = r2_score(y_test, y_test_pred_sklearn)
n, p = X_test.shape[0], X_test.shape[1]
adj_r2_test_sklearn = 1 - (1 - r2_test_sklearn) * (n - 1) / (n - p - 1)

val_metrics_sklearn = pd.DataFrame({
    "Metric": ["MAE", "MSE", "RMSE", "R2 Square", "Adj R2"],
    "Value": [mae_val_sklearn, mse_val_sklearn, rmse_val_sklearn, r2_val_sklearn, adj_r2_val_sklearn]
})

test_metrics_sklearn = pd.DataFrame({
    "Metric": ["MAE", "MSE", "RMSE", "R2 Square", "Adj R2"],
    "Value": [mae_test_sklearn, mse_test_sklearn, rmse_test_sklearn, r2_test_sklearn, adj_r2_test_sklearn]
})

print("Validation Set Metrics Comparison:")
print(pd.concat([custom_val_metrics, val_metrics_sklearn], axis=1, keys=["Custom Model", "Sklearn Model"]))

print("\nTest Set Metrics Comparison:")
print(pd.concat([custom_test_metrics, test_metrics_sklearn], axis=1, keys=["Custom Model", "Sklearn Model"]))

Validation Set Metrics Comparison:
   Custom Model            Sklearn Model           
         Metric      Value        Metric      Value
0           MAE   6.523364           MAE   6.523364
1           MSE  78.319574           MSE  78.319574
2          RMSE   8.849835          RMSE   8.849835
3     R2 Square   0.991358     R2 Square   0.991358
4  Adj R Square   0.991131        Adj R2   0.991131

Test Set Metrics Comparison:
   Custom Model             Sklearn Model            
         Metric       Value        Metric       Value
0           MAE    9.448952           MAE    9.448952
1           MSE  125.890300           MSE  125.890300
2          RMSE   11.220085          RMSE   11.220085
3     R2 Square    0.983350     R2 Square    0.983350
4  Adj R Square    0.982911        Adj R2    0.982911


# Logistic

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, LeaveOneOut
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss
from sklearn.linear_model import LogisticRegression as SklearnLogisticRegression

class LogisticRegression:
    def __init__(self, penalty="l2", gamma=0, fit_intercept=True):
        assert penalty in ["l2", "l1"], "penalty must be 'l1' or 'l2'"
        self.beta = None
        self.gamma = gamma
        self.penalty = penalty
        self.fit_intercept = fit_intercept

    def _add_intercept(self, X):
        return np.c_[np.ones(X.shape[0]), X] if self.fit_intercept else X

    def fit(self, X, y, lr=0.01, tol=1e-6, max_iter=1000):
        X = self._add_intercept(X)
        self.beta = np.random.randn(X.shape[1])
        prev_loss = float("inf")

        for _ in range(int(max_iter)):
            y_pred = _sigmoid(X @ self.beta)
            loss = self._compute_loss(y, y_pred)
            if abs(prev_loss - loss) < tol:
                break
            prev_loss = loss
            grad = self._compute_gradient(X, y, y_pred)
            self.beta -= lr * grad

    def _compute_loss(self, y, y_pred):
        N = len(y)
        if self.penalty == "l2":
            penalty = (self.gamma / 2) * np.sum(self.beta ** 2)
        elif self.penalty == "l1":
            penalty = self.gamma * np.sum(np.abs(self.beta))
        else:
            penalty = 0
        return -(np.sum(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred)) / N) + penalty

    def _compute_gradient(self, X, y, y_pred):
        N = len(y)
        if self.penalty == "l2":
            penalty_grad = self.gamma * self.beta
        elif self.penalty == "l1":
            penalty_grad = self.gamma * np.sign(self.beta)
        else:
            penalty_grad = 0
        return -(X.T @ (y - y_pred)) / N + penalty_grad

    def predict_proba(self, X):
        X = self._add_intercept(X)
        return _sigmoid(X @ self.beta)

    def predict(self, X):
        return (self.predict_proba(X) >= 0.5).astype(int)

def _sigmoid(x):
    return 1 / (1 + np.exp(-x))

def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    y_proba = model.predict_proba(X)
    metrics = {
        "Accuracy": accuracy_score(y, y_pred),
        "Precision": precision_score(y, y_pred),
        "Recall": recall_score(y, y_pred),
        "F1 Score": f1_score(y, y_pred),
        "Log Loss": log_loss(y, y_proba)
    }
    return pd.DataFrame(metrics.items(), columns=["Metric", "Value"])

def cross_validate(model, X, y, cv="kfold", n_splits=5):
    if cv == "kfold":
        cv_splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    elif cv == "stratified":
        cv_splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    elif cv == "loo":
        cv_splitter = LeaveOneOut()
    else:
        raise ValueError("Invalid cv type. Choose from 'kfold', 'stratified', 'loo'.")

    metrics_list = []
    for train_idx, val_idx in cv_splitter.split(X, y):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        model.fit(X_train, y_train)
        metrics = evaluate_model(model, X_val, y_val)
        metrics_list.append(metrics.set_index("Metric")["Value"])

    avg_metrics = pd.DataFrame(metrics_list).mean(axis=0)
    return avg_metrics


if __name__ == "__main__":
    np.random.seed(42)
    X = np.random.randn(500, 2)
    y = (X[:, 0] + X[:, 1] > 0).astype(int)

    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    custom_model = LogisticRegression(penalty="l2", gamma=0.1)
    custom_model.fit(X_train, y_train)
    train_metrics_custom = evaluate_model(custom_model, X_train, y_train)
    val_metrics_custom = evaluate_model(custom_model, X_val, y_val)
    test_metrics_custom = evaluate_model(custom_model, X_test, y_test)

    sklearn_model = SklearnLogisticRegression(penalty="l2", C=1/0.1, solver="liblinear")
    sklearn_model.fit(X_train, y_train)
    train_metrics_sklearn = evaluate_model(sklearn_model, X_train, y_train)
    val_metrics_sklearn = evaluate_model(sklearn_model, X_val, y_val)
    test_metrics_sklearn = evaluate_model(sklearn_model, X_test, y_test)

    print("Train Metrics Comparison:")
    print(pd.concat([train_metrics_custom, train_metrics_sklearn], axis=1, keys=["Custom Model", "Sklearn Model"]))

    print("\nValidation Metrics Comparison:")
    print(pd.concat([val_metrics_custom, val_metrics_sklearn], axis=1, keys=["Custom Model", "Sklearn Model"]))

    print("\nTest Metrics Comparison:")
    print(pd.concat([test_metrics_custom, test_metrics_sklearn], axis=1, keys=["Custom Model", "Sklearn Model"]))

    print("\nCross-Validation Metrics Comparison:")
    custom_cv_metrics = cross_validate(custom_model, X_train, y_train, cv="kfold", n_splits=5)
    sklearn_cv_metrics = cross_validate(sklearn_model, X_train, y_train, cv="kfold", n_splits=5)
    print(pd.concat([custom_cv_metrics, sklearn_cv_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))



Train Metrics Comparison:
  Custom Model           Sklearn Model          
        Metric     Value        Metric     Value
0     Accuracy  0.983333      Accuracy  0.993333
1    Precision  0.967949     Precision  0.993377
2       Recall  1.000000        Recall  0.993377
3     F1 Score  0.983713      F1 Score  0.993377
4     Log Loss  0.369571      Log Loss  0.054250

Validation Metrics Comparison:
  Custom Model           Sklearn Model          
        Metric     Value        Metric     Value
0     Accuracy  0.970000      Accuracy  0.990000
1    Precision  0.940000     Precision  0.979167
2       Recall  1.000000        Recall  1.000000
3     F1 Score  0.969072      F1 Score  0.989474
4     Log Loss  0.329722      Log Loss  0.040553

Test Metrics Comparison:
  Custom Model           Sklearn Model          
        Metric     Value        Metric     Value
0     Accuracy  0.960000      Accuracy  1.000000
1    Precision  0.920000     Precision  1.000000
2       Recall  1.000000        Re

# DT

In [None]:
class ClassProbEstimator:
    def fit(self, X, y):
        self.class_prob = y.sum() / len(y)

    def predict(self, X):
        pred = np.empty(X.shape[0], dtype=np.float64)
        pred.fill(self.class_prob)
        return pred


class MeanBaseEstimator:
    def fit(self, X, y):
        self.avg = np.mean(y)

    def predict(self, X):
        pred = np.empty(X.shape[0], dtype=np.float64)
        pred.fill(self.avg)
        return pred

class MSELoss:
    def __call__(self, y, y_pred):
        return np.mean((y - y_pred) ** 2)

    def base_estimator(self):
        return MeanBaseEstimator()

    def grad(self, y, y_pred):
        return -2 / len(y) * (y - y_pred)

    def line_search(self, y, y_pred, h_pred):
        # TODO: revise this
        Lp = np.sum((y - y_pred) * h_pred)
        Lpp = np.sum(h_pred * h_pred)

        return 1 if np.sum(Lpp) == 0 else Lp / Lpp


class CrossEntropyLoss:
    def __call__(self, y, y_pred):
        eps = np.finfo(float).eps
        return -np.sum(y * np.log(y_pred + eps))

    def base_estimator(self):
        return ClassProbEstimator()

    def grad(self, y, y_pred):
        eps = np.finfo(float).eps
        return -y * 1 / (y_pred + eps)

    def line_search(self, y, y_pred, h_pred):
        raise NotImplementedError

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, LeaveOneOut
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score, r2_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

class Node:
    def __init__(self, left, right, rule):
        self.left = left
        self.right = right
        self.feature = rule[0]
        self.threshold = rule[1]

class Leaf:
    def __init__(self, value):
        self.value = value 

class DecisionTree:
    def __init__(self, classifier=True, max_depth=None, n_feats=None, criterion="entropy", seed=None):
        self.classifier = classifier
        self.max_depth = max_depth if max_depth else np.inf
        self.n_feats = n_feats
        self.criterion = criterion
        self.root = None
        self.depth = 0
        np.random.seed(seed)

        if not classifier and criterion not in ["mse"]:
            raise ValueError("`mse` is the only valid criterion for regression.")
        if classifier and criterion not in ["entropy", "gini"]:
            raise ValueError("Valid criteria for classification are 'entropy' and 'gini'.")

    def fit(self, X, Y):
        self.n_classes = max(Y) + 1 if self.classifier else None
        self.n_feats = X.shape[1] if self.n_feats is None else min(self.n_feats, X.shape[1])
        self.root = self._grow(X, Y)

    def predict(self, X):
        return np.array([self._traverse(x, self.root) for x in X])

    def _grow(self, X, Y, cur_depth=0):
        if len(set(Y)) == 1:
            return Leaf(self._leaf_value(Y))

        if cur_depth >= self.max_depth:
            return Leaf(self._leaf_value(Y))

        cur_depth += 1
        self.depth = max(self.depth, cur_depth)

        feat_idxs = np.random.choice(X.shape[1], self.n_feats, replace=False)

        feat, thresh = self._segment(X, Y, feat_idxs)
        l_idx = np.argwhere(X[:, feat] <= thresh).flatten()
        r_idx = np.argwhere(X[:, feat] > thresh).flatten()

        left = self._grow(X[l_idx], Y[l_idx], cur_depth)
        right = self._grow(X[r_idx], Y[r_idx], cur_depth)
        return Node(left, right, (feat, thresh))

    def _segment(self, X, Y, feat_idxs):
        best_gain = -np.inf
        split_idx, split_thresh = None, None

        for i in feat_idxs:
            thresholds = np.unique(X[:, i])
            for thresh in thresholds:
                gain = self._impurity_gain(Y, X[:, i], thresh)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = i
                    split_thresh = thresh

        return split_idx, split_thresh

    def _impurity_gain(self, Y, feat_values, thresh):
        if self.criterion == "entropy":
            impurity = entropy
        elif self.criterion == "gini":
            impurity = gini
        elif self.criterion == "mse":
            impurity = mse

        parent_loss = impurity(Y)
        left_idx = feat_values <= thresh
        right_idx = feat_values > thresh

        if np.sum(left_idx) == 0 or np.sum(right_idx) == 0:
            return 0

        left_loss = impurity(Y[left_idx])
        right_loss = impurity(Y[right_idx])
        n = len(Y)
        n_l, n_r = np.sum(left_idx), np.sum(right_idx)

        return parent_loss - (n_l / n) * left_loss - (n_r / n) * right_loss

    def _leaf_value(self, Y):
        if self.classifier:
            probs = np.bincount(Y, minlength=self.n_classes) / len(Y)
            return probs
        return np.mean(Y)

    def _traverse(self, x, node):
        if isinstance(node, Leaf):
            if self.classifier:
                return node.value.argmax()
            return node.value
        if x[node.feature] <= node.threshold:
            return self._traverse(x, node.left)
        return self._traverse(x, node.right)


def entropy(Y):
    ps = np.bincount(Y) / len(Y)
    return -np.sum([p * np.log2(p) for p in ps if p > 0])

def gini(Y):
    ps = np.bincount(Y) / len(Y)
    return 1 - np.sum(ps ** 2)

def mse(Y):
    return np.mean((Y - np.mean(Y)) ** 2)


def evaluate_model(model, X, y, classifier=True):
    y_pred = model.predict(X)

    if classifier:  
        metrics = {
            "Accuracy": accuracy_score(y, y_pred),
            "Precision": precision_score(y, y_pred, average="weighted"),
            "Recall": recall_score(y, y_pred, average="weighted"),
            "F1 Score": f1_score(y, y_pred, average="weighted"),
        }
    else:  
        mse = mean_squared_error(y, y_pred)
        r2 = r2_score(y, y_pred)
        adj_r2 = 1 - (1 - r2) * (len(y) - 1) / (len(y) - X.shape[1] - 1)
        metrics = {
            "MAE": mean_absolute_error(y, y_pred),
            "MSE": mse,
            "RMSE": np.sqrt(mse),
            "R2 Square": r2,
            "Adj R Square": adj_r2,
        }

    return pd.DataFrame(metrics.items(), columns=["Metric", "Value"])

def cross_validate(model, X, y, cv="kfold", n_splits=5, classifier=True):
    if cv == "kfold":
        cv_splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    elif cv == "stratified":
        if not classifier:
            raise ValueError("Stratified K-Fold 仅适用于分类问题。")
        cv_splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    elif cv == "loo":
        cv_splitter = LeaveOneOut()
    else:
        raise ValueError("Invalid cv type. Choose from 'kfold', 'stratified', 'loo'.")

    metrics_list = []
    for train_idx, val_idx in cv_splitter.split(X, y):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        model.fit(X_train, y_train)
        metrics = evaluate_model(model, X_val, y_val, classifier=classifier)
        metrics_list.append(metrics.set_index("Metric")["Value"])

    avg_metrics = pd.DataFrame(metrics_list).mean(axis=0)
    return avg_metrics

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

np.random.seed(42)
X = np.random.rand(200, 2) * 10
y = 3 * X[:, 0] + 2 * X[:, 1] + np.random.randn(200)

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

custom_model = DecisionTreeRegressor(max_depth=5, random_state=42)
custom_model.fit(X_train, y_train)

sklearn_model = DecisionTreeRegressor(max_depth=5, random_state=42)
sklearn_model.fit(X_train, y_train)

custom_train_metrics = evaluate_model(custom_model, X_train, y_train, classifier=False)
custom_val_metrics = evaluate_model(custom_model, X_val, y_val, classifier=False)
custom_test_metrics = evaluate_model(custom_model, X_test, y_test, classifier=False)

sklearn_train_metrics = evaluate_model(sklearn_model, X_train, y_train, classifier=False)
sklearn_val_metrics = evaluate_model(sklearn_model, X_val, y_val, classifier=False)
sklearn_test_metrics = evaluate_model(sklearn_model, X_test, y_test, classifier=False)

print("Train Metrics Comparison:")
print(pd.concat([custom_train_metrics, sklearn_train_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))

print("\nValidation Metrics Comparison:")
print(pd.concat([custom_val_metrics, sklearn_val_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))

print("\nTest Metrics Comparison:")
print(pd.concat([custom_test_metrics, sklearn_test_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))

custom_cv_metrics = cross_validate(custom_model, X, y, cv="kfold", n_splits=5, classifier=False)
sklearn_cv_metrics = cross_validate(sklearn_model, X, y, cv="kfold", n_splits=5, classifier=False)

print("\nCross-Validation Metrics Comparison:")
print(pd.concat([custom_cv_metrics, sklearn_cv_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))


Train Metrics Comparison:
   Custom Model           Sklearn Model          
         Metric     Value        Metric     Value
0           MAE  1.110582           MAE  1.110582
1           MSE  2.496919           MSE  2.496919
2          RMSE  1.580164          RMSE  1.580164
3     R2 Square  0.979766     R2 Square  0.979766
4  Adj R Square  0.979420  Adj R Square  0.979420

Validation Metrics Comparison:
   Custom Model           Sklearn Model          
         Metric     Value        Metric     Value
0           MAE  2.302911           MAE  2.302911
1           MSE  9.020365           MSE  9.020365
2          RMSE  3.003392          RMSE  3.003392
3     R2 Square  0.925768     R2 Square  0.925768
4  Adj R Square  0.921756  Adj R Square  0.921756

Test Metrics Comparison:
   Custom Model            Sklearn Model           
         Metric      Value        Metric      Value
0           MAE   2.541860           MAE   2.541860
1           MSE  10.888125           MSE  10.888125
2       

## RF

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, LeaveOneOut
import pandas as pd



def bootstrap_sample(X, Y):
    N, M = X.shape
    idxs = np.random.choice(N, N, replace=True)
    return X[idxs], Y[idxs]


class RandomForest:
    def __init__(self, n_trees, max_depth, n_feats, classifier=True, criterion="entropy"):
        self.trees = []
        self.n_trees = n_trees
        self.n_feats = n_feats
        self.max_depth = max_depth
        self.criterion = criterion
        self.classifier = classifier

    def fit(self, X, Y):
        self.trees = []
        for _ in range(self.n_trees):
            X_samp, Y_samp = bootstrap_sample(X, Y)
            tree = DecisionTree(
                n_feats=self.n_feats,
                max_depth=self.max_depth,
                criterion=self.criterion,
                classifier=self.classifier,
            )
            tree.fit(X_samp, Y_samp)
            self.trees.append(tree)

    def predict(self, X):
        tree_preds = np.array([[t._traverse(x, t.root) for x in X] for t in self.trees])
        return self._vote(tree_preds)

    def _vote(self, predictions):
        if self.classifier:
            out = [np.bincount(x).argmax() for x in predictions.T]
        else:
            out = [np.mean(x) for x in predictions.T]
        return np.array(out)

def evaluate_model(model, X, y, classifier=True):
    y_pred = model.predict(X)

    if classifier:  
        metrics = {
            "Accuracy": accuracy_score(y, y_pred),
            "Precision": precision_score(y, y_pred, average="weighted"),
            "Recall": recall_score(y, y_pred, average="weighted"),
            "F1 Score": f1_score(y, y_pred, average="weighted"),
        }
    else:  
        mse = mean_squared_error(y, y_pred)
        r2 = r2_score(y, y_pred)
        adj_r2 = 1 - (1 - r2) * (len(y) - 1) / (len(y) - X.shape[1] - 1)
        metrics = {
            "MAE": mean_absolute_error(y, y_pred),
            "MSE": mse,
            "RMSE": np.sqrt(mse),
            "R2 Square": r2,
            "Adj R Square": adj_r2,
        }

    return pd.DataFrame(metrics.items(), columns=["Metric", "Value"])

def cross_validate(model, X, y, cv="kfold", n_splits=5, classifier=True):
    if cv == "kfold":
        cv_splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    elif cv == "stratified":
        if not classifier:
            raise ValueError("Stratified K-Fold only works for classification tasks.")
        cv_splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    elif cv == "loo":
        cv_splitter = LeaveOneOut()
    else:
        raise ValueError("Invalid cv type. Choose from 'kfold', 'stratified', 'loo'.")

    metrics_list = []
    for train_idx, val_idx in cv_splitter.split(X, y):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model.fit(X_train, y_train)
        metrics = evaluate_model(model, X_val, y_val, classifier=classifier)
        metrics_list.append(metrics.set_index("Metric")["Value"])

    avg_metrics = pd.DataFrame(metrics_list).mean(axis=0)
    return avg_metrics

if __name__ == "__main__":
    np.random.seed(42)
    X_class = np.random.randn(200, 2)
    y_class = (X_class[:, 0] + X_class[:, 1] > 0).astype(int)

    X_train, X_temp, y_train, y_temp = train_test_split(X_class, y_class, test_size=0.4, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    custom_rf = RandomForest(n_trees=10, max_depth=5, n_feats=2, classifier=True, criterion="gini")
    custom_rf.fit(X_train, y_train)

    sklearn_rf = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=42)
    sklearn_rf.fit(X_train, y_train)

    custom_train_metrics = evaluate_model(custom_rf, X_train, y_train, classifier=True)
    custom_val_metrics = evaluate_model(custom_rf, X_val, y_val, classifier=True)
    custom_test_metrics = evaluate_model(custom_rf, X_test, y_test, classifier=True)

    sklearn_train_metrics = evaluate_model(sklearn_rf, X_train, y_train, classifier=True)
    sklearn_val_metrics = evaluate_model(sklearn_rf, X_val, y_val, classifier=True)
    sklearn_test_metrics = evaluate_model(sklearn_rf, X_test, y_test, classifier=True)

    print("Train Metrics Comparison:")
    print(pd.concat([custom_train_metrics, sklearn_train_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))

    print("\nValidation Metrics Comparison:")
    print(pd.concat([custom_val_metrics, sklearn_val_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))

    print("\nTest Metrics Comparison:")
    print(pd.concat([custom_test_metrics, sklearn_test_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))

    custom_cv_metrics = cross_validate(custom_rf, X_class, y_class, cv="stratified", n_splits=5, classifier=True)
    sklearn_cv_metrics = cross_validate(sklearn_rf, X_class, y_class, cv="stratified", n_splits=5, classifier=True)

    print("\nCross-Validation Metrics Comparison:")
    print(pd.concat([custom_cv_metrics, sklearn_cv_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))

    X_reg = np.random.rand(200, 2) * 10
    y_reg = 3 * X_reg[:, 0] + 2 * X_reg[:, 1] + np.random.randn(200)

    X_train, X_temp, y_train, y_temp = train_test_split(X_reg, y_reg, test_size=0.4, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    custom_rf = RandomForest(n_trees=10, max_depth=5, n_feats=2, classifier=False, criterion="mse")
    custom_rf.fit(X_train, y_train)

    sklearn_rf = RandomForestRegressor(n_estimators=10, max_depth=5, random_state=42)
    sklearn_rf.fit(X_train, y_train)

    custom_train_metrics = evaluate_model(custom_rf, X_train, y_train, classifier=False)
    custom_val_metrics = evaluate_model(custom_rf, X_val, y_val, classifier=False)
    custom_test_metrics = evaluate_model(custom_rf, X_test, y_test, classifier=False)

    sklearn_train_metrics = evaluate_model(sklearn_rf, X_train, y_train, classifier=False)
    sklearn_val_metrics = evaluate_model(sklearn_rf, X_val, y_val, classifier=False)
    sklearn_test_metrics = evaluate_model(sklearn_rf, X_test, y_test, classifier=False)

    print("\nRegression Train Metrics Comparison:")
    print(pd.concat([custom_train_metrics, sklearn_train_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))

    print("\nRegression Validation Metrics Comparison:")
    print(pd.concat([custom_val_metrics, sklearn_val_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))

    print("\nRegression Test Metrics Comparison:")
    print(pd.concat([custom_test_metrics, sklearn_test_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))

    custom_cv_metrics = cross_validate(custom_rf, X_reg, y_reg, cv="kfold", n_splits=5, classifier=False)
    sklearn_cv_metrics = cross_validate(sklearn_rf, X_reg, y_reg, cv="kfold", n_splits=5, classifier=False)

    print("\nRegression Cross-Validation Metrics Comparison:")
    print(pd.concat([custom_cv_metrics, sklearn_cv_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))


Train Metrics Comparison:
  Custom Model       Sklearn Model          
        Metric Value        Metric     Value
0     Accuracy   1.0      Accuracy  0.991667
1    Precision   1.0     Precision  0.991821
2       Recall   1.0        Recall  0.991667
3     F1 Score   1.0      F1 Score  0.991674

Validation Metrics Comparison:
  Custom Model       Sklearn Model          
        Metric Value        Metric     Value
0     Accuracy   0.9      Accuracy  0.925000
1    Precision   0.9     Precision  0.925333
2       Recall   0.9        Recall  0.925000
3     F1 Score   0.9      F1 Score  0.924556

Test Metrics Comparison:
  Custom Model           Sklearn Model          
        Metric     Value        Metric     Value
0     Accuracy  0.975000      Accuracy  0.975000
1    Precision  0.976389     Precision  0.976389
2       Recall  0.975000        Recall  0.975000
3     F1 Score  0.975079      F1 Score  0.975079

Cross-Validation Metrics Comparison:
           Custom Model  Sklearn Model
Metri

## GBDT

In [None]:
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    mean_squared_error, mean_absolute_error, r2_score
)
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, LeaveOneOut
import pandas as pd



def to_one_hot(labels, n_classes=None):
    if labels.ndim > 1:
        raise ValueError("labels must have dimension 1, but got {}".format(labels.ndim))

    N = labels.size
    n_cols = np.max(labels) + 1 if n_classes is None else n_classes
    one_hot = np.zeros((N, n_cols))
    one_hot[np.arange(N), labels] = 1.0
    return one_hot


class GradientBoostedDecisionTree:
    def __init__(self, n_iter, max_depth=None, classifier=True, learning_rate=1, loss="crossentropy", step_size="constant"):
        self.loss = loss
        self.weights = None
        self.learners = None
        self.out_dims = None
        self.n_iter = n_iter
        self.max_depth = max_depth
        self.step_size = step_size
        self.classifier = classifier
        self.learning_rate = learning_rate

    def fit(self, X, Y):
        if self.loss == "mse":
            loss = MSELoss()
        elif self.loss == "crossentropy":
            loss = CrossEntropyLoss()
        else:
            raise ValueError("Unsupported loss function: {}".format(self.loss))

        if self.classifier:
            Y = to_one_hot(Y.flatten())
        else:
            Y = Y.reshape(-1, 1) if len(Y.shape) == 1 else Y

        N, M = X.shape
        self.out_dims = Y.shape[1]
        self.learners = np.empty((self.n_iter, self.out_dims), dtype=object)
        self.weights = np.ones((self.n_iter, self.out_dims))
        self.weights[1:, :] *= self.learning_rate

        Y_pred = np.zeros((N, self.out_dims))
        for k in range(self.out_dims):
            t = loss.base_estimator()
            t.fit(X, Y[:, k])
            Y_pred[:, k] += t.predict(X)
            self.learners[0, k] = t

        for i in range(1, self.n_iter):
            for k in range(self.out_dims):
                y, y_pred = Y[:, k], Y_pred[:, k]
                neg_grad = -1 * loss.grad(y, y_pred)

                t = DecisionTree(classifier=False, max_depth=self.max_depth, criterion="mse")
                t.fit(X, neg_grad)
                self.learners[i, k] = t

                step = 1.0
                h_pred = t.predict(X)
                if self.step_size == "adaptive":
                    step = loss.line_search(y, y_pred, h_pred)

                self.weights[i, k] *= step
                Y_pred[:, k] += self.weights[i, k] * h_pred

    def predict(self, X):
        Y_pred = np.zeros((X.shape[0], self.out_dims))
        for i in range(self.n_iter):
            for k in range(self.out_dims):
                Y_pred[:, k] += self.weights[i, k] * self.learners[i, k].predict(X)

        if self.classifier:
            Y_pred = Y_pred.argmax(axis=1)

        return Y_pred


def evaluate_model(model, X, y, classifier=True):
    y_pred = model.predict(X)

    if classifier:  
        metrics = {
            "Accuracy": accuracy_score(y, y_pred),
            "Precision": precision_score(y, y_pred, average="weighted"),
            "Recall": recall_score(y, y_pred, average="weighted"),
            "F1 Score": f1_score(y, y_pred, average="weighted"),
        }
    else:  
        mse = mean_squared_error(y, y_pred)
        r2 = r2_score(y, y_pred)
        adj_r2 = 1 - (1 - r2) * (len(y) - 1) / (len(y) - X.shape[1] - 1)
        metrics = {
            "MAE": mean_absolute_error(y, y_pred),
            "MSE": mse,
            "RMSE": np.sqrt(mse),
            "R2 Square": r2,
            "Adj R Square": adj_r2,
        }

    return pd.DataFrame(metrics.items(), columns=["Metric", "Value"])


def cross_validate(model, X, y, cv="kfold", n_splits=5, classifier=True):
    if cv == "kfold":
        cv_splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    elif cv == "stratified":
        if not classifier:
            raise ValueError("Stratified K-Fold only works for classification tasks.")
        cv_splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    elif cv == "loo":
        cv_splitter = LeaveOneOut()
    else:
        raise ValueError("Invalid cv type. Choose from 'kfold', 'stratified', 'loo'.")

    metrics_list = []
    for train_idx, val_idx in cv_splitter.split(X, y):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model.fit(X_train, y_train)
        metrics = evaluate_model(model, X_val, y_val, classifier=classifier)
        metrics_list.append(metrics.set_index("Metric")["Value"])

    avg_metrics = pd.DataFrame(metrics_list).mean(axis=0)
    return avg_metrics


if __name__ == "__main__":
    np.random.seed(42)
    X_class = np.random.randn(200, 2)
    y_class = (X_class[:, 0] + X_class[:, 1] > 0).astype(int)

    X_train, X_temp, y_train, y_temp = train_test_split(X_class, y_class, test_size=0.4, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    custom_gbm = GradientBoostedDecisionTree(
        n_iter=10, max_depth=3, classifier=True, learning_rate=0.1, loss="crossentropy"
    )
    custom_gbm.fit(X_train, y_train)

    sklearn_gbm = GradientBoostingClassifier(
        n_estimators=10, max_depth=3, learning_rate=0.1, random_state=42
    )
    sklearn_gbm.fit(X_train, y_train)

    custom_train_metrics = evaluate_model(custom_gbm, X_train, y_train, classifier=True)
    custom_val_metrics = evaluate_model(custom_gbm, X_val, y_val, classifier=True)
    custom_test_metrics = evaluate_model(custom_gbm, X_test, y_test, classifier=True)

    sklearn_train_metrics = evaluate_model(sklearn_gbm, X_train, y_train, classifier=True)
    sklearn_val_metrics = evaluate_model(sklearn_gbm, X_val, y_val, classifier=True)
    sklearn_test_metrics = evaluate_model(sklearn_gbm, X_test, y_test, classifier=True)

    print("Classification Train Metrics Comparison:")
    print(pd.concat([custom_train_metrics, sklearn_train_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))

    print("\nClassification Validation Metrics Comparison:")
    print(pd.concat([custom_val_metrics, sklearn_val_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))

    print("\nClassification Test Metrics Comparison:")
    print(pd.concat([custom_test_metrics, sklearn_test_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))

    custom_cv_metrics = cross_validate(custom_gbm, X_class, y_class, cv="stratified", n_splits=5, classifier=True)
    sklearn_cv_metrics = cross_validate(sklearn_gbm, X_class, y_class, cv="stratified", n_splits=5, classifier=True)

    print("\nClassification Cross-Validation Metrics Comparison:")
    print(pd.concat([custom_cv_metrics, sklearn_cv_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))

    X_reg = np.random.rand(200, 2) * 10
    y_reg = 3 * X_reg[:, 0] + 2 * X_reg[:, 1] + np.random.randn(200)

    X_train, X_temp, y_train, y_temp = train_test_split(X_reg, y_reg, test_size=0.4, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    custom_gbm = GradientBoostedDecisionTree(
        n_iter=10, max_depth=3, classifier=False, learning_rate=0.1, loss="mse"
    )
    custom_gbm.fit(X_train, y_train)

    sklearn_gbm = GradientBoostingRegressor(
        n_estimators=10, max_depth=3, learning_rate=0.1, random_state=42
    )
    sklearn_gbm.fit(X_train, y_train)

    custom_train_metrics = evaluate_model(custom_gbm, X_train, y_train, classifier=False)
    custom_val_metrics = evaluate_model(custom_gbm, X_val, y_val, classifier=False)
    custom_test_metrics = evaluate_model(custom_gbm, X_test, y_test, classifier=False)

    sklearn_train_metrics = evaluate_model(sklearn_gbm, X_train, y_train, classifier=False)
    sklearn_val_metrics = evaluate_model(sklearn_gbm, X_val, y_val, classifier=False)
    sklearn_test_metrics = evaluate_model(sklearn_gbm, X_test, y_test, classifier=False)

    print("\nRegression Train Metrics Comparison:")
    print(pd.concat([custom_train_metrics, sklearn_train_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))

    print("\nRegression Validation Metrics Comparison:")
    print(pd.concat([custom_val_metrics, sklearn_val_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))

    print("\nRegression Test Metrics Comparison:")
    print(pd.concat([custom_test_metrics, sklearn_test_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))

    custom_cv_metrics = cross_validate(custom_gbm, X_reg, y_reg, cv="kfold", n_splits=5, classifier=False)
    sklearn_cv_metrics = cross_validate(sklearn_gbm, X_reg, y_reg, cv="kfold", n_splits=5, classifier=False)

    print("\nRegression Cross-Validation Metrics Comparison:")
    print(pd.concat([custom_cv_metrics, sklearn_cv_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))


Classification Train Metrics Comparison:
  Custom Model           Sklearn Model          
        Metric     Value        Metric     Value
0     Accuracy  0.983333      Accuracy  0.991667
1    Precision  0.983816     Precision  0.991789
2       Recall  0.983333        Recall  0.991667
3     F1 Score  0.983296      F1 Score  0.991658

Classification Validation Metrics Comparison:
  Custom Model       Sklearn Model          
        Metric Value        Metric     Value
0     Accuracy   0.9      Accuracy  0.900000
1    Precision   0.9     Precision  0.902198
2       Recall   0.9        Recall  0.900000
3     F1 Score   0.9      F1 Score  0.898667

Classification Test Metrics Comparison:
  Custom Model           Sklearn Model      
        Metric     Value        Metric Value
0     Accuracy  0.975000      Accuracy   1.0
1    Precision  0.976389     Precision   1.0
2       Recall  0.975000        Recall   1.0
3     F1 Score  0.975079      F1 Score   1.0

Classification Cross-Validation Metr

# svm

In [22]:
! pip install cvxopt

Collecting cvxopt
  Downloading cvxopt-1.3.2-cp39-cp39-win_amd64.whl.metadata (1.4 kB)
Downloading cvxopt-1.3.2-cp39-cp39-win_amd64.whl (12.8 MB)
   ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
   ---- ----------------------------------- 1.3/12.8 MB 16.6 MB/s eta 0:00:01
   ------ --------------------------------- 2.1/12.8 MB 23.5 MB/s eta 0:00:01
   --------------------------- ------------ 8.9/12.8 MB 18.4 MB/s eta 0:00:01
   ---------------------------------------- 12.8/12.8 MB 21.2 MB/s eta 0:00:00
Installing collected packages: cvxopt
Successfully installed cvxopt-1.3.2


In [None]:
import numpy as np
from tqdm import tqdm
from sklearn.svm import SVC, SVR
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    mean_squared_error, mean_absolute_error, r2_score
)
from cvxopt import matrix, solvers
import pandas as pd
import matplotlib.pyplot as plt


def plot_clf(X, y, cls):
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))
    points = np.c_[xx.ravel(), yy.ravel()]
    Z = cls.predict(points).reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.8)
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolor="k", marker='o')
    plt.show()


class SVM:
    def __init__(self, C=1, sigma=1, epsilon=0.1, kernel="linear", task="classification", use_smo=True, max_iter=1000):
        assert kernel in ["linear", "gaussian"], "Unsupported kernel type!"
        assert task in ["classification", "regression"], "Unsupported task type!"
        self.C = C
        self.sigma = sigma
        self.epsilon = epsilon
        self.kernel_type = kernel
        self.task = task
        self.use_smo = use_smo  
        self.max_iter = max_iter  
        self.b = 0
        self.alpha = None

        # 核函数选择
        if kernel == "linear":
            self.kernel = lambda x, z: np.dot(x, z)
        elif kernel == "gaussian":
            self.kernel = lambda x, z: np.exp(-0.5 * np.linalg.norm(x - z) ** 2 / (sigma ** 2))

    def fit(self, X, y):
        self.X = X
        self.y = y
        n_samples, n_features = X.shape

        self.alpha = np.zeros(n_samples)
        self.K = np.array([[self.kernel(X[i], X[j]) for j in range(n_samples)] for i in range(n_samples)])
        self.K = np.clip(self.K, -1e5, 1e5)  
        if self.use_smo:
            max_iter = 500  
            for iter_ in range(max_iter):
                alpha_prev = np.copy(self.alpha)
                for i in range(n_samples):
                    for j in range(i + 1, n_samples):  
                        self.update(i, j)
                norm_diff = np.linalg.norm(self.alpha - alpha_prev)
                if iter_ % 10 == 0:
                    print(f"Iteration {iter_ + 1}, ||alpha - alpha_prev|| = {norm_diff:.5f}")
                if norm_diff < 1e-5:
                    print(f"Converged after {iter_ + 1} iterations.")
                    break
            else:
                print("SMO optimization did not converge within the maximum iterations.")
        else:
            P = matrix(self.K * np.outer(y, y))
            q = matrix(-np.ones(n_samples))
            G = matrix(np.vstack((-np.eye(n_samples), np.eye(n_samples))))
            h = matrix(np.hstack((np.zeros(n_samples), np.ones(n_samples) * self.C)))
            A = matrix(y.reshape(1, -1).astype("double"))
            b = matrix(np.zeros(1))

            sol = solvers.qp(P, q, G, h, A, b)
            self.alpha = np.ravel(sol["x"])

        support_vector_idx = np.where((self.alpha > 1e-4) & (self.alpha < self.C))[0]
        if len(support_vector_idx) > 0:
            self.b = np.mean([
                y[i] - np.clip(np.sum(self.alpha * y * self.K[i]), -1e5, 1e5)
                for i in support_vector_idx
            ])
        else:
            print("Warning: No support vectors found. The model might not be properly trained.")
            self.b = 0


    def update(self, i, j):
        if i == j:
            return

        a_i, a_j = self.alpha[i], self.alpha[j]

        if self.task == "classification":
            L = max(0, a_j - a_i) if self.y[i] != self.y[j] else max(0, a_i + a_j - self.C)
            H = min(self.C, self.C + a_j - a_i) if self.y[i] != self.y[j] else min(self.C, a_i + a_j)
        elif self.task == "regression":
            L = max(0, a_j - self.epsilon)
            H = min(self.C, a_j + self.epsilon)

        eta = self.K[i, i] + self.K[j, j] - 2 * self.K[i, j]
        if eta <= 1e-10:  
            return

        a_j_unc = a_j + self.y[j] * (self._E(i) - self._E(j)) / eta
        a_j_new = np.clip(a_j_unc, L, H) 

        a_i_new = a_i + self.y[i] * self.y[j] * (a_j - a_j_new)

        self.alpha[i] = a_i_new
        self.alpha[j] = a_j_new

        b_i_new = self.y[i] - np.clip(np.sum(self.alpha * self.y * self.K[i]), -1e5, 1e5)
        b_j_new = self.y[j] - np.clip(np.sum(self.alpha * self.y * self.K[j]), -1e5, 1e5)
        self.b = (b_i_new + b_j_new) / 2

    def _E(self, i):
        return self._g(i) - self.y[i]

    def _g(self, i):
        return np.clip(np.sum(self.alpha * self.y * self.K[:, i]) + self.b, -1e5, 1e5)

    def predict(self, X):
        preds = []
        for x in tqdm(X):
            pred = np.sum(
                self.alpha * self.y * np.array([self.kernel(x, self.X[j]) for j in range(len(self.X))])
            ) + self.b
            if self.task == "classification":
                preds.append(1 if pred >= 0 else -1)
            elif self.task == "regression":
                preds.append(np.clip(pred, -1e5, 1e5))
        return np.array(preds)

    def score(self, X, y):
        preds = self.predict(X)
        if self.task == "classification":
            return np.mean(preds == y)
        elif self.task == "regression":
            return r2_score(y, preds)

def evaluate_model(model, X, y, classifier=True):
    y_pred = model.predict(X)

    if classifier:  
        metrics = {
            "Accuracy": accuracy_score(y, y_pred),
            "Precision": precision_score(y, y_pred, average="weighted"),
            "Recall": recall_score(y, y_pred, average="weighted"),
            "F1 Score": f1_score(y, y_pred, average="weighted"),
        }
    else: 
        mse = mean_squared_error(y, y_pred)
        r2 = r2_score(y, y_pred)
        adj_r2 = 1 - (1 - r2) * (len(y) - 1) / (len(y) - X.shape[1] - 1)
        metrics = {
            "MAE": mean_absolute_error(y, y_pred),
            "MSE": mse,
            "RMSE": np.sqrt(mse),
            "R2 Square": r2,
            "Adj R Square": adj_r2,
        }

    return pd.DataFrame(metrics.items(), columns=["Metric", "Value"])

def cross_validate(model, X, y, cv="kfold", n_splits=5, classifier=True):
    if cv == "kfold":
        cv_splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    elif cv == "stratified":
        if not classifier:
            raise ValueError("Stratified K-Fold only works for classification tasks.")
        cv_splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    elif cv == "loo":
        cv_splitter = LeaveOneOut()
    else:
        raise ValueError("Invalid cv type. Choose from 'kfold', 'stratified', 'loo'.")

    metrics_list = []
    for train_idx, val_idx in cv_splitter.split(X, y):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model.fit(X_train, y_train)
        metrics = evaluate_model(model, X_val, y_val, classifier=classifier)
        metrics_list.append(metrics.set_index("Metric")["Value"])

    avg_metrics = pd.DataFrame(metrics_list).mean(axis=0)
    return avg_metrics


In [None]:
if __name__ == "__main__":
    np.random.seed(42)
    X_class = np.random.randn(200, 2)
    y_class = (X_class[:, 0] + X_class[:, 1] > 0).astype(int) * 2 - 1  

    X_train, X_temp, y_train, y_temp = train_test_split(X_class, y_class, test_size=0.4, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    custom_svm_smo = SVM(C=1, kernel="linear", task="classification", use_smo=True)
    custom_svm_smo.fit(X_train, y_train)

    custom_svm_no_smo = SVM(C=1, kernel="linear", task="classification", use_smo=False)
    custom_svm_no_smo.fit(X_train, y_train)

    sklearn_svm = SVC(C=1, kernel="linear", random_state=42)
    sklearn_svm.fit(X_train, y_train)

    custom_train_metrics_smo = evaluate_model(custom_svm_smo, X_train, y_train, classifier=True)
    custom_val_metrics_smo = evaluate_model(custom_svm_smo, X_val, y_val, classifier=True)
    custom_test_metrics_smo = evaluate_model(custom_svm_smo, X_test, y_test, classifier=True)

    custom_train_metrics_no_smo = evaluate_model(custom_svm_no_smo, X_train, y_train, classifier=True)
    custom_val_metrics_no_smo = evaluate_model(custom_svm_no_smo, X_val, y_val, classifier=True)
    custom_test_metrics_no_smo = evaluate_model(custom_svm_no_smo, X_test, y_test, classifier=True)

    sklearn_train_metrics = evaluate_model(sklearn_svm, X_train, y_train, classifier=True)
    sklearn_val_metrics = evaluate_model(sklearn_svm, X_val, y_val, classifier=True)
    sklearn_test_metrics = evaluate_model(sklearn_svm, X_test, y_test, classifier=True)

    print("Classification Train Metrics Comparison:")
    print(pd.concat(
        [custom_train_metrics_smo, custom_train_metrics_no_smo, sklearn_train_metrics],
        axis=1,
        keys=["Custom SVM (SMO)", "Custom SVM (No SMO)", "Sklearn SVM"]
    ))

    print("\nClassification Validation Metrics Comparison:")
    print(pd.concat(
        [custom_val_metrics_smo, custom_val_metrics_no_smo, sklearn_val_metrics],
        axis=1,
        keys=["Custom SVM (SMO)", "Custom SVM (No SMO)", "Sklearn SVM"]
    ))

    print("\nClassification Test Metrics Comparison:")
    print(pd.concat(
        [custom_test_metrics_smo, custom_test_metrics_no_smo, sklearn_test_metrics],
        axis=1,
        keys=["Custom SVM (SMO)", "Custom SVM (No SMO)", "Sklearn SVM"]
    ))

    custom_cv_metrics_smo = cross_validate(custom_svm_smo, X_class, y_class, cv="stratified", n_splits=5, classifier=True)
    custom_cv_metrics_no_smo = cross_validate(custom_svm_no_smo, X_class, y_class, cv="stratified", n_splits=5, classifier=True)
    sklearn_cv_metrics = cross_validate(sklearn_svm, X_class, y_class, cv="stratified", n_splits=5, classifier=True)

    print("\nClassification Cross-Validation Metrics Comparison:")
    print(pd.concat(
        [custom_cv_metrics_smo, custom_cv_metrics_no_smo, sklearn_cv_metrics],
        axis=1,
        keys=["Custom SVM (SMO)", "Custom SVM (No SMO)", "Sklearn SVM"]
    ))


Converged after 34 iterations.
     pcost       dcost       gap    pres   dres
 0: -4.1390e+01 -3.5996e+02  2e+03  4e+00  2e-15
 1: -2.3993e+01 -2.4848e+02  4e+02  5e-01  2e-15
 2: -9.9620e+00 -6.3950e+01  7e+01  4e-02  1e-14
 3: -1.2894e+01 -2.3000e+01  1e+01  6e-03  3e-15
 4: -1.5814e+01 -1.8478e+01  3e+00  1e-03  1e-15
 5: -1.6600e+01 -1.7261e+01  7e-01  3e-04  1e-15
 6: -1.6823e+01 -1.6985e+01  2e-01  5e-05  1e-15
 7: -1.6884e+01 -1.6909e+01  2e-02  5e-07  1e-15
 8: -1.6896e+01 -1.6897e+01  2e-03  2e-08  2e-15
 9: -1.6896e+01 -1.6896e+01  2e-05  2e-10  2e-15
10: -1.6896e+01 -1.6896e+01  2e-07  2e-12  1e-15
Optimal solution found.


100%|██████████| 120/120 [00:00<00:00, 12620.14it/s]
100%|██████████| 40/40 [00:00<00:00, 9879.41it/s]
100%|██████████| 40/40 [00:00<00:00, 10106.76it/s]
100%|██████████| 120/120 [00:00<00:00, 14026.21it/s]
100%|██████████| 40/40 [00:00<00:00, 13366.17it/s]
100%|██████████| 40/40 [00:00<00:00, 7832.13it/s]

Classification Train Metrics Comparison:
  Custom SVM (SMO)       Custom SVM (No SMO)       Sklearn SVM      
            Metric Value              Metric Value      Metric Value
0         Accuracy   1.0            Accuracy   1.0    Accuracy   1.0
1        Precision   1.0           Precision   1.0   Precision   1.0
2           Recall   1.0              Recall   1.0      Recall   1.0
3         F1 Score   1.0            F1 Score   1.0    F1 Score   1.0

Classification Validation Metrics Comparison:
  Custom SVM (SMO)       Custom SVM (No SMO)       Sklearn SVM      
            Metric Value              Metric Value      Metric Value
0         Accuracy   1.0            Accuracy   1.0    Accuracy   1.0
1        Precision   1.0           Precision   1.0   Precision   1.0
2           Recall   1.0              Recall   1.0      Recall   1.0
3         F1 Score   1.0            F1 Score   1.0    F1 Score   1.0

Classification Test Metrics Comparison:
  Custom SVM (SMO)       Custom SVM (No SMO




Converged after 55 iterations.


100%|██████████| 40/40 [00:00<?, ?it/s]


Converged after 27 iterations.


100%|██████████| 40/40 [00:00<?, ?it/s]


Converged after 10 iterations.


100%|██████████| 40/40 [00:00<00:00, 7788.87it/s]


Converged after 16 iterations.


100%|██████████| 40/40 [00:00<?, ?it/s]


Converged after 22 iterations.


100%|██████████| 40/40 [00:00<00:00, 3154.26it/s]


     pcost       dcost       gap    pres   dres
 0: -5.4595e+01 -4.4957e+02  3e+03  3e+00  4e-15
 1: -3.2549e+01 -3.0316e+02  5e+02  5e-01  3e-15
 2: -1.4558e+01 -8.0704e+01  9e+01  5e-02  6e-15
 3: -1.6280e+01 -2.8960e+01  1e+01  8e-03  2e-15
 4: -1.9605e+01 -2.2834e+01  4e+00  1e-03  1e-15
 5: -2.0559e+01 -2.1366e+01  8e-01  1e-04  2e-15
 6: -2.0732e+01 -2.1144e+01  4e-01  2e-05  1e-15
 7: -2.0914e+01 -2.0928e+01  1e-02  7e-07  2e-15
 8: -2.0920e+01 -2.0922e+01  3e-03  9e-08  2e-15
 9: -2.0921e+01 -2.0921e+01  3e-05  1e-09  2e-15
10: -2.0921e+01 -2.0921e+01  3e-07  1e-11  2e-15
Optimal solution found.


100%|██████████| 40/40 [00:00<?, ?it/s]


     pcost       dcost       gap    pres   dres
 0: -5.6588e+01 -4.8320e+02  3e+03  4e+00  4e-15
 1: -3.3646e+01 -3.3469e+02  6e+02  5e-01  3e-15
 2: -1.5434e+01 -9.5720e+01  1e+02  6e-02  8e-15
 3: -1.6467e+01 -3.0471e+01  2e+01  8e-03  3e-15
 4: -2.0051e+01 -2.4332e+01  5e+00  2e-03  1e-15
 5: -2.0895e+01 -2.3035e+01  2e+00  8e-04  1e-15
 6: -2.1568e+01 -2.2039e+01  5e-01  1e-04  1e-15
 7: -2.1735e+01 -2.1807e+01  8e-02  1e-05  2e-15
 8: -2.1766e+01 -2.1770e+01  4e-03  2e-07  1e-15
 9: -2.1768e+01 -2.1768e+01  4e-05  2e-09  1e-15
10: -2.1768e+01 -2.1768e+01  4e-07  2e-11  2e-15
Optimal solution found.


100%|██████████| 40/40 [00:00<?, ?it/s]

     pcost       dcost       gap    pres   dres
 0: -5.5817e+01 -4.7370e+02  3e+03  4e+00  5e-15
 1: -3.2566e+01 -3.2497e+02  6e+02  5e-01  3e-15
 2: -1.4628e+01 -9.1021e+01  1e+02  6e-02  5e-15
 3: -1.4915e+01 -2.8513e+01  2e+01  8e-03  2e-15
 4: -1.8227e+01 -2.1840e+01  4e+00  2e-03  1e-15
 5: -1.9121e+01 -2.0494e+01  2e+00  6e-04  1e-15
 6: -1.9500e+01 -1.9975e+01  5e-01  2e-04  1e-15
 7: -1.9669e+01 -1.9742e+01  8e-02  1e-05  1e-15





 8: -1.9702e+01 -1.9704e+01  1e-03  2e-07  1e-15
 9: -1.9703e+01 -1.9703e+01  1e-05  2e-09  2e-15
Optimal solution found.


100%|██████████| 40/40 [00:00<?, ?it/s]

     pcost       dcost       gap    pres   dres





 0: -5.9853e+01 -4.9033e+02  3e+03  4e+00  4e-15
 1: -3.5136e+01 -3.4304e+02  6e+02  5e-01  3e-15
 2: -1.5090e+01 -1.0523e+02  1e+02  7e-02  6e-15
 3: -1.5423e+01 -3.2510e+01  2e+01  1e-02  3e-15
 4: -1.9168e+01 -2.4903e+01  7e+00  3e-03  2e-15
 5: -2.0556e+01 -2.2563e+01  2e+00  8e-04  1e-15
 6: -2.1015e+01 -2.1869e+01  9e-01  3e-04  1e-15
 7: -2.1259e+01 -2.1502e+01  3e-01  7e-05  1e-15
 8: -2.1357e+01 -2.1369e+01  1e-02  2e-06  2e-15
 9: -2.1362e+01 -2.1363e+01  1e-04  2e-08  2e-15
10: -2.1362e+01 -2.1362e+01  1e-06  2e-10  2e-15
Optimal solution found.


100%|██████████| 40/40 [00:00<?, ?it/s]

     pcost       dcost       gap    pres   dres





 0: -5.4818e+01 -4.8015e+02  3e+03  4e+00  4e-15
 1: -3.1900e+01 -3.3120e+02  6e+02  5e-01  3e-15
 2: -1.3807e+01 -9.7501e+01  1e+02  7e-02  6e-15
 3: -1.3406e+01 -2.9663e+01  2e+01  1e-02  2e-15
 4: -1.6683e+01 -2.1728e+01  6e+00  3e-03  1e-15
 5: -1.7924e+01 -1.9585e+01  2e+00  6e-04  1e-15
 6: -1.8292e+01 -1.8956e+01  7e-01  2e-04  1e-15
 7: -1.8513e+01 -1.8660e+01  2e-01  4e-05  1e-15
 8: -1.8570e+01 -1.8582e+01  1e-02  7e-16  1e-15
 9: -1.8576e+01 -1.8576e+01  2e-04  7e-16  2e-15
10: -1.8576e+01 -1.8576e+01  2e-06  1e-16  2e-15
Optimal solution found.


100%|██████████| 40/40 [00:00<00:00, 9834.24it/s]



Classification Cross-Validation Metrics Comparison:
           Custom SVM (SMO)  Custom SVM (No SMO)  Sklearn SVM
Metric                                                       
Accuracy           0.995000                  1.0     0.995000
Precision          0.995238                  1.0     0.995238
Recall             0.995000                  1.0     0.995000
F1 Score           0.994997                  1.0     0.994997


  a_j_unc = a_j + self.y[j] * (self._E(i) - self._E(j)) / eta
  b_i_new = self.y[i] - np.sum(self.alpha * self.y * self.K[i])
  b_j_new = self.y[j] - np.sum(self.alpha * self.y * self.K[j])
  return np.sum(self.alpha * self.y * self.K[:, i]) + self.b
  return np.sum(self.alpha * self.y * self.K[:, i]) + self.b


KeyboardInterrupt: 

In [None]:
if __name__ == "__main__":
    X_reg = np.random.rand(200, 2) * 10
    y_reg = 3 * X_reg[:, 0] + 2 * X_reg[:, 1] + np.random.randn(200)

    from sklearn.preprocessing import StandardScaler
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_reg = scaler_X.fit_transform(X_reg)
    y_reg = scaler_y.fit_transform(y_reg.reshape(-1, 1)).ravel()

    X_train, X_temp, y_train, y_temp = train_test_split(X_reg, y_reg, test_size=0.4, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    custom_svr_smo = SVM(C=1, kernel="linear", task="regression", use_smo=True, max_iter=500)
    custom_svr_smo.fit(X_train, y_train)

    custom_svr_no_smo = SVM(C=1, kernel="linear", task="regression", use_smo=False)
    custom_svr_no_smo.fit(X_train, y_train)

    sklearn_svr = SVR(C=1, kernel="linear")
    sklearn_svr.fit(X_train, y_train)

    custom_train_metrics_smo = evaluate_model(custom_svr_smo, X_train, y_train, classifier=False)
    custom_val_metrics_smo = evaluate_model(custom_svr_smo, X_val, y_val, classifier=False)
    custom_test_metrics_smo = evaluate_model(custom_svr_smo, X_test, y_test, classifier=False)

    custom_train_metrics_no_smo = evaluate_model(custom_svr_no_smo, X_train, y_train, classifier=False)
    custom_val_metrics_no_smo = evaluate_model(custom_svr_no_smo, X_val, y_val, classifier=False)
    custom_test_metrics_no_smo = evaluate_model(custom_svr_no_smo, X_test, y_test, classifier=False)

    sklearn_train_metrics = evaluate_model(sklearn_svr, X_train, y_train, classifier=False)
    sklearn_val_metrics = evaluate_model(sklearn_svr, X_val, y_val, classifier=False)
    sklearn_test_metrics = evaluate_model(sklearn_svr, X_test, y_test, classifier=False)

    print("\nRegression Train Metrics Comparison:")
    print(pd.concat(
        [custom_train_metrics_smo, custom_train_metrics_no_smo, sklearn_train_metrics],
        axis=1,
        keys=["Custom SVR (SMO)", "Custom SVR (No SMO)", "Sklearn SVR"]
    ))

    print("\nRegression Validation Metrics Comparison:")
    print(pd.concat(
        [custom_val_metrics_smo, custom_val_metrics_no_smo, sklearn_val_metrics],
        axis=1,
        keys=["Custom SVR (SMO)", "Custom SVR (No SMO)", "Sklearn SVR"]
    ))

    print("\nRegression Test Metrics Comparison:")
    print(pd.concat(
        [custom_test_metrics_smo, custom_test_metrics_no_smo, sklearn_test_metrics],
        axis=1,
        keys=["Custom SVR (SMO)", "Custom SVR (No SMO)", "Sklearn SVR"]
    ))

    custom_cv_metrics_smo = cross_validate(custom_svr_smo, X_reg, y_reg, cv="kfold", n_splits=5, classifier=False)
    custom_cv_metrics_no_smo = cross_validate(custom_svr_no_smo, X_reg, y_reg, cv="kfold", n_splits=5, classifier=False)
    sklearn_cv_metrics = cross_validate(sklearn_svr, X_reg, y_reg, cv="kfold", n_splits=5, classifier=False)

    print("\nRegression Cross-Validation Metrics Comparison:")
    print(pd.concat(
        [custom_cv_metrics_smo, custom_cv_metrics_no_smo, sklearn_cv_metrics],
        axis=1,
        keys=["Custom SVR (SMO)", "Custom SVR (No SMO)", "Sklearn SVR"]
    ))


Iteration 1, ||alpha - alpha_prev|| = 9.37166
Iteration 11, ||alpha - alpha_prev|| = 2.26468
Iteration 21, ||alpha - alpha_prev|| = 3.05084
Iteration 31, ||alpha - alpha_prev|| = 3.87264
Iteration 41, ||alpha - alpha_prev|| = 5.08330
Iteration 51, ||alpha - alpha_prev|| = 3.91108
Iteration 61, ||alpha - alpha_prev|| = 5.38587
Iteration 71, ||alpha - alpha_prev|| = 5.58920
Iteration 81, ||alpha - alpha_prev|| = 4.70576
Iteration 91, ||alpha - alpha_prev|| = 5.31358
Iteration 101, ||alpha - alpha_prev|| = 4.81138
Iteration 111, ||alpha - alpha_prev|| = 5.89473
Iteration 121, ||alpha - alpha_prev|| = 5.20858
Iteration 131, ||alpha - alpha_prev|| = 4.73818
Iteration 141, ||alpha - alpha_prev|| = 4.21163
Iteration 151, ||alpha - alpha_prev|| = 4.56944
Iteration 161, ||alpha - alpha_prev|| = 5.71511
Iteration 171, ||alpha - alpha_prev|| = 6.15716
Iteration 181, ||alpha - alpha_prev|| = 7.49054
Iteration 191, ||alpha - alpha_prev|| = 5.32540
Iteration 201, ||alpha - alpha_prev|| = 6.69030
Ite

100%|██████████| 120/120 [00:00<00:00, 10152.01it/s]
100%|██████████| 40/40 [00:00<00:00, 7926.49it/s]
100%|██████████| 40/40 [00:00<00:00, 7836.52it/s]
100%|██████████| 120/120 [00:00<00:00, 7626.12it/s]
100%|██████████| 40/40 [00:00<00:00, 7025.05it/s]
100%|██████████| 40/40 [00:00<00:00, 9194.51it/s]


Regression Train Metrics Comparison:
  Custom SVR (SMO)           Custom SVR (No SMO)             Sklearn SVR  \
            Metric     Value              Metric     Value        Metric   
0              MAE  0.071538                 MAE  2.482207           MAE   
1              MSE  0.008375                 MSE  8.690162           MSE   
2             RMSE  0.091514                RMSE  2.947908          RMSE   
3        R2 Square  0.991286           R2 Square -8.042367     R2 Square   
4     Adj R Square  0.991137        Adj R Square -8.196937  Adj R Square   

             
      Value  
0  0.072590  
1  0.007969  
2  0.089271  
3  0.991708  
4  0.991566  

Regression Validation Metrics Comparison:
  Custom SVR (SMO)           Custom SVR (No SMO)             Sklearn SVR  \
            Metric     Value              Metric     Value        Metric   
0              MAE  0.085649                 MAE  2.792963           MAE   
1              MSE  0.011859                 MSE  9.958371  




Iteration 1, ||alpha - alpha_prev|| = 14.04790
Iteration 11, ||alpha - alpha_prev|| = 4.11689
Iteration 21, ||alpha - alpha_prev|| = 4.11936
Iteration 31, ||alpha - alpha_prev|| = 4.14979
Iteration 41, ||alpha - alpha_prev|| = 4.09225
Iteration 51, ||alpha - alpha_prev|| = 3.83478
Iteration 61, ||alpha - alpha_prev|| = 6.30479
Iteration 71, ||alpha - alpha_prev|| = 4.33141
Iteration 81, ||alpha - alpha_prev|| = 3.68549
Iteration 91, ||alpha - alpha_prev|| = 4.29734
Iteration 101, ||alpha - alpha_prev|| = 7.05691
Iteration 111, ||alpha - alpha_prev|| = 7.04227
Iteration 121, ||alpha - alpha_prev|| = 3.68784
Iteration 131, ||alpha - alpha_prev|| = 4.84784
Iteration 141, ||alpha - alpha_prev|| = 3.89674
Iteration 151, ||alpha - alpha_prev|| = 4.14617
Iteration 161, ||alpha - alpha_prev|| = 3.59313
Iteration 171, ||alpha - alpha_prev|| = 3.72650
Iteration 181, ||alpha - alpha_prev|| = 3.56293
Iteration 191, ||alpha - alpha_prev|| = 4.21788
Iteration 201, ||alpha - alpha_prev|| = 4.99892
It

100%|██████████| 40/40 [00:00<00:00, 11402.99it/s]


Iteration 1, ||alpha - alpha_prev|| = 14.38552
Iteration 11, ||alpha - alpha_prev|| = 6.49771
Iteration 21, ||alpha - alpha_prev|| = 6.72689
Iteration 31, ||alpha - alpha_prev|| = 5.02412
Iteration 41, ||alpha - alpha_prev|| = 7.01209
Iteration 51, ||alpha - alpha_prev|| = 11.74594
Iteration 61, ||alpha - alpha_prev|| = 5.86990
Iteration 71, ||alpha - alpha_prev|| = 9.56326
Iteration 81, ||alpha - alpha_prev|| = 5.41636
Iteration 91, ||alpha - alpha_prev|| = 7.02411
Iteration 101, ||alpha - alpha_prev|| = 4.94376
Iteration 111, ||alpha - alpha_prev|| = 5.00339
Iteration 121, ||alpha - alpha_prev|| = 4.71899
Iteration 131, ||alpha - alpha_prev|| = 6.95027
Iteration 141, ||alpha - alpha_prev|| = 4.87588
Iteration 151, ||alpha - alpha_prev|| = 7.31635
Iteration 161, ||alpha - alpha_prev|| = 5.68485
Iteration 171, ||alpha - alpha_prev|| = 6.19402
Iteration 181, ||alpha - alpha_prev|| = 5.49277
Iteration 191, ||alpha - alpha_prev|| = 4.49468
Iteration 201, ||alpha - alpha_prev|| = 9.43565
I

100%|██████████| 40/40 [00:00<00:00, 7775.87it/s]


Iteration 1, ||alpha - alpha_prev|| = 14.05749
Iteration 11, ||alpha - alpha_prev|| = 6.92034
Iteration 21, ||alpha - alpha_prev|| = 5.40077
Iteration 31, ||alpha - alpha_prev|| = 5.07370
Iteration 41, ||alpha - alpha_prev|| = 4.33270
Iteration 51, ||alpha - alpha_prev|| = 5.36851
Iteration 61, ||alpha - alpha_prev|| = 4.78959
Iteration 71, ||alpha - alpha_prev|| = 5.51178
Iteration 81, ||alpha - alpha_prev|| = 7.82686
Iteration 91, ||alpha - alpha_prev|| = 10.00042
Iteration 101, ||alpha - alpha_prev|| = 10.12495
Iteration 111, ||alpha - alpha_prev|| = 11.17680
Iteration 121, ||alpha - alpha_prev|| = 8.85500
Iteration 131, ||alpha - alpha_prev|| = 8.50228
Iteration 141, ||alpha - alpha_prev|| = 9.28632
Iteration 151, ||alpha - alpha_prev|| = 9.20441
Iteration 161, ||alpha - alpha_prev|| = 11.40853
Iteration 171, ||alpha - alpha_prev|| = 10.81222
Iteration 181, ||alpha - alpha_prev|| = 10.09366
Iteration 191, ||alpha - alpha_prev|| = 10.55535
Iteration 201, ||alpha - alpha_prev|| = 9.6

100%|██████████| 40/40 [00:00<00:00, 5916.22it/s]


Iteration 1, ||alpha - alpha_prev|| = 13.62204
Iteration 11, ||alpha - alpha_prev|| = 6.10645
Iteration 21, ||alpha - alpha_prev|| = 6.36909
Iteration 31, ||alpha - alpha_prev|| = 6.10199
Iteration 41, ||alpha - alpha_prev|| = 5.06759


KeyboardInterrupt: 

## adaboost

In [None]:
from sklearn.datasets import load_breast_cancer, make_regression
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    mean_squared_error,
    mean_absolute_error,
    r2_score,
)
import numpy as np
from tqdm import tqdm
import pandas as pd


from sklearn.metrics import accuracy_score, r2_score
import numpy as np
from tqdm import tqdm


class MyAdaboost:
    def __init__(self, n_estimators=50, task="classification"):
        assert task in ["classification", "regression"], "Invalid task type! Choose 'classification' or 'regression'."
        self.n_estimators = n_estimators
        self.task = task
        self.clfs = []
        self.alphas = []
        self.weights = None

    def _G(self, fi, fv, direct):
        assert direct in ["positive", "negative"], "Direction must be 'positive' or 'negative'."

        def _g(X):
            if direct == "positive":
                return np.where(X[:, fi] <= fv, -1, 1)
            else:
                return np.where(X[:, fi] > fv, -1, 1)

        return _g

    def _best_split(self, X, y, w):
        best_err = float('inf')
        best_fi, best_fv, best_direct = None, None, None

        for fi in range(X.shape[1]):
            series = X[:, fi]
            unique_values = np.unique(series)

            if len(unique_values) == 1:  
                continue

            for fv in unique_values:
                predict = np.where(series <= fv, -1, 1)
                err = np.sum(w * (predict != y))

                if err < best_err:
                    best_err, best_fi, best_fv, best_direct = err, fi, fv, "positive"

                predict = -predict
                err = np.sum(w * (predict != y))

                if err < best_err:
                    best_err, best_fi, best_fv, best_direct = err, fi, fv, "negative"

        if best_fv is None:
            raise ValueError("Failed to find a valid split point.")

        return best_err, best_fi, best_fv, best_direct

    def fit(self, X_train, y_train):
        n_samples = len(y_train)
        self.weights = np.ones(n_samples) / n_samples  
        self.alphas = []  
        self.clfs = []   

        for _ in tqdm(range(self.n_estimators), desc="Training AdaBoost"):
            err, fi, fv, direct = self._best_split(X_train, y_train, self.weights)

            if err == 0:
                alpha = 1  
            else:
                alpha = 0.5 * np.log((1 - err) / max(err, 1e-10))  

            self.alphas.append(alpha)
            self.clfs.append(self._G(fi, fv, direct)) 

            predictions = self.clfs[-1](X_train)
            if self.task == "classification":
                self.weights *= np.exp(-alpha * y_train * predictions)
            elif self.task == "regression":
                residuals = y_train - predictions
                self.weights *= np.exp(-alpha * np.abs(residuals))

            self.weights /= np.sum(self.weights)  

            if err == 0:
                print(f"Converged early at iteration {len(self.alphas)}")
                break

    def predict(self, X_test):
        y_p = np.sum([self.alphas[i] * self.clfs[i](X_test) for i in range(len(self.clfs))], axis=0)

        if self.task == "classification":
            return np.sign(y_p)
        elif self.task == "regression":
            return y_p

    def score(self, X_test, y_test):
        y_pred = self.predict(X_test)
        if self.task == "classification":
            return accuracy_score(y_test, y_pred)
        elif self.task == "regression":
            return r2_score(y_test, y_pred)


def evaluate_model(model, X, y, classifier=True):
    y_pred = model.predict(X)

    if classifier: 
        metrics = {
            "Accuracy": accuracy_score(y, y_pred),
            "Precision": precision_score(y, y_pred, average="weighted"),
            "Recall": recall_score(y, y_pred, average="weighted"),
            "F1 Score": f1_score(y, y_pred, average="weighted"),
        }
    else: 
        mse = mean_squared_error(y, y_pred)
        r2 = r2_score(y, y_pred)
        adj_r2 = 1 - (1 - r2) * (len(y) - 1) / (len(y) - X.shape[1] - 1)
        metrics = {
            "MAE": mean_absolute_error(y, y_pred),
            "MSE": mse,
            "RMSE": np.sqrt(mse),
            "R2 Square": r2,
            "Adj R Square": adj_r2,
        }

    return pd.DataFrame(metrics.items(), columns=["Metric", "Value"])


def cross_validate(model, X, y, cv="kfold", n_splits=5, classifier=True):
    if cv == "kfold":
        cv_splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    elif cv == "stratified":
        if not classifier:
            raise ValueError("Stratified K-Fold only works for classification tasks.")
        cv_splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    else:
        raise ValueError("Invalid cv type. Choose from 'kfold' or 'stratified'.")

    metrics_list = []
    for train_idx, val_idx in cv_splitter.split(X, y):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model.fit(X_train, y_train)
        metrics = evaluate_model(model, X_val, y_val, classifier=classifier)
        metrics_list.append(metrics.set_index("Metric")["Value"])

    avg_metrics = pd.DataFrame(metrics_list).mean(axis=0)
    return avg_metrics




In [None]:
if __name__ == "__main__":
    breast_cancer = load_breast_cancer()
    X_class = breast_cancer.data
    y_class = np.where(breast_cancer.target == 0, -1, 1)

    X_train, X_test, y_train, y_test = train_test_split(X_class, y_class, test_size=0.3, random_state=42)

    custom_clf = MyAdaboost(n_estimators=50, task="classification")
    custom_clf.fit(X_train, y_train)

    sklearn_clf = AdaBoostClassifier(n_estimators=50, random_state=42)
    sklearn_clf.fit(X_train, y_train)

    custom_train_metrics = evaluate_model(custom_clf, X_train, y_train, classifier=True)
    sklearn_train_metrics = evaluate_model(sklearn_clf, X_train, y_train, classifier=True)

    print("Classification Train Metrics Comparison:")
    print(pd.concat([custom_train_metrics, sklearn_train_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))

    # X_reg, y_reg = make_regression(n_samples=200, n_features=2, noise=10, random_state=42)
    # X_train, X_test, y_train, y_test = train_test_split(X_reg, y_reg, test_size=0.3, random_state=42)

    # custom_reg = MyAdaboost(n_estimators=50, task="regression")
    # custom_reg.fit(X_train, y_train)

    # sklearn_reg = AdaBoostRegressor(n_estimators=50, random_state=42)
    # sklearn_reg.fit(X_train, y_train)

    # custom_train_metrics = evaluate_model(custom_reg, X_train, y_train, classifier=False)
    # sklearn_train_metrics = evaluate_model(sklearn_reg, X_train, y_train, classifier=False)

    # print("\nRegression Train Metrics Comparison:")
    # print(pd.concat([custom_train_metrics, sklearn_train_metrics], axis=1, keys=["Custom Model", "Sklearn Model"]))

Training AdaBoost: 100%|██████████| 50/50 [00:05<00:00,  9.56it/s]


Classification Train Metrics Comparison:
  Custom Model       Sklearn Model      
        Metric Value        Metric Value
0     Accuracy   1.0      Accuracy   1.0
1    Precision   1.0     Precision   1.0
2       Recall   1.0        Recall   1.0
3     F1 Score   1.0      F1 Score   1.0
