# In[1]:
import numpy as np
from collections import Counter

## Logistic Regression

# In[3]:
class LogisticRegression():
    """Binary logistic-regression classifier trained with batch gradient descent.

    Attributes:
        lr: Step size used for every gradient update.
        n_iter: Number of full passes of gradient descent.
        weights: 1-D array of feature weights, ``None`` until fitted.
        bias: Scalar intercept, ``None`` until fitted.
    """

    def __init__(self, lr, n_iter):
        """Store the hyper-parameters.

        Args:
            lr: Learning rate; must be strictly positive.
            n_iter: Number of gradient-descent iterations; positive integer.

        Raises:
            ValueError: If either hyper-parameter is invalid.
        """
        if lr <= 0:
            raise ValueError("Learning rate must be positive.")
        # Type check first so a non-numeric n_iter yields ValueError rather
        # than a TypeError from the comparison.
        if not isinstance(n_iter, int) or n_iter <= 0:
            raise ValueError("Number of iterations must be a positive integer.")

        self.lr = lr
        self.n_iter = n_iter
        self.weights = None
        self.bias = None

    def sigmoid(self, r):
        """Return the logistic function ``1 / (1 + exp(-r))`` element-wise.

        Computed as ``exp(-log(1 + exp(-r)))`` via ``np.logaddexp`` so that
        large-magnitude inputs never overflow ``exp``.
        """
        return np.exp(-np.logaddexp(0, -r))

    def fit_model(self, X, y):
        """Learn weights and bias from the training data.

        Args:
            X: 2-D numpy array of shape (n_samples, n_features).
            y: numpy array of length n_samples holding the targets.

        Raises:
            ValueError: If the inputs are not numpy arrays, ``X`` is not 2-D,
                or the sample counts disagree.
        """
        # Check if X and y are numpy arrays
        if not isinstance(X, np.ndarray):
            raise ValueError("Input features X must be a numpy array.")
        if not isinstance(y, np.ndarray):
            raise ValueError("Target variable y must be a numpy array.")
        if X.ndim != 2:
            raise ValueError("Input features X must be a 2-D array.")

        # Check if X and y dimensions match
        if X.shape[0] != y.shape[0]:
            raise ValueError(f"Number of samples in X ({X.shape[0]}) does not match length of y ({y.shape[0]}).")

        # Initialize weights and bias
        n_samples, n_features = X.shape
        self.weights = np.random.rand(n_features)
        self.bias = 0

        # Start training iterations
        for _ in range(self.n_iter):
            y_pred = self.sigmoid(np.dot(X, self.weights) + self.bias)

            # Gradient of the mean log-loss w.r.t. weights and bias
            error = y_pred - y
            delw = np.dot(X.T, error) / n_samples
            delb = np.sum(error) / n_samples

            # Update weights and bias
            self.weights -= self.lr * delw
            self.bias -= self.lr * delb

    def predict_class(self, X):
        """Return a list of 0/1 class labels for the rows of ``X``.

        A sample is labelled 1 when its predicted probability exceeds 0.5.

        Raises:
            ValueError: If the model is not fitted, ``X`` is not a 2-D numpy
                array, or its feature count differs from the training data.
        """
        if self.weights is None:
            raise ValueError("Model has not been fitted yet; call fit_model first.")
        if not isinstance(X, np.ndarray):
            raise ValueError("Input features X must be a numpy array.")
        if X.ndim != 2:
            raise ValueError("Input features X must be a 2-D array.")
        if X.shape[1] != self.weights.shape[0]:
            raise ValueError(f"Number of features in X ({X.shape[1]}) does not match the number of features in the model ({self.weights.shape[0]}).")

        y_pred = self.sigmoid(np.dot(X, self.weights) + self.bias)
        return (y_pred > 0.5).astype(int).tolist()

## Naive Bayes

# In[5]:
class NaiveBayes:
    """Gaussian naive Bayes classifier.

    Attributes:
        class_probs: Prior probability of each class, ``None`` until fitted.
        mean: Per-class, per-feature mean, shape (n_classes, n_features).
        var: Per-class, per-feature variance, same shape as ``mean``.
        classes: Sorted unique labels seen during ``fit``.
    """

    # Lower bound applied to variances so constant features do not cause
    # division by zero.
    _MIN_VAR = 1e-9

    def __init__(self):
        self.class_probs = None
        self.mean = None
        self.var = None
        self.classes = None

    def fit(self, X, y):
        """Estimate class priors and per-feature Gaussian parameters.

        Args:
            X: 2-D numpy array of shape (n_samples, n_features).
            y: numpy array of length n_samples holding class labels.

        Raises:
            ValueError: If the inputs are not numpy arrays, ``X`` is not 2-D,
                the sample counts disagree, or ``y`` has fewer than two
                distinct classes.
        """
        if not isinstance(X, np.ndarray):
            raise ValueError("Input features X must be a numpy array.")
        if not isinstance(y, np.ndarray):
            raise ValueError("Target variable y must be a numpy array.")
        if X.ndim != 2:
            raise ValueError("Input features X must be a 2-D array.")

        if X.shape[0] != y.shape[0]:
            raise ValueError(f"Number of samples in X ({X.shape[0]}) does not match length of y ({y.shape[0]}).")

        self.classes = np.unique(y)
        if len(self.classes) < 2:
            raise ValueError("The target variable y must have at least two classes.")

        n_samples, n_features = X.shape
        n_classes = len(self.classes)

        self.mean = np.zeros((n_classes, n_features))
        self.var = np.zeros((n_classes, n_features))
        self.class_probs = np.zeros(n_classes)

        for i, label in enumerate(self.classes):
            X_class = X[y == label]
            self.mean[i, :] = X_class.mean(axis=0)
            self.var[i, :] = X_class.var(axis=0)
            self.class_probs[i] = X_class.shape[0] / n_samples

    def predict(self, X):
        """Return the most probable class label for each row of ``X``.

        Raises:
            ValueError: If the model is not fitted, ``X`` is not a 2-D numpy
                array, or its feature count differs from the training data.
        """
        if self.mean is None:
            raise ValueError("Model has not been fitted yet; call fit first.")
        if not isinstance(X, np.ndarray):
            raise ValueError("Input features X must be a numpy array.")
        if X.ndim != 2:
            raise ValueError("Input features X must be a 2-D array.")
        if X.shape[1] != self.mean.shape[1]:
            raise ValueError(f"Number of features in X ({X.shape[1]}) does not match the number of features in the model ({self.mean.shape[1]}).")

        predictions = [self._predict_sample(sample) for sample in X]
        return np.array(predictions)

    def _predict_sample(self, sample):
        """Return the label with the highest log-posterior for one sample."""
        log_posteriors = [
            np.log(self.class_probs[i]) + np.sum(self._log_pdf(i, sample))
            for i in range(len(self.classes))
        ]
        return self.classes[np.argmax(log_posteriors)]

    def _log_pdf(self, class_idx, sample):
        """Return the per-feature Gaussian log-density of ``sample``.

        Working in log space avoids taking ``log`` of a density that has
        underflowed to zero, which would make every class score ``-inf`` for
        samples far from all class means.
        """
        mean = self.mean[class_idx]
        var = np.maximum(self.var[class_idx], self._MIN_VAR)
        return -0.5 * np.log(2 * np.pi * var) - ((sample - mean) ** 2) / (2 * var)

    def _pdf(self, class_idx, sample):
        """Return the per-feature Gaussian density of ``sample`` for a class."""
        mean = self.mean[class_idx]
        var = np.maximum(self.var[class_idx], self._MIN_VAR)
        exponent = np.exp(-((sample - mean) ** 2) / (2 * var))
        return (1 / np.sqrt(2 * np.pi * var)) * exponent

## K-Nearest Neighbors (KNN)

# In[7]:
class KNN:
    """k-nearest-neighbours classifier using Euclidean distance.

    Attributes:
        k: Number of neighbours that vote on each prediction.
        X_train: Stored training features, ``None`` until fitted.
        y_train: Stored training labels, ``None`` until fitted.
    """

    def __init__(self, k=3):
        """Create a classifier that votes among the ``k`` nearest neighbours.

        Raises:
            ValueError: If ``k`` is not a positive integer.
        """
        # Validate k
        if not isinstance(k, int) or k <= 0:
            raise ValueError("k must be a positive integer.")
        self.k = k
        self.X_train = None
        self.y_train = None

    def set_k(self, k):
        """User can dynamically set or update k."""
        if not isinstance(k, int) or k <= 0:
            raise ValueError("k must be a positive integer.")
        self.k = k
        print(f"Updated k value to {self.k}")

    def fit(self, X, y):
        """Store the training data; no computation happens until ``predict``.

        Args:
            X: 2-D numpy array of shape (n_samples, n_features).
            y: numpy array of length n_samples holding class labels.

        Raises:
            ValueError: If the inputs are not numpy arrays, ``X`` is not 2-D,
                or the sample counts disagree.
        """
        # Check if X and y are numpy arrays
        if not isinstance(X, np.ndarray):
            raise ValueError("Input features X must be a numpy array.")
        if not isinstance(y, np.ndarray):
            raise ValueError("Target variable y must be a numpy array.")
        if X.ndim != 2:
            raise ValueError("Input features X must be a 2-D array.")

        if X.shape[0] != y.shape[0]:
            raise ValueError(f"Number of samples in X ({X.shape[0]}) does not match length of y ({y.shape[0]}).")

        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Return the majority label among the k nearest neighbours per row.

        Raises:
            ValueError: If the model is not fitted, ``X`` is not a 2-D numpy
                array, or its feature count differs from the training data.
        """
        if self.X_train is None:
            raise ValueError("Model has not been fitted yet; call fit first.")
        if not isinstance(X, np.ndarray):
            raise ValueError("Input features X must be a numpy array.")
        if X.ndim != 2:
            raise ValueError("Input features X must be a 2-D array.")

        # Check if X has the same number of features as the training data
        if X.shape[1] != self.X_train.shape[1]:
            raise ValueError(f"Number of features in X ({X.shape[1]}) does not match the number of features in the training data ({self.X_train.shape[1]}).")

        predictions = [self._predict_sample(sample) for sample in X]
        return np.array(predictions)

    def _predict_sample(self, sample):
        """Return the most common label among the k closest training points.

        If ``k`` exceeds the number of training points, all of them vote.
        """
        # One vectorised pass computes the distance to every training row.
        distances = np.sqrt(np.sum((self.X_train - sample) ** 2, axis=1))

        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = self.y_train[k_indices]
        # On a tied vote Counter keeps first-seen order, so the label of the
        # nearer neighbour wins.
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

    def _euclidean_distance(self, point1, point2):
        """Return the Euclidean distance between two points."""
        return np.sqrt(np.sum((point1 - point2) ** 2))

## Decision Tree

# In[9]:
class DecisionTree:
    """Binary classification tree grown greedily on Gini impurity.

    The fitted tree is a nested dict. Leaves look like ``{"label": value}``;
    internal nodes hold ``"feature"``, ``"threshold"``, ``"left"`` and
    ``"right"``, where samples with ``feature <= threshold`` go left.

    Attributes:
        max_depth: Maximum depth of the tree, or ``None`` for no limit.
        min_samples_split: Minimum number of samples a node needs to be split.
        tree: Root node of the fitted tree, ``None`` until fitted.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        """Builds the decision tree based on the training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of length n_samples holding class labels.

        Raises:
            ValueError: If ``X`` is not 2-D, is empty, or its sample count
                differs from the length of ``y``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("Input features X must be a 2-D array.")
        if X.shape[0] != y.shape[0]:
            raise ValueError(f"Number of samples in X ({X.shape[0]}) does not match length of y ({y.shape[0]}).")
        if X.shape[0] == 0:
            raise ValueError("Input data X cannot be empty.")

        self.tree = self._build_tree(X, y)

    @staticmethod
    def _majority_leaf(y):
        """Return a leaf node carrying the most frequent label in ``y``."""
        return {"label": Counter(y).most_common(1)[0][0]}

    def _build_tree(self, X, y, depth=0):
        """Recursively builds the decision tree."""
        labels = np.unique(y)
        if len(labels) == 1:  # Pure node
            return {"label": labels[0]}

        if len(X) < self.min_samples_split:  # Not enough samples to split further
            return self._majority_leaf(y)

        # Compare against None explicitly so that max_depth=0 is honoured
        # instead of being treated as "no limit".
        if self.max_depth is not None and depth >= self.max_depth:
            return self._majority_leaf(y)

        best_split = self._find_best_split(X, y)
        if not best_split:
            # No threshold separates the rows (e.g. identical feature vectors
            # with different labels), so the node cannot be split further.
            return self._majority_leaf(y)

        left_idx = best_split["left_indices"]
        right_idx = best_split["right_indices"]
        return {
            "feature": best_split["feature"],
            "threshold": best_split["threshold"],
            "left": self._build_tree(X[left_idx], y[left_idx], depth + 1),
            "right": self._build_tree(X[right_idx], y[right_idx], depth + 1),
        }

    def _find_best_split(self, X, y):
        """Find the best feature and threshold to split the data.

        Returns:
            A dict with ``feature``, ``threshold``, ``left_indices`` and
            ``right_indices`` for the split with the lowest weighted Gini
            impurity, or an empty dict when no split leaves both sides
            non-empty.
        """
        best_gini = float("inf")
        best_split = {}

        n_features = X.shape[1]

        for feature in range(n_features):
            column = X[:, feature]

            # Every distinct observed value is a candidate threshold.
            for threshold in np.unique(column):
                left_indices = np.where(column <= threshold)[0]
                right_indices = np.where(column > threshold)[0]

                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue

                gini = self._gini_impurity(y[left_indices], y[right_indices])

                if gini < best_gini:
                    best_gini = gini
                    best_split = {
                        "feature": feature,
                        "threshold": threshold,
                        "left_indices": left_indices,
                        "right_indices": right_indices,
                    }

        return best_split

    @staticmethod
    def _node_gini(labels):
        """Return the Gini impurity of a single non-empty set of labels."""
        _, counts = np.unique(labels, return_counts=True)
        proportions = counts / counts.sum()
        return 1 - np.sum(proportions ** 2)

    def _gini_impurity(self, left_y, right_y):
        """Calculate the size-weighted Gini impurity for a split."""
        n_left = len(left_y)
        n_right = len(right_y)
        total = n_left + n_right

        return (
            (n_left / total) * self._node_gini(left_y)
            + (n_right / total) * self._node_gini(right_y)
        )

    def predict(self, X):
        """Predict the class labels for the input samples.

        Raises:
            ValueError: If the tree has not been fitted.
        """
        if self.tree is None:
            raise ValueError("Model has not been fitted yet; call fit first.")
        return np.array([self._predict_sample(x, self.tree) for x in X])

    def _predict_sample(self, sample, tree):
        """Walk from ``tree`` down to a leaf and return that leaf's label."""
        node = tree
        while "label" not in node:
            if sample[node["feature"]] <= node["threshold"]:
                node = node["left"]
            else:
                node = node["right"]
        return node["label"]


## Random Forest

# In[11]:
class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers with majority voting.

    Attributes:
        n_estimators: Number of trees to grow.
        max_depth: Depth limit passed to every tree, or ``None``.
        min_samples_split: Split threshold passed to every tree.
        trees: Fitted trees; empty until ``fit`` is called.
    """

    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2):
        """Validate and store the hyper-parameters.

        Raises:
            ValueError: If any hyper-parameter is not a positive integer
                (``max_depth`` may also be ``None``).
        """
        if not isinstance(n_estimators, int) or n_estimators <= 0:
            raise ValueError("n_estimators must be a positive integer.")

        if max_depth is not None and (not isinstance(max_depth, int) or max_depth <= 0):
            raise ValueError("max_depth must be a positive integer or None.")

        if not isinstance(min_samples_split, int) or min_samples_split <= 0:
            raise ValueError("min_samples_split must be a positive integer.")

        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, X, y):
        """Grow ``n_estimators`` trees, each on a bootstrap sample of the data.

        Any trees from a previous call are discarded first.

        Args:
            X: 2-D numpy array of shape (n_samples, n_features).
            y: numpy array of length n_samples holding class labels.

        Raises:
            ValueError: If the inputs are not numpy arrays, ``X`` is not 2-D
                or is empty, or the sample counts disagree.
        """
        if not isinstance(X, np.ndarray):
            raise ValueError("Input features X must be a numpy array.")
        if not isinstance(y, np.ndarray):
            raise ValueError("Target variable y must be a numpy array.")
        if X.ndim != 2:
            raise ValueError("Input features X must be a 2-D array.")

        if X.shape[0] != y.shape[0]:
            raise ValueError(f"Number of samples in X ({X.shape[0]}) does not match length of y ({y.shape[0]}).")

        if X.shape[0] == 0 or X.shape[1] == 0:
            raise ValueError("Input data X cannot be empty.")

        n_samples = X.shape[0]
        # Start from a clean slate so refitting does not mix in trees that
        # were trained on earlier data.
        self.trees = []
        for _ in range(self.n_estimators):
            # Bootstrap: draw n_samples row indices with replacement.
            indices = np.random.randint(0, n_samples, size=n_samples)

            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            tree.fit(X[indices], y[indices])

            self.trees.append(tree)

    def predict(self, X):
        """Return the majority-vote label of all trees for each row of ``X``.

        Raises:
            ValueError: If the forest is not fitted or ``X`` is not a numpy
                array.
        """
        if not self.trees:
            raise ValueError("Model has not been fitted yet; call fit first.")
        if not isinstance(X, np.ndarray):
            raise ValueError("Input features X must be a numpy array.")

        predictions = np.array([self._predict_sample(sample) for sample in X])
        return predictions

    def _predict_sample(self, sample):
        """Return the label predicted by the largest number of trees."""
        tree_predictions = [tree.predict([sample])[0] for tree in self.trees]
        most_common = Counter(tree_predictions).most_common(1)
        return most_common[0][0]


## Support Vector Machine (SVM)

# In[13]:
class SVM():
    """Linear soft-margin SVM trained with per-sample sub-gradient descent.

    Labels must be encoded as -1 and +1.

    Attributes:
        lr: Step size used for every update.
        n_iter: Number of passes over the training set.
        C: Coefficient of the L2 penalty on the weights; in this
            implementation a larger value shrinks the weights harder.
        weights: 1-D array of feature weights, ``None`` until fitted.
        bias: Scalar intercept, ``None`` until fitted.
    """

    def __init__(self, lr, n_iter, C):
        """Validate and store the hyper-parameters.

        Raises:
            ValueError: If ``lr`` or ``C`` is not positive, or ``n_iter`` is
                not a positive integer.
        """
        if lr <= 0:
            raise ValueError("Learning rate must be positive.")
        # Type check first so a non-numeric n_iter yields ValueError rather
        # than a TypeError from the comparison.
        if not isinstance(n_iter, int) or n_iter <= 0:
            raise ValueError("Number of iterations must be a positive integer.")
        if C <= 0:
            raise ValueError("Regularization parameter C must be positive.")

        self.lr = lr
        self.n_iter = n_iter
        self.C = C
        self.weights = None
        self.bias = None

    def hinge_loss(self, X, y):
        """Return the per-sample hinge loss ``max(0, 1 - y * f(x))``."""
        return np.maximum(0, 1 - y * (np.dot(X, self.weights) + self.bias))

    def fit(self, X, y):
        """Learn weights and bias from the training data.

        Args:
            X: 2-D numpy array of shape (n_samples, n_features).
            y: numpy array of length n_samples containing only -1 and 1.

        Raises:
            ValueError: If the inputs are not numpy arrays, ``X`` is not 2-D,
                the sample counts disagree, or ``y`` holds other values.
        """
        # Check if X and y are numpy arrays
        if not isinstance(X, np.ndarray):
            raise ValueError("Input features X must be a numpy array.")
        if not isinstance(y, np.ndarray):
            raise ValueError("Target variable y must be a numpy array.")
        if X.ndim != 2:
            raise ValueError("Input features X must be a 2-D array.")

        # Check if X and y dimensions match
        if X.shape[0] != y.shape[0]:
            raise ValueError(f"Number of samples in X ({X.shape[0]}) does not match length of y ({y.shape[0]}).")

        # Ensure that y has values -1 and 1 (SVM convention)
        if not np.all(np.isin(y, [-1, 1])):
            raise ValueError("Target variable y must contain only -1 and 1 values.")

        # Initialize weights and bias
        n_samples, n_features = X.shape
        self.weights = np.random.rand(n_features)
        self.bias = 0

        # Start training iterations
        for _ in range(self.n_iter):
            for i in range(n_samples):
                margin = y[i] * (np.dot(X[i], self.weights) + self.bias)
                if margin < 1:
                    # Margin violated (sample is misclassified or inside the
                    # margin): step along the hinge sub-gradient as well as
                    # the regularisation gradient.
                    self.weights -= self.lr * (2 * self.C * self.weights - y[i] * X[i])
                    self.bias -= self.lr * (-y[i])
                else:
                    # Margin satisfied: only the regularisation term applies.
                    self.weights -= self.lr * (2 * self.C * self.weights)

    def predict(self, X):
        """Return -1.0 or 1.0 for each row of ``X``.

        Points lying exactly on the decision boundary are assigned to the
        positive class so that the output is always a valid label.

        Raises:
            ValueError: If the model is not fitted, ``X`` is not a 2-D numpy
                array, or its feature count differs from the training data.
        """
        if self.weights is None:
            raise ValueError("Model has not been fitted yet; call fit first.")
        if not isinstance(X, np.ndarray):
            raise ValueError("Input features X must be a numpy array.")
        if X.ndim != 2:
            raise ValueError("Input features X must be a 2-D array.")
        if X.shape[1] != self.weights.shape[0]:
            raise ValueError(f"Number of features in X ({X.shape[1]}) does not match the number of features in the model ({self.weights.shape[0]}).")

        # Make predictions based on which side of the decision boundary
        # each sample falls.
        linear_output = np.dot(X, self.weights) + self.bias
        return np.where(linear_output >= 0, 1.0, -1.0)