In [1]:
import numpy as np
import pandas as pd
import random
import copy

In [19]:
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=1000, n_features=14, n_informative=10, noise=15, random_state=42)
X = pd.DataFrame(X)
y = pd.Series(y)
X.columns = [f'col_{col}' for col in X.columns]

In [20]:
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=14, n_informative=10, random_state=42)
X = pd.DataFrame(X)
y = pd.Series(y)
X.columns = [f'col_{col}' for col in X.columns]
df = pd.read_csv('./data/data_banknote_authentication.txt', header=None)
df.columns = ['variance', 'skewness', 'curtosis', 'entropy', 'target']
X, y = df.iloc[:,:4], df['target']

In [21]:
df = pd.read_csv('./data/data_banknote_authentication.txt', header=None)
df.columns = ['variance', 'skewness', 'curtosis', 'entropy', 'target']
X, y = df.iloc[:,:4], df['target']

In [22]:
from sklearn.datasets import load_diabetes

data = load_diabetes(as_frame=True)
X, y = data['data'], data['target']

In [23]:
X, y = make_regression(n_samples=150, n_features=14, n_informative=10, noise=15, random_state=42)
X = pd.DataFrame(X).round(2)
y = pd.Series(y)
X.columns = [f'col_{col}' for col in X.columns]

In [25]:
def C(n, k):
    if 0 <= k <= n:
        nn = 1
        kk = 1
        for t in range(1, min(k, n - k) + 1):
            nn *= n
            kk *= t
            n -= 1
        return nn // kk
    else:
        return 0

In [27]:
from scipy.integrate import quad
import numpy as np

def x(n, m, p):
    return (m - n*p) / np.sqrt(n*p*(1-p))

def F(x):
    return quad(lambda t: np.exp(-t**2/2), 0, x) / np.sqrt(2*np.pi)

In [20]:
class Node:
    def __init__(self):
        self.feature = None
        self.value_split = None
        self.value_leaf = None
        self.side = None
        self.left = None
        self.right = None
        self.idx = None

class MyTreeReg:
    def __init__(self, max_depth=5, min_samples_split=2, max_leafs=20, bins=None, criterion='entropy'):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs
        self.leafs_cnt = 1
        self.bins = bins
        self.__sum_tree_values = 0
        self.split_values = {}
        self.criterion = criterion
        self.fi = {}

    def fit(self, X, y):
        self.tree = None
        self.fi = { col: 0 for col in X.columns }
        
        def create_tree(root, X_root, y_root, side='root', depth=0):
            if root is None:
                root = Node()
            col_name, split_value, ig = self.get_best_split(X_root, y_root)

            mean_value = y_root.mean()

            if depth >= self.max_depth or \
              len(y_root) < self.min_samples_split or \
              (self.leafs_cnt > 1 and self.leafs_cnt >= self.max_leafs):
                root.side = side
                root.value_leaf = mean_value
                root.idx = list(y_root.index)
                self.__sum_tree_values += root.value_leaf
                return root

            self.fi[col_name] += len(y_root) / len(y) * ig

            X_left = X_root.loc[X_root[col_name] <= split_value]
            y_left = y_root.loc[X_root[col_name] <= split_value]

            X_right = X_root.loc[X_root[col_name] > split_value]
            y_right = y_root.loc[X_root[col_name] > split_value]

            if len(X_left) == 0 or len(X_right) == 0:
                root.side = side
                root.value_leaf = mean_value
                root.idx = list(y_root.index)
                self.__sum_tree_values += root.value_leaf
                return root

            root.feature = col_name
            root.value_split = split_value
            self.leafs_cnt += 1

            root.left = create_tree(root.left, X_left, y_left, 'left', depth + 1)
            root.right = create_tree(root.right, X_right, y_right, 'right', depth + 1)

            return root

        self.tree = create_tree(self.tree, X, y)

    def predict(self, X):
        y_pred = []
        for _, row in X.iterrows():
            node = self.tree
            while node.feature is not None:
                if row[node.feature] <= node.value_split:
                    node = node.left
                else:
                    node = node.right
            y_pred.append(node.value_leaf)
        return np.array(y_pred)
    
    def print_tree(self, node=None, depth=0):
        if node is None:
            node = self.tree
        if node.feature is not None:
            print(f"{' ' * depth}{node.feature} > {node.value_split}")
            if node.left is not None:
                self.print_tree(node.left, depth + 1)
            if node.right is not None:
                self.print_tree(node.right, depth + 1)
        else:
            print(f"{' ' * depth}{node.side} = {node.value_leaf}")

    def get_best_split(self, X, y):
        mse_0 = self.mse(y)

        col_name = None
        split_value = None
        gain = -float('inf')

        for col in X.columns:
            if not (col in self.split_values.keys()):
                x_unique_values = np.unique(X[col])
                if self.bins is None or len(x_unique_values) - 1 < self.bins:
                    self.split_values[col] = np.array([(x_unique_values[i - 1] + \
                    x_unique_values[i]) / 2 for i in range(1, len(x_unique_values))])
                else:
                    _, self.split_values[col] = np.histogram(X[col], bins=self.bins)

            for split_value_i in self.split_values[col]:
                mask = X[col] <= split_value_i
                left_split, right_split = y[mask], y[~mask]

                mse_left = self.mse(left_split)
                mse_right = self.mse(right_split)

                weight_left = len(left_split) / len(y)
                weight_right = len(right_split) / len(y)

                mse_i = weight_left * mse_left + weight_right * mse_right

                gain_i = mse_0 - mse_i
                if gain < gain_i:
                    col_name = col
                    split_value = split_value_i
                    gain = gain_i

        return col_name, split_value, gain
            
    def mse(self, t):
        t_mean = np.mean(t)
        return np.sum((t - t_mean) ** 2) / (len(t)+1e-15)
    
    def __node_rule(self, p, split=pd.Series()):
        if self.criterion == 'entropy':
            return -np.sum(p * np.log2(p)) if not split.empty else 0
        elif self.criterion == 'gini':
            return 1 - np.sum(p ** 2)

    def __str__(self):
        return f"MyTreeClf class: max_depth={self.max_depth}, min_samples_split={self.min_samples_split}, max_leafs={self.max_leafs}, bins={self.bins}"
    
    def sum_leafs(self):
        return self.__sum_tree_values
    
    def replace_leafs(self, X, y, loss="MSE"):
        queue = []
        queue.append(self.tree)
        while (queue):
            node = queue.pop(0)
            if (node and node.value_leaf):
                idx = node.idx
                y_pred = self.predict(X.loc[idx, :])
                loss_value = y[idx] - y_pred
                node.value_leaf = loss_value.mean() if loss == 'MSE' else loss_value.median()
            queue.append(node.left)
            queue.append(node.right)



In [21]:
class MyBoostReg:
    def __init__(self, n_estimators=10, learning_rate=0.1, max_depth=5, min_samples_split=2, max_leafs=20, bins=16, loss='MSE', metric=None):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs
        self.bins = bins
        self.loss = loss
        self.metric = metric

        self.pred_0 = None
        self.trees = []
        self.best_score = None

    def __repr__(self):
        return f'MyBoostReg class: n_estimators={self.n_estimators}, learning_rate={self.learning_rate}, max_depth={self.max_depth}, min_samples_split={self.min_samples_split}, max_leafs={self.max_leafs}, bins={self.bins}'

    def calc_score(self, y, y_pred):
        if self.metric == 'MSE':
            return np.sum((y - y_pred)**2) / len(y)
        elif self.metric == 'MAE':
            return np.sum(np.abs((y - y_pred))) / len(y)
        elif self.metric == 'RMSE':
            return np.sqrt(np.sum((y - y_pred)**2) / len(y)) 

    def fit(self, X, y):
        self.pred_0 = y.mean() if self.loss == 'MSE' else y.median()
        Fm = self.pred_0
        for _ in range(self.n_estimators):
            rm = 2*(Fm - y)
            tree = MyTreeReg(self.max_depth, self.min_samples_split, self.max_leafs, self.bins)
            tree.fit(X, rm)
            tree.replace_leafs(X, y, self.loss)
            self.trees.append(tree)
            y_pred = self.learning_rate * tree.predict(X)
            Fm += y_pred

        y_pred = self.predict(X)
        self.best_score = self.metric(y, y_pred)
    
    def predict(self, X):
        y_pred = 0
        for tree in self.trees:
            y_pred += tree.predict(X)
        y_pred = self.learning_rate * y_pred + self.pred_0
        return y_pred






In [33]:
class Node:
    def __init__(self):
        self.feature = None
        self.value_split = None
        self.value_leaf = None
        self.side = None
        self.left = None
        self.right = None
        self.idx = None

class MyTreeReg:
    def __init__(self, max_depth=5, min_samples_split=2, max_leafs=20, bins=None, criterion='entropy'):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs
        self.leafs_cnt = 1
        self.bins = bins
        self.__sum_tree_values = 0
        self.split_values = {}
        self.criterion = criterion
        self.fi = {}

    def fit(self, X, y):
        self.tree = None
        self.fi = { col: 0 for col in X.columns }
        
        def create_tree(root, X_root, y_root, side='root', depth=0):
            if root is None:
                root = Node()
            col_name, split_value, ig = self.get_best_split(X_root, y_root)

            mean_value = y_root.mean()

            if depth >= self.max_depth or \
              len(y_root) < self.min_samples_split or \
              (self.leafs_cnt > 1 and self.leafs_cnt >= self.max_leafs):
                root.side = side
                root.value_leaf = mean_value
                root.idx = list(y_root.index)
                self.__sum_tree_values += root.value_leaf
                return root

            self.fi[col_name] += len(y_root) / len(y) * ig

            X_left = X_root.loc[X_root[col_name] <= split_value]
            y_left = y_root.loc[X_root[col_name] <= split_value]

            X_right = X_root.loc[X_root[col_name] > split_value]
            y_right = y_root.loc[X_root[col_name] > split_value]

            if len(X_left) == 0 or len(X_right) == 0:
                root.side = side
                root.value_leaf = mean_value
                root.idx = list(y_root.index)
                self.__sum_tree_values += root.value_leaf
                return root

            root.feature = col_name
            root.value_split = split_value
            self.leafs_cnt += 1

            root.left = create_tree(root.left, X_left, y_left, 'left', depth + 1)
            root.right = create_tree(root.right, X_right, y_right, 'right', depth + 1)

            return root

        self.tree = create_tree(self.tree, X, y)

    def predict(self, X):
        y_pred = []
        for _, row in X.iterrows():
            node = self.tree
            while node.feature is not None:
                if row[node.feature] <= node.value_split:
                    node = node.left
                else:
                    node = node.right
            y_pred.append(node.value_leaf)
        return np.array(y_pred)
    
    def print_tree(self, node=None, depth=0):
        if node is None:
            node = self.tree
        if node.feature is not None:
            print(f"{' ' * depth}{node.feature} > {node.value_split}")
            if node.left is not None:
                self.print_tree(node.left, depth + 1)
            if node.right is not None:
                self.print_tree(node.right, depth + 1)
        else:
            print(f"{' ' * depth}{node.side} = {node.value_leaf}")

    def get_best_split(self, X, y):
        mse_0 = self.mse(y)

        col_name = None
        split_value = None
        gain = -float('inf')

        for col in X.columns:
            if not (col in self.split_values.keys()):
                x_unique_values = np.unique(X[col])
                if self.bins is None or len(x_unique_values) - 1 < self.bins:
                    self.split_values[col] = np.array([(x_unique_values[i - 1] + \
                    x_unique_values[i]) / 2 for i in range(1, len(x_unique_values))])
                else:
                    _, self.split_values[col] = np.histogram(X[col], bins=self.bins)

            for split_value_i in self.split_values[col]:
                mask = X[col] <= split_value_i
                left_split, right_split = y[mask], y[~mask]

                mse_left = self.mse(left_split)
                mse_right = self.mse(right_split)

                weight_left = len(left_split) / len(y)
                weight_right = len(right_split) / len(y)

                mse_i = weight_left * mse_left + weight_right * mse_right

                gain_i = mse_0 - mse_i
                if gain < gain_i:
                    col_name = col
                    split_value = split_value_i
                    gain = gain_i

        return col_name, split_value, gain
            
    def mse(self, t):
        t_mean = np.mean(t)
        return np.sum((t - t_mean) ** 2) / (len(t)+1e-15)
    
    def __node_rule(self, p, split=pd.Series()):
        if self.criterion == 'entropy':
            return -np.sum(p * np.log2(p)) if not split.empty else 0
        elif self.criterion == 'gini':
            return 1 - np.sum(p ** 2)

    def __str__(self):
        return f"MyTreeClf class: max_depth={self.max_depth}, min_samples_split={self.min_samples_split}, max_leafs={self.max_leafs}, bins={self.bins}"
    
    def sum_leafs(self):
        return self.__sum_tree_values
    
    def replace_leafs(self, X, y, loss="MSE"):
        queue = []
        queue.append(self.tree)
        while (queue):
            node = queue.pop(0)
            if (node and node.value_leaf):
                idx = node.idx
                y_pred = self.predict(X.loc[idx, :])
                loss_value = y[idx] - y_pred
                node.value_leaf = loss_value.mean() if loss == 'MSE' else loss_value.median()
            queue.append(node.left)
            queue.append(node.right)

    def get_leafs_idx(self):
        stack = []
        stack.append(self.tree)
        idxs = []
        while stack:
            node = stack.pop()
            if not node: continue
            if (node.value_leaf):
                idxs.append(node.idx)
            stack.append(node.left)
            stack.append(node.right)
        return idxs


In [42]:
class MyBoostClf:
    def __init__(self, n_estimators=10, learning_rate=0.1, max_depth=5, min_samples_split=2, max_leafs=20, bins=16):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs
        self.bins = bins

        self.pred_0 = None
        self.trees = []

    def __repr__(self):
        return f"MyBoostClf class: n_estimators={self.n_estimators}, learning_rate={self.learning_rate}, max_depth={self.max_depth}, min_samples_split={self.min_samples_split}, max_leafs={self.max_leafs}, bins={self.bins}"

    def __calc_gamma(self, y, p, leaf_idxs):
        gamma = 0
        for idx in leaf_idxs:
            gamma += np.sum(y[idx] - p[idx]) / (np.sum(p[idx]*(1-p[idx])) + 1e-15)
        return gamma


    def fit(self, X, y, verbose=None):
        self.pred_0 = y.mean()
        p = np.array([self.pred_0 for _ in range(len(y))])
        Fm = np.log(p/(1-p + 1e-15))

        for i in range(self.n_estimators):
            tree = MyTreeReg(self.max_depth, self.min_samples_split, self.max_leafs, self.bins)
            p = np.exp(np.log(Fm))/(1+np.exp(np.log(Fm)) + 1e-15)
            r = y - p
            tree.fit(X, r)
            leaf_idxs = tree.get_leafs_idx()
            gamma = self.__calc_gamma(y, p, leaf_idxs)
            self.trees.append(tree)
            gammam = tree.predict(X)
            Fm += self.learning_rate * gammam

            if (verbose and not i % verbose):
                print(i+1, r)
        



In [1]:
class MyKMeans:
    def __init__(self, n_clusters=3, max_iter=10, n_init=3, random_state=42):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.n_init = n_init
        self.random_state = random_state
        self.centroids = []

    def __repr__(self):
        return f"MyKMeans class: n_clusters={self.n_clusters}, max_iter={self.max_iter}, n_init={self.n_init}, random_state={self.random_state}"
    
    def __distance(self, x, y):
        return np.sqrt(np.sum((x - y)**2))

    def __generate_centroids(self, X):
        np.random.seed(seed=self.random_state)
        centroids = []
        for _ in range(self.n_clusters):
            centroid = np.array([np.random.uniform(low=X[feat].min(), high=X[feat].max()) for feat in X.columns])
            centroids.append(centroid)
        return np.array(centroids)
    
    def __distances(self, x, centroids):
        return np.array([self.__distance(x, centroid) for centroid in centroids])

    def __get_new_centroids(self, clusters, X):
        clusters_dict = {cluster: [] for cluster in range(max(clusters)+1)}
        for i in range(len(clusters)):
            clusters_dict[clusters[i]].append(i)
        
        new_centroids = []
        for cluster, rows in clusters_dict.items():
            centroid = np.array([X.loc[rows, feat].mean() for feat in X.columns])
            new_centroids.append(centroid)
        return np.array(new_centroids)


    def _fit(self, X):
        centroids = self.__generate_centroids(X)
        for _ in range(self.max_iter):
            clusters = [self.__distances(x, centroids).argmin() for x in X.values]
            new_centroids = self.__get_new_centroids(clusters, X)
            
            if (new_centroids == centroids).all():
                break
        if np.nan not in new_centroids:
            self.centroids.append(new_centroids)
    
    def __wcss(self, centroids, X):
        return np.sum([self.__distances(x, centroids).min()**2 for x in X.values])
        
    def __best_centroids(self, X):
        wcsss = np.array([self.__wcss(centroids, X) for centroids in self.centroids])
        print(wcsss)
        return self.centroids[wcsss.argmin()], wcsss.min()


    def fit(self, X):
        for _ in range(self.n_init):
            self._fit(X)
        self.cluster_centers_, self.inertia_ = self.__best_centroids(X)

    def predict(self, X):
        clusters = [self.__distances(x, self.cluster_centers_).argmin() for x in X.values]
        return np.array(clusters)



In [32]:
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=100, centers=5, n_features=5, cluster_std=2.5, random_state=42)
X = pd.DataFrame(X)
X.columns = [f'col_{col}' for col in X.columns]

In [5]:
k_means = MyKMeans(**{"n_clusters": 10, "max_iter": 10, "n_init": 3})
k_means.fit(X)
#k_means.cluster_centers_, k_means.inertia_

NameError: name 'MyKMeans' is not defined

In [21]:
class MyAgglomerative:
    def __init__(self, n_clusters=3):
        self.n_clusters = n_clusters

    def __repr__(self):
        return f"MyAgglomerative class: n_clusters={self.n_clusters}"
    
    def __distance(self, x, y):
        return np.sqrt(np.sum((x - y)**2))
    
    def find_min_dist(self, distances):
        min_dist, min_ind = distances[0][0], (0, 0)
        for i in range(len(distances)-1):
            for j in range(i+1, len(distances)):
                if distances[i][j] < min_dist:
                    min_dist = distances[i][j]
                    min_ind = (i, j)
        return min_ind

    def fit_predict(self, X):
        distances = [[self.__distance(X.loc[i, :], X.loc[j, :]) for j in range(i+1, len(X))] for i in range(len(X)-1)]
        min_dist_ind = self.find_min_dist(distances)





In [22]:
my_agg = MyAgglomerative()
my_agg.fit_predict(X)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (99,) + inhomogeneous part.

In [15]:
class MyDBSCAN:
    def __init__(self, eps=3, min_samples=3):
        self.eps = eps
        self.min_samples = min_samples

    def __repr__(self):
        return f"MyDBSCAN class: eps={self.eps}, min_samples={self.min_samples}"
    
    def __distance(self, x, y):
        return np.sqrt(np.sum((x - y)**2))

    def find_neighbours(self, x, X):
        neighbours = []
        for i, y in enumerate(X.values):
            if self.__distance(x, y) <= self.eps:
                neighbours.append(i)
        return neighbours
    
    def fit_predict(self, X):
        num_clusters = 1
        X['cluster'] = -1

        for i in range(len(X)):
            sample = X.loc[i, :]
            if sample['cluster'] != -1:
                continue
            neighbours = self.find_neighbours(sample, X)
            if len(neighbours)-1 < self.min_samples:
                X.loc[i, 'cluster'] = 0
            else:
                X.loc[i, 'cluster'] = num_clusters
                while neighbours:
                    neighbour = neighbours[0]
                    neighbours.pop(0)
                    sample = X.loc[neighbour, :]
                    neighbours_neighbours = self.find_neighbours(sample, X)
                    if sample['cluster'] == 0 or len(neighbours_neighbours)-1 < self.min_samples:
                        X.loc[neighbour, 'cluster'] = num_clusters
                    else:
                        X.loc[neighbour, 'cluster'] = num_clusters
                        neighbours.extend(neighbours_neighbours)
            num_clusters += 1
        return X['cluster'].T



In [17]:
my_dbscan = MyDBSCAN()
my_dbscan.fit_predict(X).unique()

array([ 0, 37], dtype=int64)

In [33]:
class MyPCA:
    def __init__(self, n_components=3):
        self.n_components = n_components

    def __repr__(self):
        return f"MyPCA class: n_components={self.n_components}"

    def fit_transform(self, X):
        X_meaned = X - X.mean()
        cov_mat = X_meaned.cov()
        W_pca = np.linalg.eigh(cov_mat)[1][:, -self.n_components:]
        X_reduced = X_meaned @ W_pca
        return X_reduced

        

In [29]:
np.linalg.eigh(X.cov())[1][:, -3:]

array([[-0.6078959 ,  0.51071017, -0.16382569],
       [-0.17587616,  0.10941046,  0.8434211 ],
       [-0.1271461 , -0.71560981,  0.26282746],
       [-0.76344463, -0.3022094 , -0.09486829],
       [-0.0227224 , -0.35181893, -0.42863006]])

In [34]:
my_pca = MyPCA()
my_pca.fit_transform(X)

Unnamed: 0,0,1,2
0,-0.022279,-1.305308,8.897126
1,-0.892941,-10.837080,-9.350529
2,7.081455,4.182531,9.924896
3,4.353359,0.510684,9.129610
4,2.116619,-0.639272,10.768165
...,...,...,...
95,0.548752,-3.562789,9.934569
96,-4.247022,-11.394326,-9.805331
97,-3.571845,-0.859642,7.572720
98,6.662169,-1.893058,-4.150153


In [41]:
class MyNaiveBayes:

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self._classes = y.unique()
        n_classes = len(self._classes)
        self._means = np.zeros((n_classes, n_features))
        self._vars = np.zeros((n_classes, n_features))
        self._priors = np.zeros((n_classes, n_features))

        for idx, c in enumerate(self._classes):
            X_c = X[y==c]
            self._means[idx] = X_c.mean(axis=0)
            self._vars[idx] = X_c.var(axis=0)
            self._priors[idx] = len(X_c) / len(y)

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]

    def _predict(self, x):
        posteriors = []
        for idx, c in enumerate(self._classes):
            posterior = np.log(self._priors[idx])
            posterior += np.sum(np.log(self._pdf(idx, x)))
            posteriors.append(posterior)
        return self.classes[np.argmax(posteriors)]

    def _pdf(self, idx, x):
        return 1/np.sqrt(2*np.pi*self._vars[idx]**2)*np.exp(-(x - self._means[idx])**2 / 2*(self._vars[idx]**2))


In [3]:
class Perceptron:
    def __init__(self, learning_rate=0.1, n_iters=100):
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None
        self.activation_fuction = lambda x: np.where(x > 0, 1, 0)


    def fit(self, X, y):
        n_features, n_samples = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        y_ = np.where(y > 0, 1, 0)

        for _ in range(self.n_iters):
            for idx, x in enumerate(X):
                y_pred = self.activation_fuction(self.weights @ x + self.bias)
                update = self.learning_rate * (y_[idx] - y_pred)
                self.weights += update * x
                self.bias += update


    def predict(self, X):
        y_pred = self.activation_fuction(self.weights @ X + self.bias)
        return y_pred

In [4]:
from sklearn.model_selection import train_test_split
from sklearn import datasets

def accuracy(y, y_pred):
    acc = np.sum(y == y_pred) / len(y)
    return acc

X, y = datasets.make_blobs(n_samples=150, n_features=2, centers=2, cluster_std=1.05, random_state=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)


ModuleNotFoundError: No module named 'matplotlib'

In [6]:
class SVM:
    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=100):
        self.learning_rate = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.w = None
        self.b = None


    def fit(self, X, y):
        n_samples, n_features = X.shape
        
        self.w = np.zeros(n_features)
        self.b = 0
        y_ = np.where(y <= 0, -1, 1)

        for _ in range(self.n_iters):
            for idx, x in enumerate(X):
                condition = y_[idx] * (x @ self.w - self.b) >= 1
                if condition:
                    self.w -= self.learning_rate * 2 * self.lambda_param * self.w
                else:
                    self.w -= self.learning_rate * (2 * self.lambda_param * self.w - x @ y_[idx])
                    self.b -= self.learning_rate * y_[idx]


    def predict(self, X):
        y_pred = X @ self.w - self.b
        return np.sign(y_pred)
