In [None]:
# 1. Linear Regression
import numpy as np

class LinearRegression:
    """Single-feature ordinary least-squares regression ``y = w * x + b``.

    The slope and intercept are computed in closed form from the sample
    covariance and variance.

    Attributes:
        w: Fitted slope, or ``None`` before :meth:`fit` is called.
        b: Fitted intercept, or ``None`` before :meth:`fit` is called.
    """

    def __init__(self):
        self.w = None
        self.b = None

    def fit(self, X, y):
        """Estimate the slope ``w`` and intercept ``b`` from the data.

        Args:
            X: Feature values; flattened to one dimension.
            y: Target values; flattened to one dimension, same length as X.

        Raises:
            ValueError: If X is empty or X and y differ in length.
        """
        x_values = np.asarray(X, dtype=float).ravel()
        y_values = np.asarray(y, dtype=float).ravel()
        if x_values.size == 0:
            raise ValueError("X must contain at least one sample")
        if x_values.size != y_values.size:
            raise ValueError("X and y must contain the same number of samples")

        x_mean = x_values.mean()
        y_mean = y_values.mean()
        x_centered = x_values - x_mean

        # Covariance and variance must share the same normalisation (both
        # divide by n here); mixing np.cov (ddof=1) with np.var (ddof=0)
        # inflates the slope by n / (n - 1).
        x_var = np.mean(x_centered ** 2)
        xy_cov = np.mean(x_centered * (y_values - y_mean))

        # With a constant feature the slope is undetermined; fall back to the
        # constant predictor y = mean(y), which still minimises squared error.
        self.w = xy_cov / x_var if x_var > 0 else 0.0
        self.b = y_mean - self.w * x_mean

    def predict(self, X):
        """Return ``w * X + b`` for the given feature values."""
        return self.w * np.asarray(X) + self.b


In [None]:
# 2. Logistic Regression
import numpy as np

# Sigmoid function
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)), element-wise for arrays."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

# Derivative of the sigmoid function
def sigmoid_derivative(x):
    """Return ``x * (1 - x)``.

    This equals the sigmoid's derivative when ``x`` is already a sigmoid
    output (not the pre-activation value).
    """
    complement = 1 - x
    return x * complement

# Loss function
def log_loss(y_true, y_pred, eps=1e-15):
    """Return the element-wise binary cross-entropy.

    Args:
        y_true: Ground-truth labels (0 or 1).
        y_pred: Predicted probabilities.
        eps: Predictions are clipped to ``[eps, 1 - eps]`` so that a
            prediction of exactly 0 or 1 does not produce ``log(0)``
            (which would yield inf/nan).

    Returns:
        ``-(y_true * log(p) + (1 - y_true) * log(1 - p))`` for the clipped
        probabilities ``p``; same shape as the broadcast inputs.
    """
    p = np.clip(y_pred, eps, 1 - eps)
    return -(y_true * np.log(p) + (1 - y_true) * np.log(1 - p))

# Input data: the four 2-bit inputs, labelled with their XOR.
# NOTE(review): XOR is not linearly separable, so a single logistic unit
# cannot fit these labels; expect the predictions to settle near 0.5.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])

# Initialize weights and bias
weights = np.random.rand(2, 1)
bias = np.random.rand(1)

# Set learning rate and number of epochs
learning_rate = 0.1
num_epochs = 10000

# Train the model with batch gradient descent on the mean log loss
for epoch in range(num_epochs):
  # Forward pass
  z = np.dot(X, weights) + bias
  y_pred = sigmoid(z)

  # Compute loss (kept for monitoring; it is not used by the update itself)
  loss = np.mean(log_loss(y, y_pred))

  # Backward pass.
  # For a sigmoid output trained with log loss the chain rule collapses:
  #   dL/dy_pred * dy_pred/dz
  #     = (y_pred - y) / (y_pred * (1 - y_pred)) * y_pred * (1 - y_pred)
  #     = y_pred - y
  # Multiplying (y_pred - y) by the sigmoid derivative again would give the
  # gradient of the squared error instead of the log loss computed above.
  derivative_loss_z = y_pred - y
  derivative_loss_weights = np.dot(X.T, derivative_loss_z) / len(X)
  derivative_loss_bias = np.mean(derivative_loss_z)

  # Update weights and bias
  weights -= learning_rate * derivative_loss_weights
  bias -= learning_rate * derivative_loss_bias

# Print the final weights and bias
print(weights)
print(bias)

# Test the model
test_input = np.array([[1, 1]])
test_output = sigmoid(np.dot(test_input, weights) + bias)
print(test_output)


In [None]:
import numpy as np

class DecisionTree:
    """Decision tree that chooses splits by impurity gain.

    NOTE(review): this class is incomplete as shown. It references
    ``self._gini``, ``self._entropy``, ``self._split``,
    ``self._impurity_gain`` and the names ``Question``, ``Leaf`` and
    ``Node``, none of which are defined in the visible code, and
    ``_traverse_tree`` stops after its leaf base case.
    """

    def __init__(self, criterion, max_depth):
        """Store the impurity criterion name and the depth limit.

        ``criterion`` is validated later, in ``fit`` ('gini' or 'entropy').
        """
        self.criterion = criterion
        self.max_depth = max_depth
    
    def fit(self, X, y):
        """Select the impurity function and build the tree from ``X``, ``y``.

        The result of ``_build_tree`` is stored in ``self.tree``.

        Raises:
            ValueError: If ``self.criterion`` is neither 'gini' nor 'entropy'.
        """
        # Bind the impurity function that matches the configured criterion
        if self.criterion == 'gini':
            self.impurity = self._gini
        elif self.criterion == 'entropy':
            self.impurity = self._entropy
        else:
            raise ValueError("Invalid criterion")
        
        # Build the tree recursively; the root is counted as depth 1
        self.tree = self._build_tree(X, y, depth=1)
    
    def _build_tree(self, X, y, depth):
        """Recursively build the subtree for the samples ``X`` / labels ``y``.

        Every (feature, unique value) pair is tried as a ``Question``; the
        split with the largest strictly positive impurity gain is kept.
        Returns ``Leaf(y)`` when no split has positive gain or when ``depth``
        equals ``self.max_depth``; otherwise returns a ``Node`` holding the
        best question and the two recursively built branches.
        """
        # X must be two-dimensional; num_samples and unique_labels are
        # computed here but not used further in this method
        num_samples, num_features = X.shape
        unique_labels = np.unique(y)
        
        # Calculate initial impurity of the data
        initial_impurity = self.impurity(y)
        
        # Initialize variables to keep track of the best split
        best_gain = 0
        best_question = None
        best_groups = None
        
        # Loop through all features and values to find the best split
        for feature_index in range(num_features):
            # Get values of the current feature
            feature_values = X[:, feature_index]
            
            # Loop through unique values of the feature
            for feature_value in np.unique(feature_values):
                # Create the question for this feature and value
                question = Question(feature_index, feature_value)
                
                # Split the data according to the question
                groups = self._split(X, y, question)
                
                # Calculate the impurity gain from the split
                impurity_gain = self._impurity_gain(groups, initial_impurity)
                
                # Update the best split if necessary (strict '>' keeps the
                # first of several equally good splits)
                if impurity_gain > best_gain:
                    best_gain = impurity_gain
                    best_question = question
                    best_groups = groups
        
        # Stop if no further improvement can be made or if max depth is reached.
        # NOTE(review): the depth limit is only checked after the full split
        # search above has already run.
        if best_gain == 0 or depth == self.max_depth:
            return Leaf(y)
        
        # Recursively build the left and right branches of the tree.
        # Presumably groups is (X_left, y_left, X_right, y_right) - confirm
        # against _split, which is not visible here.
        left_branch = self._build_tree(best_groups[0], best_groups[1], depth + 1)
        right_branch = self._build_tree(best_groups[2], best_groups[3], depth + 1)
        
        # Return a node with the best question and the left and right branches
        return Node(best_question, left_branch, right_branch)
    
    def predict(self, X):
        """Return the result of traversing ``self.tree`` with ``X``."""
        # Traverse the tree and make predictions
        return self._traverse_tree(X, self.tree)
    
    def _traverse_tree(self, X, node):
        """Return ``node.predictions`` when ``node`` is a ``Leaf``.

        NOTE(review): the branch-following logic is missing, so for any
        non-leaf node this method falls through and returns ``None``.
        """
        # Base case: reached a leaf node
        if isinstance(node, Leaf):
            return node.predictions
        
        # Decide whether to go left or right based on the question
        # (TODO: not implemented in the visible code)


In [None]:
import numpy as np

class NaiveBayes:
    """Gaussian Naive Bayes classifier.

    Each feature is modelled per class as an independent normal
    distribution; prediction picks the class with the highest log-posterior.

    Args:
        var_smoothing: Fraction of the largest feature variance that is added
            to every per-class variance. This keeps the variance strictly
            positive, so a feature that is constant within a class no longer
            causes a division by zero (and nan posteriors).
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """Estimate per-class feature means, variances and class priors.

        Args:
            X: Array of shape (n_samples, n_features).
            y: Array of n_samples class labels.
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        # calculate mean, var, and prior for each class
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)

        # Variance floor, scaled to the data; falls back to the raw smoothing
        # value when every feature is constant over the whole data set.
        epsilon = self.var_smoothing * X.var(axis=0).max()
        if not epsilon > 0:
            epsilon = self.var_smoothing

        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0) + epsilon
            self._priors[idx] = X_c.shape[0] / float(n_samples)

    def predict(self, X):
        """Return the predicted class label for every row of ``X``."""
        y_pred = [self._predict(x) for x in np.asarray(X, dtype=np.float64)]
        return np.array(y_pred)

    def _predict(self, x):
        """Return the class with the highest log-posterior for one sample."""
        posteriors = []

        # log-posterior (up to a constant) = log prior + sum of log-likelihoods
        for idx, _ in enumerate(self._classes):
            prior = np.log(self._priors[idx])
            likelihood = np.sum(self._log_pdf(idx, x))
            posteriors.append(prior + likelihood)

        # return class with the highest posterior
        return self._classes[np.argmax(posteriors)]

    def _log_pdf(self, class_idx, x):
        """Return the per-feature Gaussian log-density of ``x`` for a class.

        Working in log space avoids the underflow of ``log(exp(-big))`` to
        ``-inf`` for samples far away from a class mean.
        """
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        return -0.5 * np.log(2 * np.pi * var) - ((x - mean) ** 2) / (2 * var)

    def _pdf(self, class_idx, x):
        """Return the per-feature Gaussian density of ``x`` for a class."""
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

In [None]:
import numpy as np

def euclidean_distance(x, y):
    """Return the Euclidean (L2) distance between ``x`` and ``y``."""
    squared_diff = np.square(x - y)
    return np.sqrt(squared_diff.sum())

def KNN(train_data, test_data, K):
    """Predict the label of one query point by K-nearest-neighbour voting.

    Args:
        train_data: Sequence of rows; in each row the last element is the
            label and the preceding elements are the numeric features.
        test_data: Feature vector of the single point to classify.
        K: Number of neighbours that vote. Values larger than the number of
            training rows are capped to use every row.

    Returns:
        The most common label among the K nearest training rows. Ties are
        resolved in favour of the label whose first vote comes from the
        closest neighbour.

    Raises:
        ValueError: If ``K`` is less than 1 or ``train_data`` is empty.
    """
    from collections import Counter

    n_train = len(train_data)
    if n_train == 0:
        raise ValueError("train_data must not be empty")
    if K < 1:
        raise ValueError("K must be at least 1")
    k = min(K, n_train)

    # Euclidean distance from the query point to every training row at once
    features = np.array([row[:-1] for row in train_data], dtype=float)
    query = np.asarray(test_data, dtype=float)
    distances = np.sqrt(np.sum((features - query) ** 2, axis=1))

    # A stable sort keeps equally distant rows in their original order
    nearest = np.argsort(distances, kind="stable")[:k]

    # Counter preserves insertion order, so most_common() breaks ties
    # deterministically in favour of the nearest neighbour's label
    votes = Counter(train_data[i][-1] for i in nearest)
    return votes.most_common(1)[0][0]

# Read the data set from the CSV file and convert it to a NumPy array
import pandas as pd
data = pd.read_csv("data.csv").values

# Split the rows into a training set (first 80%) and a test set (last 20%)
train_data, test_data = np.split(data, [int(0.8 * len(data))])

# Number of neighbours to use
K = 5

# Predict labels for the test set (no prediction code follows in this cell)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def euclidean_distance(x1, x2):
    """Return the straight-line (L2) distance between points ``x1`` and ``x2``."""
    delta = x1 - x2
    return np.sqrt(np.sum(delta * delta))

class KMeans:
    """K-means clustering using Lloyd's algorithm.

    Args:
        K: Number of clusters.
        max_iters: Maximum number of assignment/update iterations.
        plot_steps: If true, draw the clusters after each assignment and
            after each centroid update (uses the module-level ``plt``).

    Attributes:
        clusters: One list of sample indices per cluster.
        centroids: Current cluster centres, one row per cluster.
    """

    def __init__(self, K=5, max_iters=100, plot_steps=False):
        self.K = K
        self.max_iters = max_iters
        self.plot_steps = plot_steps

        # list of sample indices for each cluster
        self.clusters = [[] for _ in range(self.K)]

        # the centers (mean vector) for each cluster
        self.centroids = []

    def predict(self, X):
        """Cluster the rows of ``X`` and return one cluster index per row.

        Args:
            X: Array of shape (n_samples, n_features).

        Returns:
            Float array of length n_samples holding each sample's cluster
            index.

        Raises:
            ValueError: If ``K`` exceeds the number of samples.
        """
        self.X = np.asarray(X)
        self.n_samples, self.n_features = self.X.shape
        if self.K > self.n_samples:
            raise ValueError("K cannot be larger than the number of samples")

        # initialize the centroids with K distinct samples
        random_sample_idxs = np.random.choice(self.n_samples, self.K, replace=False)
        self.centroids = self.X[random_sample_idxs].astype(float)

        # optimize clusters
        for _ in range(self.max_iters):
            # assign samples to closest centroids (create clusters)
            self.clusters = self._create_clusters(self.centroids)

            if self.plot_steps:
                self.plot()

            # calculate new centroids from the clusters
            centroids_old = self.centroids
            self.centroids = self._get_centroids(self.clusters)

            if self._is_converged(centroids_old, self.centroids):
                break

            if self.plot_steps:
                self.plot()

        # classify samples as the index of their clusters
        return self._get_cluster_labels(self.clusters)

    def _get_cluster_labels(self, clusters):
        """Return an array giving, for every sample, its cluster index."""
        labels = np.empty(self.n_samples)
        for cluster_idx, cluster in enumerate(clusters):
            for sample_idx in cluster:
                labels[sample_idx] = cluster_idx
        return labels

    def _create_clusters(self, centroids):
        """Group sample indices by their closest centroid."""
        clusters = [[] for _ in range(self.K)]
        for idx, sample in enumerate(self.X):
            clusters[self._closest_centroid(sample, centroids)].append(idx)
        return clusters

    def _closest_centroid(self, sample, centroids):
        """Return the index of the centroid nearest to ``sample``."""
        # Euclidean distance of the sample to every centroid in one call
        distances = np.linalg.norm(np.asarray(centroids) - sample, axis=1)
        return np.argmin(distances)

    def _get_centroids(self, clusters):
        """Return the mean of every cluster as the new centroids.

        An empty cluster keeps its previous centroid. Taking the mean of an
        empty selection would give nan, and because argmin treats nan as the
        minimum, that centroid would then attract every sample.
        """
        previous = self.centroids
        centroids = np.zeros((self.K, self.n_features))
        for cluster_idx, cluster in enumerate(clusters):
            if cluster:
                centroids[cluster_idx] = np.mean(self.X[cluster], axis=0)
            elif len(previous) > cluster_idx:
                centroids[cluster_idx] = previous[cluster_idx]
        return centroids

    def _is_converged(self, centroids_old, centroids):
        """Return whether no centroid moved between two iterations."""
        shift = np.linalg.norm(
            np.asarray(centroids_old) - np.asarray(centroids), axis=1
        )
        return shift.sum() == 0

    def plot(self):
        """Scatter-plot every cluster and mark the centroids with an 'x'."""
        fig, ax = plt.subplots(figsize=(12, 8))

        for cluster in self.clusters:
            ax.scatter(*self.X[cluster].T)

        for point in self.centroids:
            ax.scatter(*point, marker="x", color="black", linewidth=2)

        plt.show()

In [None]:
import numpy as np


class PCA:
    """Principal component analysis via the covariance eigen-decomposition.

    Args:
        n_components: Number of principal components to keep.

    Attributes:
        components: Array of shape (n_components, n_features) whose rows are
            the principal axes, ordered by decreasing variance; ``None``
            before :meth:`fit` is called.
        mean: Per-feature mean of the training data; ``None`` before
            :meth:`fit` is called.
    """

    def __init__(self, n_components):
        self.n_components = n_components
        self.components = None
        self.mean = None

    def fit(self, X):
        """Compute the principal axes of ``X`` (shape n_samples x n_features)."""
        X = np.asarray(X, dtype=float)

        # mean centering
        self.mean = np.mean(X, axis=0)
        centered = X - self.mean

        # np.cov expects variables as rows, hence the transpose; atleast_2d
        # covers the single-feature case where np.cov returns a 0-d array
        cov = np.atleast_2d(np.cov(centered.T))

        # The decomposition returns (eigenvalues, eigenvectors) in that
        # order. eigh is used because the covariance matrix is symmetric:
        # it guarantees real results, with eigenvector i in column [:, i].
        eigenvalues, eigenvectors = np.linalg.eigh(cov)

        # order the components by decreasing eigenvalue (explained variance)
        order = np.argsort(eigenvalues)[::-1]
        sorted_vectors = eigenvectors[:, order].T

        self.components = sorted_vectors[:self.n_components]

    def transform(self, X):
        """Project ``X`` onto the fitted principal components."""
        centered = np.asarray(X, dtype=float) - self.mean
        return np.dot(centered, self.components.T)