# 0. numpy broadcasting
- 브로드캐스팅(Broadcasting)은 모양이 다른 배열들 간의 연산이 어떤 조건을 만족했을 때 가능해지도록 배열을 자동적으로 변환하는 것
- 차원이 동일하거나 둘 중 하나가 1차원이면 배열 차원을 쭉 늘려서 대강 연산가능하게 해줌
- 글고 넘파이는 벡터연산을 지원해서 배열연산보다 빨라용
- https://github.com/alirezadir/Machine-Learning-Interviews/blob/main/src/MLC/notebooks/numpy_practice.ipynb

In [1]:
import numpy as np

# 1. Linear Regression
- https://github.com/alirezadir/Machine-Learning-Interviews/blob/main/src/MLC/notebooks/linear_regression.ipynb
- 정규방정식 사용한다.
    - W = (X_T*X)^(−1)*X_T*y

In [337]:
# y= wx+b
class LinearRegression:
    def __init__(self, regul=0.1):
        self.regul = regul
        self.W = None
        self.b = None
        
    def predict(self, x):
        return np.dot(x, self.W)+self.b
    
    def fit(self, X, Y, lr=0.01, num_iter=1000):
        n_samples, n_features= X.shape
        # bias, weight
        self.W = np.zeros(n_features)
        self.b = 0
        for _ in range(num_iter):
            #for x, y in zip(X, Y):
            pred_y= np.dot(X, self.W)+self.b
            # 회귀: MSE=(y-pred_y)**2
            dw= (1 / n_samples) * np.dot(X.T, (y_pred - y))#2*(y-pred_y)*x
            db = (1 / n_samples) * np.sum(y_pred - y)
            self.W -= lr * dw
            self.b -= lr * db
        

In [338]:
# git
class LinearRegressionGD:
    def __init__(self, regul=0):
        self.regul = regul
        self.W = None

    def fit(self, X, y, lr=0.01, num_iter=1000):
        # Input validation
        if len(X) != len(y) or len(X) == 0:
            raise ValueError("X and y must have the same length and cannot be empty")
        
        # Add bias term to X -> [1 X]
        X = np.hstack([np.ones((len(X), 1)), X])

        # Initialize W to zeros
        self.W = np.zeros(X.shape[1])

        # Use gradient descent to minimize cost function
        for i in range(num_iter):
            # Calculate predicted values
            y_pred = np.dot(X, self.W)

            # Calculate cost function
            cost = np.sum((y_pred - y) ** 2) + self.regul * np.sum(self.W ** 2)

            # Calculate gradients
            gradients = 2 * np.dot(X.T, (y_pred - y)) + 2 * self.regul * self.W

            # Update W
            self.W = self.W - lr * gradients

            if (i % 1000 == 0 ): print(cost)

    def predict(self, X):
        # Add bias term to X
        X = np.hstack([np.ones((len(X), 1)), X])

        # Calculate predicted values
        y_pred = np.dot(X, self.W)
        return y_pred

In [342]:
# interview
class LinearRegression:
    def __init__(self):
        self.W = None

    def fit(self, X, y):
        '''
        X: n x d 
        '''
        # Add bias term to X -> [1 X]
        n = X.shape[0]
        X = np.hstack([np.ones((n, 1)), X])
        self.W = np.linalg.inv(X.T @ X) @ X.T @ y

    def predict(self, X):
        n = X.shape[0]
        X = np.hstack([X, np.ones((n, 1))])
        return X @ self.W

def linear_regression(features, labels):
    # np.c_[a, b]: (column) 방향으로 배열을 결합 
    
    # a = np.array([1, 2, 3]) 
    # b = np.array([4, 5, 6])  
    # [[1 4]
    # [2 5]
    # [3 6]]
    
    # a = np.array([[1, 2, 3]]) 
    # b = np.array([[4, 5, 6]])
    # [[1 2 3 4 5 6]]
    
    # a = np.array([1, 2, 3])
    # b = np.array([4, 5, 6])
    # c = np.array([7, 8, 9])
    # [[1 4 7]
    # [2 5 8]
    # [3 6 9]]
    
    feat_with_bias= np.c_[
        np.ones((features.shapee[0], 1)),
        features
    ]
    # (X_T*X)^(−1)X_Ty
    weights= np.linalg.inv(feat_with_bias.T.dot(feat_with_bias)) \
            .dot(feat_with_bias.T) \
            .dot(labels)
    return weights

In [340]:
X = np.array([[1, 2, 3, 4, 5]]).T
y = np.array([2, 4, 5, 4, 5])
lr = LinearRegression(regul=0.1)
lr.fit(X, y, lr=0.01, num_iter=10000)
print(lr.W)  # Output: [ 1.99964292  0.65345474 ]
y_pred = lr.predict(X)
print(y_pred)  # # Output: [2.65309766, 3.3065524, 3.96000714, 4.61346188, 5.26691662]

[inf]
[inf inf inf inf inf]


In [341]:
lr = LinearRegressionGD(regul=0.1)
lr.fit(X, y, lr=0.01, num_iter=10000)
print(lr.W)  # Output: [ 1.99964292  0.65345474 ]
y_pred = lr.predict(X)
print(y_pred)  # # Output: [2.65309766, 3.3065524, 3.96000714, 4.61346188, 5.26691662]

86.0
2.879128727013035
2.879128727013033
2.879128727013033
2.879128727013033
2.879128727013033
2.879128727013033
2.879128727013033
2.879128727013033
2.879128727013033
[1.99964292 0.65345474]
[2.65309766 3.3065524  3.96000714 4.61346188 5.26691662]


# 2. Logistic Regression
- https://github.com/alirezadir/Machine-Learning-Interviews/blob/main/src/MLC/notebooks/logistic_regression.ipynb

In [297]:
# y= sig(wx+b)
class LogisticRegression:
    def __init__(self, learning_rate=0.01, n_iters=1000):
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.w = None
        self.b= None
    
    def sigmoid(self, z):
        return 1.0 / (1.0 + np.exp(-z))
    
    def predict(self, x):
        z= np.dot(x, self.w)+self.b
        pred_y= np.argmax(self.sigmoid(z))
        label= np.round(y_pred).astype(int)
        return label
    
    def fit(self, X, y):
        n_samples, n_features= X.shape
        # bias, weight 별도로 해야한다 왜 어떤건 같이하고 어떤건 따로하지 쩝
        #self.w= [np.ones(n_features), np.zeros(n_features)]
        self.w=np.zeros(n_features)
        self.b= 0
        for _ in range(self.n_iters):
            #for x, y in zip(X, Y): # 각각 안 할 거면
            z= np.dot(X, self.w)+self.b
            y_pred= self.sigmoid(z) #np.argmax(self.sigmoid(z))
            # 이진분류: BCE = -ylogP + (1-y)log(1-P)
            # dloss/dw= (-y) * y'(1-y')/y' * x + (1-y)* (y'(1-y'))/(1-y') * x
            # = (-y*(1-y')+(1-y)y') * x = (-y +yy'+y'-yy')*x = (y'-y) * x
#             dw= (pred_y- y)*x
#             w -= lr* dw
            dw = (1 / n_samples) * np.dot(X.T, (y_pred - y))
            db = (1 / n_samples) * np.sum(y_pred - y)
            self.w -= self.learning_rate * dw
            self.b -= self.learning_rate * db

In [298]:
# git 
class LogisticRegressionGD:
    
    def __init__(self, learning_rate=0.01, n_iters=1000):
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None
        
    def fit(self, X, y):
        # initialize weights and bias to zeros
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        
        # gradient descent optimization
        for i in range(self.n_iters):
            # calculate predicted probabilities and cost
            z = np.dot(X, self.weights) + self.bias
            y_pred = self._sigmoid(z)
            cost = (-1 / n_samples) * np.sum(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))
            
            # calculate gradients
            dw = (1 / n_samples) * np.dot(X.T, (y_pred - y))
            db = (1 / n_samples) * np.sum(y_pred - y)
            
            # update weights and bias
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db
            
    def predict(self, X):
        # calculate predicted probabilities
        z = np.dot(X, self.weights) + self.bias
        y_pred = self._sigmoid(z)
        # convert probabilities to binary predictions
        return np.round(y_pred).astype(int)
    
    def _sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

In [299]:
# interview

In [300]:
# create sample dataset
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
y = np.array([0, 0, 1, 1, 1])

# initialize logistic regression model
lr = LogisticRegression()

# train model on sample dataset
lr.fit(X, y)

# make predictions on new data
X_new = np.array([[6, 7], [7, 8]])
y_pred = lr.predict(X_new)

print(y_pred)  # [1, 1]

[1 1]


In [296]:
# ans
lr = LogisticRegressionGD()

# train model on sample dataset
lr.fit(X, y)

# make predictions on new data
X_new = np.array([[6, 7], [7, 8]])
y_pred = lr.predict(X_new)

print(y_pred)  # [1, 1]

[1 1]


# 3. Naive Bayes
https://johnjdailey.medium.com/naive-bayes-classifier-built-in-python-with-numpy-9f05ec26e373 
- 어떻게 구현하는지 알아놓기만..

# 4. K-means Clustering
- https://github.com/alirezadir/Machine-Learning-Interviews/blob/main/src/MLC/notebooks/k_means.ipynb

In [280]:
class KMeansClustering:
    def __init__(self, k=3, max_iter=10):
        self.centroids = None
        self.k =k
        self.max_iter= max_iter

    def predict(self, X):
        # Assign each data point to the nearest centroid
        cluster_assignments = []
        for j in range(len(X)):
            distances = np.linalg.norm(X[j] - self.centroids, axis=1)
            cluster_assignments.append(np.argmin(distances))
        return cluster_assignments
        
    def fit(self, X):
        self.centroids= X[np.random.choice(range(len(X)), self.k, replace=False)]
        #np.random.choice(X, self.k, replace=False)

        for i in range(self.max_iter):
            # 각 데이터 샘플에서 centroids까지의 거리 구하여 할당하기
            for j in range(len(X)):
                dist= np.linalg.norm(self.centroids-X[j], axis=1)
                assigned= np.argmin(dist)
            #if np.array_equal(prev_assigned, assigned): #np.array_equal는 np배열만 가능함
            if i > 0 and np.array_equal(prev_assinged, assigned):
                break
            # 수렴하지 않았으면 각 클러스터 평균좌표로 센트로이드 업데이트
            prev_assinged = np.copy(assigned)
            for k in range(self.k):
                data= X[np.where(assigned==k)]
                if len(data):
                    distances= np.linalg.norm(self.centroids[i]-data, axis=1)
                    self.centroids[k]= np.mean(distances, axis=0)

In [274]:
# git
class KMeans:
    def __init__(self, k, max_iterations=100):
        self.k = k
        self.max_iterations = max_iterations
        
    def fit(self, X):
        # Initialize centroids randomly
        self.centroids = X[np.random.choice(range(len(X)), self.k, replace=False)]
        
        for i in range(self.max_iterations):
            # Assign each data point to the nearest centroid
            cluster_assignments = []
            for j in range(len(X)):
                distances = np.linalg.norm(X[j] - self.centroids, axis=1)
                cluster_assignments.append(np.argmin(distances))
            
            # Update centroids
            for k in range(self.k):
                cluster_data_points = X[np.where(np.array(cluster_assignments) == k)]
                if len(cluster_data_points) > 0:
                    self.centroids[k] = np.mean(cluster_data_points, axis=0)
            
            # Check for convergence
            if i > 0 and np.array_equal(self.centroids, previous_centroids):
                break
            
            # Update previous centroids
            previous_centroids = np.copy(self.centroids)
        
        # Store the final cluster assignments
        self.cluster_assignments = cluster_assignments
    
    def predict(self, X):
        # Assign each data point to the nearest centroid
        cluster_assignments = []
        for j in range(len(X)):
            distances = np.linalg.norm(X[j] - self.centroids, axis=1)
            cluster_assignments.append(np.argmin(distances))
        
        return cluster_assignments

In [275]:
# interview
# inputs:  2D array(n, d) d차원 샘플 n개
# centroids: (k, d) d차원 센트로이드 k개
# np.linalg.norm(inputs-centroids, axis=2) 차원 달라서 계산 안 됨 브로드캐스팅 되도록 해야함
# inputs[:, np.newaxis]: inputs의 차원을 하나 더 늘리는 효과(n,1,d)
# inputs[:, np.newaxis]-centroids 결과 (n,k,d)가 되고, np.linalg.norm(-,axis=2) 마지막 축을 따라 유클리드 거리를 계산 (n,k)
def kmeans(inputs, k, max_iters=100):
    centroids= inputs[np.random.choice(range(len(input)), size=k, replace=False)]
    
    for _ in range(max_iters):
        distances= np.linalg.norm(inputs[:, np.newaxis]-centroids, axis=2)
        labels= np.argmin(distances, axis=1)
        new_centroids= np.array([inputs[labels==y].mean(axis=0) for i in range(k)])
        if np.allclose(centroids, new_centroids):# 두 배열이 허용오차 내에서 요소별로 동일한 경우 True를 반환
            break
        centroids= new_centroids
        
    return centroids, labels

In [281]:
x1 = np.random.randn(5,2) + 5
x2 = np.random.randn(5,2) - 5
X = np.concatenate([x1,x2], axis=0)

# Initialize the KMeans object with k=3
kmeans = KMeansClustering(k=2)

# Fit the k-means model to the dataset
kmeans.fit(X)

# Get the cluster assignments for the input dataset
cluster_assignments = kmeans.predict(X)

# Print the cluster assignments
print(cluster_assignments)

# Print the learned centroids
print(kmeans.centroids)

[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
[[0.         0.        ]
 [4.8029376  5.45459269]]


  data= X[np.where(assigned==k)]


In [258]:
x1 = np.random.randn(5,2) + 5
x2 = np.random.randn(5,2) - 5
X = np.concatenate([x1,x2], axis=0)
# Initialize the KMeans object with k=3
kmeans = KMeans(k=2)

# Fit the k-means model to the dataset
kmeans.fit(X)

# Get the cluster assignments for the input dataset
cluster_assignments = kmeans.predict(X)

# Print the cluster assignments
print(cluster_assignments)

# Print the learned centroids
print(kmeans.centroids)

[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
[[-4.35239932 -5.6328419 ]
 [ 4.86049412  4.67505199]]


# 5. KNN
- https://github.com/alirezadir/Machine-Learning-Interviews/blob/main/src/MLC/notebooks/knn.ipynb

In [None]:
class KNN:
    def __init__(self, X, k=5):
        self.k= k
        self.X= X
        
    def predict(self, x):
        # 최근접 5개 데이터의 레이블 (학습할 게 있나..?)
        distances= np.array(np.linalg.norm(self.centroids[i]-data, axis=2))
        label= np.mean(np.argsort(distances[:k]))
        # 평균X 최빈 레이블O
        return label

In [None]:
# git
from collections import Counter
import numpy as np

class KNN:
    def __init__(self, k=3, distance='euclidean'):
        self.k = k
        self.distance = distance
        
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
        
    def predict(self, X_test):
        y_pred = []
        for x in X_test:
            # Compute distances between the test point and all training points
            if self.distance == 'euclidean':
                distances = np.linalg.norm(self.X_train - x, axis=1)
            elif self.distance == 'manhattan':
                distances = np.sum(np.abs(self.X_train - x), axis=1)
            else:
                distances = np.power(np.sum(np.power(np.abs(self.X_train - x), self.distance), axis=1), 1/self.distance)
                
            # Select the k nearest neighbors
            nearest_indices = np.argsort(distances)[:self.k]
            nearest_labels = self.y_train[nearest_indices]
            
            # Assign the class label that appears most frequently among the k nearest neighbors
            label = Counter(nearest_labels).most_common(1)[0][0]
            y_pred.append(label)
        
        return np.array(y_pred)

In [None]:
# interview
def knn(query, candidates, k):
    dist=np.linalg.norm(candidates-query, axis=1)
    # np.argsort(dist)[:k] # 배열을 정렬하고 그 정렬된 배열의 인덱스를 반환,정확한 순서가 필요할 때
    # 배열에서 k번째로 작은 원소를 기준으로 배열을 부분적으로 정렬,순서가 중요하지 않고 상위 k개의 원소만 필요할 때
    return np.argpartition(dist, k)[:k] 

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the iris dataset
iris = load_iris()

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

# Create a KNN classifier with k=5 and euclidean distance
knn = KNN(k=5, distance='euclidean')

# Train the classifier on the training data
knn.fit(X_train, y_train)

# Make predictions on the test data
y_pred = knn.predict(X_test)

# Compute the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the iris dataset
iris = load_iris()

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(iris.data[:, :2], iris.target, test_size=0.2, random_state=42)

# Create a KNN classifier with k=5 and euclidean distance
knn = KNN(k=5, distance='euclidean')

# Train the classifier on the training data
knn.fit(X_train, y_train)

# Make predictions on the test data
y_pred = knn.predict(X_test)

# Create scatter plots of the test data with colored points representing the true and predicted labels
fig, ax = plt.subplots()
scatter1 = ax.scatter(X_test[y_test==0, 0], X_test[y_test==0, 1], c='b', cmap='viridis', label=iris.target_names[0])
scatter2 = ax.scatter(X_test[y_test==1, 0], X_test[y_test==1, 1], c='g', cmap='viridis', label=iris.target_names[1])
scatter3 = ax.scatter(X_test[y_test==2, 0], X_test[y_test==2, 1], c='r', cmap='viridis', label=iris.target_names[2])
scatter4 = ax.scatter(X_test[:, 0], X_test[:, 1], c='k', cmap='viridis', marker='x', label='Predicted Label')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_title('KNN Classifier Results')
handles = [scatter1, scatter2, scatter3, scatter4]
labels = [h.get_label() for h in handles]
ax.legend(handles=handles, labels=labels)
plt.show()

# 6. Decision Tree(★★★★★)
- https://github.com/alirezadir/Machine-Learning-Interviews/blob/main/src/MLC/notebooks/decision_tree.ipynb
- Gini 계수가 낮을수록 노드의 순도가 높다

In [252]:
class DecisionTree_my:
    def __init__(self, max_depth=10):
        self.max_depth=max_depth
        self.root= None
        
    def _gini(self, y): 
#         classes= np.unique(y)[0]
#         classed_portions= np.array([ np.sum(np.where(y==c))/np.sum(y) for c in classes])
#         gini= np.sum([cp**2 in cp in classed_portions])
#         return gini
        _, counts = np.unique(y, return_counts=True)
        impurity = 1 - np.sum([(count / len(y)) ** 2 for count in counts])
        return impurity
    
    def fit(self, X, y):
        self.n_classes_ = len(np.unique(y))
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, y)
    
    def _grow_tree(self, X, y, depth=0):
        num_samples_per_class = [np.sum(y == i) for i in range(self.n_classes_)]
        predicted_class = np.argmax(num_samples_per_class)
        node= Node_my(predicted_class=predicted_class) 
        if depth < self.max_depth:
            idx, thr = self._best_split(X, y)
            if idx is not None:
                indices_left = X[:, idx] < thr
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node.feature_index = idx
                node.threshold = thr
                node.left = self._grow_tree(X_left, y_left, depth + 1)
                node.right = self._grow_tree(X_right, y_right, depth + 1)
        return node

    
    def _best_split(self, X, y):
        # best_gini, best_feature_idx, best_feature_threshold= self.gini(y), -1, -1
        m = y.size
        if m <= 1:
            return None, None
        
        num_parent = [np.sum(y == c) for c in range(self.n_classes_)]
        best_gini = 1.0 - sum((n / m) ** 2 for n in num_parent)
        best_idx, best_thr = None, None
        # 이 데이터 기준분할 X 전체에 대해
        # n_samples, n_features= X.shape
        for i in range(self.n_features_):
            # data = X[:, i] 스킵하려면 정렬해야함~ 
            thresholds, classes = zip(*sorted(zip(X[:, i], y)))
            num_left = [0] * self.n_classes_
            num_right = num_parent.copy()
            for j in range(1, m):
#                 if thresholds[i] == thresholds[i - 1]: thresholds[i]가 0일 때 계산 안하고 넘어가버림..?
#                     continue
#                 gini_l = self.gini(y[:j]) / (j+1)
#                 gini_r = self.gini(y[j+1:]) / (n_samples-j) 가중치(전체 대비 부분셋 크기)여야지 나누는게 아니라..
                c = classes[j - 1]
                num_left[c] += 1
                num_right[c] -= 1
                gini_left = 1.0 - sum(
                    (num_left[x] / j) ** 2 for x in range(self.n_classes_)
                )
                gini_right = 1.0 - sum(
                    (num_right[x] / (m - j)) ** 2 for x in range(self.n_classes_)
                )
                gini = (j * gini_left + (m - j) * gini_right) / m
                if thresholds[j] == thresholds[j-1]: 
                    continue
                if gini < best_gini:
                    best_gini, best_idx =gini, i 
                    best_thr = (thresholds[j] + thresholds[j - 1]) / 2 # thresholds[j]
        return best_idx, best_thr
                
    def predict(self, X):
        return [self._predict(inputs) for inputs in X]
    
    def _grow_tree(self, X, y, depth=0):
        num_samples_per_class = [np.sum(y == i) for i in range(self.n_classes_)]
        predicted_class = np.argmax(num_samples_per_class)
        node = Node(predicted_class=predicted_class)
        if depth < self.max_depth:
            idx, thr = self._best_split(X, y)
            if idx is not None:
                indices_left = X[:, idx] < thr
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node.feature_index = idx
                node.threshold = thr
                node.left = self._grow_tree(X_left, y_left, depth + 1)
                node.right = self._grow_tree(X_right, y_right, depth + 1)
        return node
        
    def _predict(self, inputs):
        node = self.tree_
        while node.left:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.predicted_class
    
class Node:
    def __init__(self, *, predicted_class):
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0.0 
        self.left = None
        self.right = None

    def is_leaf_node(self):
        return self.left is None and self.right is None

In [253]:
# git
class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        
    def fit(self, X, y):
        self.n_classes_ = len(np.unique(y))
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, y)
        
    def predict(self, X):
        return [self._predict(inputs) for inputs in X]
        
    def _gini(self, y):
        _, counts = np.unique(y, return_counts=True)
        impurity = 1 - np.sum([(count / len(y)) ** 2 for count in counts])
        return impurity
        
    def _best_split(self, X, y):
        m = y.size
        if m <= 1:
            return None, None
        
        num_parent = [np.sum(y == c) for c in range(self.n_classes_)]
        best_gini = 1.0 - sum((n / m) ** 2 for n in num_parent)
        best_idx, best_thr = None, None
        
        for idx in range(self.n_features_):
            thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
            num_left = [0] * self.n_classes_
            num_right = num_parent.copy()
            for i in range(1, m):
                c = classes[i - 1]
                num_left[c] += 1
                num_right[c] -= 1
                gini_left = 1.0 - sum(
                    (num_left[x] / i) ** 2 for x in range(self.n_classes_)
                )
                gini_right = 1.0 - sum(
                    (num_right[x] / (m - i)) ** 2 for x in range(self.n_classes_)
                )
                gini = (i * gini_left + (m - i) * gini_right) / m
                if thresholds[i] == thresholds[i - 1]:
                    continue
                if gini < best_gini:
                    best_gini = gini
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2
        return best_idx, best_thr
        
    def _grow_tree(self, X, y, depth=0):
        num_samples_per_class = [np.sum(y == i) for i in range(self.n_classes_)]
        predicted_class = np.argmax(num_samples_per_class)
        node = Node(predicted_class=predicted_class)
        if depth < self.max_depth:
            idx, thr = self._best_split(X, y)
            if idx is not None:
                indices_left = X[:, idx] < thr
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node.feature_index = idx
                node.threshold = thr
                node.left = self._grow_tree(X_left, y_left, depth + 1)
                node.right = self._grow_tree(X_right, y_right, depth + 1)
        return node
        
    def _predict(self, inputs):
        node = self.tree_
        while node.left:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.predicted_class
    
class Node:
    def __init__(self, *, predicted_class):
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0.0 
        self.left = None
        self.right = None

    def is_leaf_node(self):
        return self.left is None and self.right is None

In [254]:
# interview
# 정보이득: 특정 피처값을 기반으로 데이터셋을 분할했을 때 얻는 엔트로피의 감소량
def entropy(labels):
    # np.unique(arr) 고유 요소 추출: 2D 배열에서 고유한 행이나 열을 찾기
    # return_index: 고유 요소가 원래 배열에서 나타나는 첫 번째 인덱스를 반환합니다.
    # return_inverse: 원래 배열을 재구성하는 데 사용할 수 있는 인덱스를 반환합니다.
    # return_counts: 각 고유 요소의 빈도수를 반환합니다.
    _, counts= np.unique(labels, return_counts=True)
    probs= counts/len(labels)
    entropy= -np.sum(probs*np.log2(probs))
    return entropy

def information_gain(features, labels, split_index):
    feat_split= features[:, split_index]
    unique_values, counts= np.unique(feat_split, return_counts=True)
    probs = counts / len(feat_split)
    weight_entropies = [
        prob * entropy(labels[feat_split==value]) for value, prob in zip(unique_values, probs)
    ]
    return entropy(labels) - np.sum(weight_entropies)

In [255]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [256]:
# Train the decision tree
tree = DecisionTree_my(max_depth=3)
tree.fit(X_train, y_train)

# Make predictions on the test set
y_pred = tree.predict(X_test)

# Compute the accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")

Accuracy: 1.0


In [241]:
# ans
tree = DecisionTree(max_depth=3)
tree.fit(X_train, y_train)

# Make predictions on the test set
y_pred = tree.predict(X_test)

# Compute the accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")

Accuracy: 1.0


# 7. Random Forest
https://github.com/sachaMorin/np-random-forest/blob/master/Forest.py 
- 어떻게 구현하는지 알아놓기만..
```
max_depth(int): Max depth of trees.
no_trees(int): Number of trees.
min_samples_split(int): Number of samples in a node to allow
split search.
min_samples_leaf(int): Number of samples to be deemed a leaf node.
feature_search(int): Number of features to search when splitting.
bootstrap(boolean): Resample dataset with replacement
```

# 8. SVM(★★★★★)
- https://github.com/alirezadir/Machine-Learning-Interviews/blob/main/src/MLC/notebooks/svm.ipynb
- Hinge Loss(★★★★★)
    - max(0,1−y⋅f(x))
    - 주로 분류 문제, 특히 Support Vector Machine(SVM)에서 사용
- Huber Loss
    - 0.5*∣a∣^2 if ∣a∣≤δ else δ(∣a∣-0.5*δ)
    - 회귀, 작은 오류에 대해서는 MSE처럼 동작하고, 큰 오류에 대해서는 MAE처럼 동작
    - 작은 오류에 대해서는 제곱 오차를, 큰 오류에 대해서는 절대 오차를 사용하여 이상치에 덜 민감하게 합니다.

In [101]:
class SVM_my:
    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=1000):
        self.w= None
        self.b= None
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        
    def predict(self, X):
        z= np.dot(X, self.w)+self.b
        return np.sign(z) # np.where(z < 0 , -1, 1)
    
    def fit(self, X, y):
        n_samples, n_features= X.shape
        #self.w, self.b= np.zeros(n_features), np.ones(n_features)
        y_ = np.where(y <= 0, -1, 1)
        self.w = np.zeros(n_features)
        self.b = 0
        
        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                # pred_y= np.dot(self.w.T, X)+self.b
                # huber_loss = max(0, 1- np.abs(y-pred_y))
                # hinge_loss임 huber_loss도 아님ㅜㅜㅋㅋㅋ
                condition = y_[idx] * (np.dot(x_i, self.w) - self.b) >= 1
                if condition:
                    self.w -= self.lr * (2 * self.lambda_param * self.w)
                    # dw= X + regl * 2 * np.sum(w)
                    # db= n_samples
                else:
                    self.w -= self.lr * (2 * self.lambda_param * self.w - np.dot(x_i, y_[idx]))
                    self.b -= self.lr * y_[idx]
                    # dw= regl * 2 * np.sum(w)
#                 self.w -= lr * dw
#                 self.b -= lr * db
                

In [102]:
# git
class SVM:
    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=1000):
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.w = None
        self.b = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        y_ = np.where(y <= 0, -1, 1)
        self.w = np.zeros(n_features)
        self.b = 0

        # Gradient descent
        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                condition = y_[idx] * (np.dot(x_i, self.w) - self.b) >= 1
                if condition:
                    self.w -= self.lr * (2 * self.lambda_param * self.w)
                else:
                    self.w -= self.lr * (2 * self.lambda_param * self.w - np.dot(x_i, y_[idx]))
                    self.b -= self.lr * y_[idx]

    def predict(self, X):
        linear_output = np.dot(X, self.w) - self.b
        return np.sign(linear_output)

In [103]:
# Example usage
from sklearn.metrics import accuracy_score
from sklearn import datasets
from sklearn.model_selection import train_test_split

X, y = datasets.make_blobs(n_samples=100, centers=2, random_state=42)
y = np.where(y == 0, -1, 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svm = SVM_my()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)


# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [104]:
# ans
from sklearn.metrics import accuracy_score
from sklearn import datasets
from sklearn.model_selection import train_test_split

X, y = datasets.make_blobs(n_samples=100, centers=2, random_state=42)
y = np.where(y == 0, -1, 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svm = SVM()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)


# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


# 9. FFNN
- https://github.com/alirezadir/Machine-Learning-Interviews/blob/main/src/MLC/notebooks/feedforward.ipynb

In [75]:
class TwoLayerNet_my:
    # 모델 파라미터를 받아야한다)
    def __init__(self,input_size=2, hidden_size=10, output_size=2):
        #self.params={'w1':None, 'w2':None, 'b1':None, 'b2':None}
        self.params = {}
        self.params['W1'] = np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)
    
    def softmax(self, z):
#         exp=np.exp(z)
#         exp_sum= np.sum(exp)
#         probs= np.array([exp[i]/exp_sum for i in range(len(z))])
        # sample별 probs 뽑아내야하기 때문에 probs 1차원 배열로 반환하면 X 
        # predictions = np.argmax(probs, axis=1)에서 각 샘플에 대해 가장 높은 확률의 인덱스를 찾음
        exp_z = np.exp(z)
        probs = exp_z / np.sum(exp_z, axis=1, keepdims=True)
        return probs
    
    def forward(self, X):
        z1 = np.dot(X, self.params['W1']) + self.params['b1']
        a1 = np.where(z1 < 0, 0, z1) # ReLU
        z2 = np.dot(a1, self.params['W2']) +self.params['b2']
        #a2 = np.where(z2 < 0, 0, z2) # 마지막 층에는 softmax만 취함 활성화X
        pred_y= self.softmax(z2)
        return pred_y
        
    
    def train(self, X, y, num_epochs=1000, regl=0.03, lr=0.01):
#         n_samples, n_features= X.shape
#         self.params['w1']=np.zeros(n_features)
#         self.params['w2']=np.zeros(n_features)
#         self.params['b1']=np.ones(n_features)
#         self.params['b2']=np.ones(n_features)        
        for _ in range(num_epochs):
            # minibatch
            z1 = np.dot(X, self.params['W1']) + self.params['b1']
            a1 = np.where(z1 < 0, 0, z1) # ReLU: np.maximum(0, z1)
            z2 = np.dot(a1, self.params['W2']) +self.params['b2']
            probs= self.softmax(z2)
            
            delta3 = probs
            delta3[range(len(X)), y] -= 1
            dw2 = np.dot(a1.T, delta3) # np.dot(delta3, a1)
            db2 = np.sum(delta3, axis=0) # delta3
            
            delta2 = np.dot(delta3, self.params['W2'].T)*(a1 > 0)
            dw1 =  np.dot(X.T, delta2) # np.dot(delta2, X)
            db1 = np.sum(delta2, axis=0) # delta2
            
            # regularization(regl),optimization(lr)
            self.params['W1'] -= lr*dw1
            self.params['b1'] -= lr*db1
            self.params['W2'] -= lr*dw2
            self.params['b2'] -= lr*db2        
        

In [76]:
# git
import numpy as np

class TwoLayerNet:
    def __init__(self, input_size, hidden_size, output_size):
        self.params = {}
        self.params['W1'] = np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def forward(self, X):
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        z1 = np.dot(X, W1) + b1
        a1 = np.maximum(0, z1) # ReLU activation function
        z2 = np.dot(a1, W2) + b2
        # probs = 1 / (1 + np.exp(-z2)) # Sigmoid activation function
        exp_z = np.exp(z2)
        probs = exp_z / np.sum(exp_z, axis=1, keepdims=True)
        return probs

    def loss(self, X, y):
        probs = self.forward(X)
        correct_logprobs = -np.log(probs[range(len(X)), y])
        data_loss = np.sum(correct_logprobs)
        return 1.0/len(X) * data_loss

    def train(self, X, y, num_epochs, learning_rate=0.1):
        for epoch in range(num_epochs):
            # Forward propagation
            z1 = np.dot(X, self.params['W1']) + self.params['b1']
            a1 = np.maximum(0, z1) # ReLU activation function
            z2 = np.dot(a1, self.params['W2']) + self.params['b2']
            # probs = 1 / (1 + np.exp(-z2)) # Sigmoid activation function
            exp_z = np.exp(z2)
            probs = exp_z / np.sum(exp_z, axis=1, keepdims=True)

            # Backpropagation
            delta3 = probs
            delta3[range(len(X)), y] -= 1
            dW2 = np.dot(a1.T, delta3)
            db2 = np.sum(delta3, axis=0)
            delta2 = np.dot(delta3, self.params['W2'].T) * (a1 > 0) # derivative of ReLU
            dW1 = np.dot(X.T, delta2)
            db1 = np.sum(delta2, axis=0)

            # Update parameters
            self.params['W1'] -= learning_rate * dW1
            self.params['b1'] -= learning_rate * db1
            self.params['W2'] -= learning_rate * dW2
            self.params['b2'] -= learning_rate * db2

            # Print loss for monitoring training progress
            if epoch % 100 == 0:
                loss = self.loss(X, y)
                print("Epoch {}: loss = {}".format(epoch, loss))

In [77]:
# Generate a toy dataset
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([0, 1, 1, 0])

# Initialize a neural network
net = TwoLayerNet_my(input_size=2, hidden_size=10, output_size=2)

# Train the neural network
net.train(X, y, num_epochs=1000)

# Test the neural network
probs = net.forward(X)
predictions = np.argmax(probs, axis=1)
print("Predictions: ", predictions)

Predictions:  [0 1 1 0]


In [78]:
# ans
# Generate a toy dataset
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([0, 1, 1, 0])

# Initialize a neural network
net = TwoLayerNet(input_size=2, hidden_size=10, output_size=2)

# Train the neural network
net.train(X, y, num_epochs=1000)

# Test the neural network
probs = net.forward(X)
predictions = np.argmax(probs, axis=1)
print("Predictions: ", predictions)

Epoch 0: loss = 0.4687468424676198
Epoch 100: loss = 0.016902136699540674
Epoch 200: loss = 0.007560813257671267
Epoch 300: loss = 0.004648987758144024
Epoch 400: loss = 0.003279740741363126
Epoch 500: loss = 0.002501064671430566
Epoch 600: loss = 0.002003939050550834
Epoch 700: loss = 0.0016631743024980625
Epoch 800: loss = 0.001415092124546393
Epoch 900: loss = 0.0012278644788697665
Predictions:  [0 1 1 0]


# 10. Perceptron
- https://github.com/alirezadir/Machine-Learning-Interviews/blob/main/src/MLC/notebooks/perceptron.ipynb

In [30]:
class Perceptron_my:
    def __init__(self, lr=0.01, n_iter=100):
        self.w=None
        self.lr = lr
        self.n_iter = n_iter
        
    def predict(self, X):
        z= np.dot(X, self.w[1:])+self.w[0]
        y= np.where(z  >= 0.0, 1, -1) #np.where(z < 0, 0, 1)
        return y
    
    def fit(self, X, Y):
        n_samples, n_features= X.shape
        self.w= np.zeros(1 + X.shape[1]) # np.array([np.ones(n_features), np.zeros(n_features)])
        for _ in range(self.n_iter):
            for x, y in zip(X, Y):
                #sign까지 통과한 pred 값으로 error 구해야함, abs아님
                #z= np.dot(X, self.w[1:])+self.w[0]
                #dw= 2*np.abs(y-self.predict(x))
                delta= y-self.predict(x)
                # weights = delta * x, bias = sum(delta)
                self.w[1:] += self.lr*delta*x # -= 아님(정답 방향으로 이동할 것 인데 y-y'했으니까 음수차이나면 음수 그대로 누적)
                self.w[0] += self.lr*delta

In [31]:
# git
import numpy as np

class Perceptron:
    def __init__(self, lr=0.01, n_iter=100):
        self.lr = lr
        self.n_iter = n_iter

    def fit(self, X, y):
        self.weights = np.zeros(1 + X.shape[1])
        self.errors = []

        for _ in range(self.n_iter):
            errors = 0
            for xi, target in zip(X, y):
                update = self.lr * (target - self.predict(xi))
                self.weights[1:] += update * xi
                self.weights[0] += update
                errors += int(update != 0.0)
            self.errors.append(errors)
        return self

    def net_input(self, X):
        return np.dot(X, self.weights[1:]) + self.weights[0]

    def predict(self, X):
        return np.where(self.net_input(X) >= 0.0, 1, -1)

In [32]:
X = np.array([[2.0, 1.0], [3.0, 4.0], [4.0, 2.0], [3.0, 1.0]])
y = np.array([-1, 1, 1, -1])
perceptron = Perceptron_my()
perceptron.fit(X, y)

new_X = np.array([[5.0, 2.0], [1.0, 3.0]])
perceptron.predict(new_X)

array([-1,  1])

In [33]:
#ans
X = np.array([[2.0, 1.0], [3.0, 4.0], [4.0, 2.0], [3.0, 1.0]])
y = np.array([-1, 1, 1, -1])
perceptron = Perceptron()
perceptron.fit(X, y)

new_X = np.array([[5.0, 2.0], [1.0, 3.0]])
perceptron.predict(new_X)

array([-1,  1])

# 11. 최적화 기법들
1. Adam Optimizer
2. Regulation
3. GD/SGD
4. Initialization
5. Validation

# 12. 평가코드 템플릿
- cost, 일정 epoch마다 출력