# K-Nearest Neighbors (KNN)
- One of the basic supervised learning algorithms.
- Also called **lazy learning** or **memory-based learning** because, in the training phase, it just stores the training data -> ***Non-parametric learning***
- In the prediction phase, the model uses one of several types of distance metrics (distance measures):
    + Euclidean
    + Chebyshev
    + Manhattan
    + Minkowski
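- The chosen metric is used to calculate the distance between the input and every sample stored in the model. The samples are then ranked by distance and the **K** nearest ones are selected. Finally, the output is decided by voting among these K neighbors (majority vote for classification, averaging for regression).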
![Alt text](resources/training_and_testing_1.png)

- The KNN algorithm is widely used for classification and regression applications. 
![Alt text](resources/training_and_testing_2.png)

In [None]:
# Question 1D
# Question 2C

# Question 3 - B

In [None]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the iris dataset (return_X_y=True gives the features and the class
# labels as two separate arrays)
iris_X, iris_y = datasets.load_iris(return_X_y=True)

# Split train:test = 8:2 (random_state is fixed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
iris_X,
iris_y,
test_size=0.2,
random_state=42
)
# Scale the features using StandardScaler.
# The scaler is fitted on the training split only and then reused on the
# test split, so no test-set statistics leak into the preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Build KNN Classifier (each prediction is voted on by the 5 nearest neighbors)

knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)

# Predict and Evaluate test set.
# The accuracy is left as a bare expression so the notebook cell displays it.
y_pred = knn_classifier.predict(X_test)
accuracy_score(y_test, y_pred)


In [None]:
# Question 4B

# Question 5 - A

In [None]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error


# NOTE: the "Paragraph A/D/B/C" labels below name the pipeline steps; the
# steps run in the order A -> D -> B -> C.

# Paragraph A - Load the diabetes dataset (features and regression targets
# as two separate arrays)
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

# Paragraph D: Split train:test = 8:2 (random_state is fixed so the split is
# reproducible)
X_train, X_test, y_train, y_test = train_test_split(
diabetes_X,
diabetes_y,
test_size=0.2,
random_state=42
)

# Paragraph B: Scale the features using StandardScaler.
# The scaler is fitted on the training split only and then reused on the
# test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Paragraph C: Build KNN model (regression over the 5 nearest neighbors)
knn_regressor = KNeighborsRegressor(n_neighbors=5)
knn_regressor.fit(X_train, y_train)

# Predict and Evaluate test set.
# The mean squared error is left as a bare expression so the notebook cell
# displays it.
y_pred = knn_regressor.predict(X_test)
mean_squared_error(y_test, y_pred)

In [None]:
# Question 6D

# Question 7 - C

In [None]:
# Import library
import numpy as np
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
# accuracy_score is used at the end of this cell; importing it here keeps the
# cell runnable on its own instead of relying on an earlier cell's import.
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Load IMDB dataset and take its predefined train / test splits
imdb = load_dataset("imdb")
imdb_train, imdb_test = imdb['train'], imdb['test']

# Convert text to vector using BoW.
# The vocabulary is capped at 1000 terms and is learned from the training
# texts only; the test texts are encoded with that same vocabulary.
vectorizer = CountVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(imdb_train['text']).toarray()
X_test = vectorizer.transform(imdb_test['text']).toarray()
y_train = np.array(imdb_train['label'])
y_test = np.array(imdb_test['label'])

# Scale the features using StandardScaler (fitted on the training split only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build KNN Classifier (1 nearest neighbor, ball-tree neighbor search)
knn_classifier = KNeighborsClassifier(n_neighbors=1, algorithm='ball_tree')
knn_classifier.fit(X_train, y_train)

# predict test set and evaluate.
# The accuracy is left as a bare expression so the notebook cell displays it.
y_pred = knn_classifier.predict(X_test)
accuracy_score(y_test, y_pred)

# K-Means
- K-Means is a popular clustering algorithm widely used in machine learning and data mining tasks. The goal of K-means is to divide **n data points** into **k clusters** such that the points in the same cluster have the highest similarity.
- The K-means process consists of the following steps:
    + **Initialization:** Randomly select ***k points*** from the dataset as the initial cluster centroids.
    + **Assignment:** For each data point, calculate the distance from this point to all cluster centroids and assign it to the closest cluster. The distance is calculated using the ***Euclidean distance***.
    + **Update centroids:** Update the centroid of each cluster by calculating the mean of the points in that cluster
    + **Repeat**: Repeat the assignment and centroid update steps until the centroids no longer change or the change is minimal.

In [None]:
from sklearn.datasets import load_iris
import numpy as np
import matplotlib.pyplot as plt

# Toy dataset for the K-Means demo: 8 samples (rows) with 3 features (columns).
# Note that the KMeans plotting methods in this notebook draw only the first
# two columns.
data = np.array([
    [2.0, 3.0, 1.5],
    [3.0, 3.5, 2.0],
    [3.5, 3.0, 2.5],
    [7.5, 8.0, 7.5],
    [8.5, 8.5, 8.0],
    [9.0, 8.0, 8.5],
    [1.0, 2.0, 1.0],
    [1.5, 2.5, 1.5]
])

In [None]:
import numpy as np
import matplotlib.pyplot as plt

class KMeans:
    """Minimal K-Means clustering with optional per-iteration plotting.

    The algorithm picks ``k`` samples as initial centroids, then alternates
    between assigning every sample to its nearest centroid (Euclidean
    distance) and moving each centroid to the mean of its samples, until the
    centroids stop moving or ``max_iters`` is reached.

    Args:
        k: Number of clusters.
        max_iters: Maximum number of assignment/update rounds.
        tol: Largest absolute per-coordinate centroid movement that still
            counts as "converged". The default ``0.0`` stops only when the
            centroids are exactly unchanged.

    Attributes set by ``fit``:
        centroids: ``(k, n_features)`` array of cluster centers.
        clusters: ``(n_samples,)`` array with the cluster index of each sample.
    """

    def __init__(self, k=3, max_iters=100, tol=0.0):
        self.k = k
        self.max_iters = max_iters
        self.tol = tol
        self.centroids = None
        self.clusters = None

    def initialize_centroids(self, data):
        """Pick ``k`` distinct rows of ``data`` as the initial centroids.

        Raises:
            ValueError: If ``k`` is larger than the number of rows (raised by
                the sampling call because rows are drawn without replacement).
        """
        # A private generator seeded with 42 draws the same indices as seeding
        # the global NumPy RNG with 42 would, but leaves the caller's global
        # random state untouched.
        rng = np.random.RandomState(42)
        chosen = rng.choice(data.shape[0], self.k, replace=False)
        self.centroids = data[chosen]

    def euclidean_distance(self, x1, x2):
        """Return the Euclidean distance between two points."""
        return np.sqrt(np.sum(np.power(x1 - x2, 2)))

    def assign_clusters(self, data):
        """Return, for every row of ``data``, the index of the nearest centroid.

        Ties go to the centroid with the lowest index.
        """
        data = np.asarray(data)
        centroids = np.asarray(self.centroids)
        # Broadcast to (n_samples, k, n_features) so all sample/centroid
        # distances are computed in one vectorized expression.
        diffs = data[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        distances = np.sqrt(np.sum(diffs ** 2, axis=2))
        return np.argmin(distances, axis=1)

    def update_centroids(self, data):
        """Return new centroids: the mean of the samples in each cluster.

        A cluster that currently has no samples keeps its previous centroid;
        taking the mean of an empty selection would yield NaN and make the
        convergence check fail forever.
        """
        new_centroids = np.array(self.centroids, dtype=float)  # independent copy
        for cluster_idx in range(self.k):
            members = data[self.clusters == cluster_idx]
            if len(members) > 0:
                new_centroids[cluster_idx] = members.mean(axis=0)
        return new_centroids

    def fit(self, data, *, plot=True):
        """Cluster ``data`` and store the result in ``centroids``/``clusters``.

        Args:
            data: ``(n_samples, n_features)`` array-like of numeric samples.
            plot: When True (default), show a scatter plot after every
                assignment step and once more at the end. Pass False to run
                the clustering without any plotting.
        """
        data = np.asarray(data, dtype=float)
        self.initialize_centroids(data)

        for i in range(self.max_iters):
            self.clusters = self.assign_clusters(data)

            if plot:
                self.plot_clusters(data, i)

            new_centroids = self.update_centroids(data)
            # With tol == 0.0 this is an exact "centroids unchanged" check.
            converged = np.allclose(
                self.centroids, new_centroids, rtol=0.0, atol=self.tol
            )
            self.centroids = new_centroids
            if converged:
                break

        if plot:
            self.plot_final_clusters(data)

    def _plot(self, data, title):
        """Scatter the samples (colored by cluster) and the centroids.

        Only the first two feature columns are drawn.
        """
        plt.scatter(data[:, 0], data[:, 1], c=self.clusters, cmap='viridis', marker='o', alpha=0.6)
        plt.scatter(self.centroids[:, 0], self.centroids[:, 1], s=300, c='red', marker='x')
        plt.title(title)
        plt.show()

    def plot_clusters(self, data, iteration):
        """Show the current clustering; ``iteration`` is zero-based."""
        self._plot(data, f"Iteration {iteration + 1}")

    def plot_final_clusters(self, data):
        """Show the final clustering and centroids."""
        self._plot(data, "Final Clusters and Centroids")

In [None]:
# Cluster the toy dataset into 3 groups; fit() shows a scatter plot for each
# iteration and a final plot of the clusters and centroids.
kmeans = KMeans(k=3)
kmeans.fit(data)

In [None]:
# Question 8C
# Question 9D
# Question 10B
# Question 11C
# Question 12A
# Question 13B