# In [None]:
# 1. What is the KNN algorithm?
# KNN (K-Nearest Neighbors) is a simple, non-parametric, and lazy machine learning algorithm
# used for classification and regression tasks. It works by finding the 'k' closest data points
# to the test point and then classifying or averaging the target value based on those neighbors.

# Example:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Fetch the iris dataset and pull out the feature matrix and label vector.
data = load_iris()
X, y = data.data, data.target

# Reserve 30% of the samples for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Build a 3-neighbor classifier and fit it in one step (fit() returns the estimator itself).
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Evaluate on the held-out split; score() reports mean accuracy for classifiers.
accuracy = knn.score(X_test, y_test)
print(f"Accuracy of KNN classifier: {accuracy:.2f}")

# 2. How do you choose the value of K in KNN?
# The value of 'K' determines the number of nearest neighbors to consider for prediction.
# A smaller K can be sensitive to noise, while a larger K can smooth the predictions.
# Typically, you can use cross-validation to find the best value of K.
# Here's how to select the optimal K using GridSearchCV:

from sklearn.model_selection import GridSearchCV

# Candidate neighbor counts: every K from 1 through 20.
param_grid = dict(n_neighbors=range(1, 21))

# Exhaustive search over K, scored with 5-fold cross-validation on the training split only.
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train, y_train)

# best_params_ holds the parameter combination with the best mean CV score.
print(f"Best K value: {grid_search.best_params_['n_neighbors']}")

# 3. What is the difference between KNN classifier and KNN regressor?
# The KNN classifier is used for classification tasks, where the output is a category or class label.
# The KNN regressor is used for regression tasks, where the output is a continuous value.
# In the classifier, the majority class of the k neighbors is returned, while in the regressor,
# the mean or median of the neighbors' target values is returned.

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# KNN regressor example: each prediction is derived from the targets of the 3 nearest
# training points.
# NOTE(review): y_train here is the iris class-label vector from the split above, so this
# only illustrates the API; a genuinely continuous target would make a more meaningful demo.
knn_regressor = KNeighborsRegressor(n_neighbors=3).fit(X_train, y_train)
y_pred = knn_regressor.predict(X_test)

# Score the regressor on the held-out split with mean squared error.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error of KNN Regressor: {mse:.2f}")

# 4. How do you measure the performance of KNN?
# For classification tasks, performance is typically measured using accuracy, precision, recall,
# F1-score, or the ROC curve. For regression tasks, performance is measured using metrics like
# Mean Squared Error (MSE), Mean Absolute Error (MAE), or R-squared.

# Classification performance (accuracy) is shown above; for the regressor, R-squared is computed below:
from sklearn.metrics import r2_score
# Coefficient of determination for the regressor's held-out predictions.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R-squared for KNN Regressor: {r2:.2f}")

# 5. What is the curse of dimensionality in KNN?
# The curse of dimensionality refers to the phenomenon where, as the number of features increases,
# the data become sparse and the distances between points grow and become nearly equal, so KNN becomes
# less effective. In high-dimensional spaces, the concept of "nearness" becomes less meaningful, and the
# model may suffer from poor performance.

# 6. How do you handle missing values in KNN?
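# Standard KNN distance computations need complete feature vectors, so missing values should be filled in first.
# They can be imputed with a column's mean, median, or mode (as below), or from the values of the nearest neighbors.
# However, imputation may introduce some bias. It's recommended to impute missing values before applying KNN.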
from sklearn.impute import SimpleImputer

# Learn the per-column means from the training split only, then apply the same
# fill values to both splits so no test-set statistics are used.
imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

# 7. Compare and contrast the performance of the KNN classifier and regressor.
# KNN classifier predicts categorical class labels, and its performance improves when the decision boundary
# is relatively smooth. KNN regressor is more suitable when you need to predict continuous values. Both may suffer
# from the curse of dimensionality and require feature scaling for optimal performance.

# 8. What are the strengths and weaknesses of the KNN algorithm for classification and regression tasks?
# Strengths:
# - Simple and intuitive.
# - Effective for small datasets with low dimensionality.
# - No need for training phase (lazy learning).
# Weaknesses:
# - High computational cost for large datasets.
# - Sensitive to irrelevant or redundant features (curse of dimensionality).
# - Performance is significantly impacted by the choice of K and distance metric.

# 9. What is the difference between Euclidean distance and Manhattan distance in KNN?
# Euclidean distance is the straight-line distance between two points, calculated as the square root of
# the sum of squared differences between the coordinates.
# Manhattan distance is the sum of the absolute differences between the coordinates.

# Example:
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances

# Two single-row 2-D samples; the pairwise helpers expect 2-D (n_samples, n_features) input.
point1, point2 = [[1, 2]], [[3, 4]]

# Each call returns a 1x1 distance matrix for this pair of points.
euclidean_dist = euclidean_distances(X=point1, Y=point2)
manhattan_dist = manhattan_distances(X=point1, Y=point2)

# Pull the single entry out of each matrix for display.
print(f"Euclidean Distance: {euclidean_dist[0][0]:.2f}")
print(f"Manhattan Distance: {manhattan_dist[0][0]:.2f}")

# 10. What is the role of feature scaling in KNN?
# Feature scaling is crucial in KNN because the algorithm relies on distance calculations.
# Features with larger ranges can dominate the distance computation, leading to biased results.
# Common scaling methods include Min-Max scaling or Standardization (Z-score normalization).

from sklearn.preprocessing import StandardScaler

# Estimate the scaling statistics from the training split only, then standardize
# both splits with those same statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Same 3-neighbor classifier as before, now trained and evaluated on standardized features.
knn_scaled = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, y_train)
scaled_accuracy = knn_scaled.score(X_test_scaled, y_test)
print(f"Accuracy of KNN with scaled features: {scaled_accuracy:.2f}")
