# In [None]:
# Q1. What is the main difference between the Euclidean distance metric and the Manhattan distance metric in KNN?
# - Euclidean distance is the straight-line distance between two points and is computed using the Pythagorean theorem.
# - Manhattan distance (also known as L1 distance) is the sum of the absolute differences between the coordinates.
# The main difference is that Euclidean distance accounts for the diagonal distance in multi-dimensional space,
# while Manhattan distance only considers horizontal and vertical movement.
#
# How might this difference affect the performance of a KNN classifier or regressor?
# - Euclidean distance may work better when the relationship between features is continuous and not constrained to
# grid-like structures.
# - Manhattan distance may work better in high-dimensional or grid-like spaces, especially when the features are
# independent and when there are many outliers.

from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances

# Worked example: the distance between two 2-D points under both metrics.
# Each point is wrapped in an outer list, i.e. one row per point.
point1 = [[1, 2]]
point2 = [[3, 4]]

# Both helpers return a pairwise distance matrix; with a single row on each
# side, the one value of interest sits at position [0][0].
manhattan_dist = manhattan_distances(X=point1, Y=point2)
euclidean_dist = euclidean_distances(X=point1, Y=point2)

print(f"Euclidean Distance: {euclidean_dist[0][0]:.2f}")
print(f"Manhattan Distance: {manhattan_dist[0][0]:.2f}")


# Q2. How do you choose the optimal value of k for a KNN classifier or regressor?
# The optimal value of k can be chosen by:
# 1. Cross-validation: Using techniques like k-fold cross-validation to test different values of k and
# selecting the one that minimizes the validation error.
# 2. Elbow method: Plotting the error rate for different values of k and selecting the value where the
# error rate stabilizes.

import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score
# KNeighborsClassifier was used below without ever being imported (NameError).
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): X_train and y_train are not defined anywhere in this section;
# presumably they come from an earlier train/test split -- confirm before running.

# Perform cross-validation for different k values to choose the optimal k
k_values = range(1, 21)
cv_scores = []

# Score each candidate k with 5-fold cross-validation on the training data
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')
    # Keep the mean accuracy across the folds as the single score for this k
    cv_scores.append(np.mean(scores))

# Plot mean cross-validation accuracy against k
plt.plot(k_values, cv_scores)
plt.xlabel('k Value')
plt.ylabel('Cross-Validation Accuracy')
plt.title('Optimal K Value Selection')
plt.show()

# Q3. How does the choice of distance metric affect the performance of a KNN classifier or regressor?
# - The choice of distance metric determines how "closeness" between instances is measured. Different metrics
# can perform better or worse depending on the nature of the data:
#   - Euclidean distance works well when features are continuous and similar in scale.
#   - Manhattan distance is often preferred when features are discrete or the data has a grid-like structure.
#   - Other metrics (e.g., Minkowski, Cosine similarity) can be used in different problem settings.
# You might choose Euclidean distance for problems with naturally continuous data (like image recognition),
# while Manhattan distance might be better for problems with independent features or categorical data, and it
# often holds up better in high-dimensional spaces.

# Q4. What are some common hyperparameters in KNN classifiers and regressors, and how do they affect
# the performance of the model? How might you go about tuning these hyperparameters to improve model performance?
# Common hyperparameters:
# 1. k (number of neighbors): Determines how many neighbors to consider for classification or regression.
#    A small k can lead to overfitting, and a large k can lead to underfitting.
# 2. Weights: Determines how much influence each neighbor has (uniform or distance-based).
# 3. Distance metric: Affects how the distances between points are computed (Euclidean, Manhattan, etc.).
# 4. Algorithm: Can be 'auto', 'ball_tree', 'kd_tree', or 'brute' for nearest neighbor search.
# Tuning these hyperparameters can be done using cross-validation (GridSearchCV or RandomizedSearchCV).

from sklearn.model_selection import GridSearchCV
# KNeighborsClassifier was used below without ever being imported (NameError).
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): X_train and y_train are not defined anywhere in this section;
# presumably they come from an earlier train/test split -- confirm before running.

# Candidate values for the hyperparameters discussed above; the grid search
# evaluates every combination (4 x 2 x 2 = 16 settings).
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Exhaustive search over the grid, scoring each setting with 5-fold cross-validation
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

print(f"Best hyperparameters: {grid_search.best_params_}")


# Q5. How does the size of the training set affect the performance of a KNN classifier or regressor?
# The performance of KNN generally improves with a larger training set because more neighbors are available for
# prediction, leading to better generalization. However, because KNN stores the training data and searches it at
# prediction time, a larger training set also increases memory use and prediction time.
#
# Techniques to optimize the size of the training set:
# 1. Sampling: Use techniques like stratified sampling or random sampling to ensure a diverse set of data.
# 2. Dimensionality reduction: Using methods like PCA to reduce the number of features and thus speed up the
# distance computations.
# 3. Efficient nearest neighbor search: For large datasets, using KD-trees, Ball-trees, or Approximate Nearest
# Neighbor (ANN) search can speed up the KNN prediction process.

# Q6. What are some potential drawbacks of using KNN as a classifier or regressor?
# How might you overcome these drawbacks to improve the performance of the model?
# Drawbacks of KNN:
# 1. Computational cost: As KNN is a lazy learner, the training phase is fast, but making predictions can be slow,
# especially for large datasets.
# 2. Sensitivity to irrelevant features: KNN can be sensitive to noise and irrelevant features.
# 3. Curse of dimensionality: Performance degrades in high-dimensional spaces.
#
# Solutions:
# 1. Use dimensionality reduction techniques like PCA (Principal Component Analysis).
# 2. Use feature selection to identify and retain the most important features.
# 3. Use faster nearest-neighbor search techniques like KD-trees or Ball-trees.
# 4. Optimize the choice of k and distance metrics through hyperparameter tuning.

from sklearn.decomposition import PCA
# KNeighborsClassifier was used below without ever being imported (NameError).
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): X_train, X_test, y_train and y_test are not defined anywhere in
# this section; presumably they come from an earlier train/test split -- confirm.

# Apply PCA for dimensionality reduction before using KNN
pca = PCA(n_components=2)  # Reducing to 2 components
# Fit the projection on the training data only, then reuse that same fitted
# projection for the test data.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Train and evaluate KNN on the reduced feature space
knn_pca = KNeighborsClassifier(n_neighbors=5)
knn_pca.fit(X_train_pca, y_train)
accuracy_pca = knn_pca.score(X_test_pca, y_test)

print(f"Accuracy of KNN with PCA: {accuracy_pca:.2f}")
