# In [None]:
# Q1. What is the curse of dimensionality, and why is dimensionality reduction important in machine learning?
# No specific code, this is more about understanding.
# But here's an example of how high-dimensional data increases sparsity:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification

# Build a synthetic classification dataset: 100 samples, 50 features,
# of which 10 are informative, with one cluster per class.
X, y = make_classification(
    n_samples=100,
    n_features=50,
    n_informative=10,
    n_clusters_per_class=1,
    random_state=42,
)

# Scatter only the first two feature columns, colouring each point by its
# class label. The remaining 48 features are not visible in this view.
first_feature, second_feature = X[:, 0], X[:, 1]
plt.scatter(first_feature, second_feature, c=y)
plt.title('High-dimensional data visualized in 2D')
plt.show()

# Q2. How does the curse of dimensionality impact the performance of machine learning algorithms?
# Example of using KNN classifier on high-dimensional data and seeing the effect on accuracy

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification

# Rebuild the 50-feature synthetic dataset so this section is self-contained.
X, y = make_classification(
    n_samples=100,
    n_features=50,
    n_informative=10,
    n_clusters_per_class=1,
    random_state=42,
)

# Hold out 30% of the samples for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a 5-nearest-neighbours classifier; fit() returns the estimator itself,
# so construction and training can be chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Score the held-out predictions and report the result.
y_pred = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {knn_accuracy}')

# Q3. What are some of the consequences of the curse of dimensionality in machine learning, and how do they impact model performance?
# Overfitting example with high-dimensional data in KNN

# Synthetic dataset: 100 samples, 50 features, 10 of them informative.
dataset_params = dict(n_samples=100, n_features=50, n_informative=10, random_state=42)
X, y = make_classification(**dataset_params)

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# A single neighbour means each prediction copies the label of the closest
# training point, which is the configuration this section uses to
# illustrate overfitting.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

# Evaluate on the held-out samples and report the accuracy.
y_pred = knn.predict(X_test)
overfit_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy with Overfitting (n_neighbors=1): {overfit_accuracy}')

# Q4. Can you explain the concept of feature selection and how it can help with dimensionality reduction?

# Example using Recursive Feature Elimination (RFE) for feature selection

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris

# Iris features and labels as plain arrays.
X, y = load_iris(return_X_y=True)

# Logistic regression is the estimator whose coefficients RFE ranks;
# max_iter is raised to 200 for this fit.
model = LogisticRegression(max_iter=200)

# Recursively drop features until only two remain, then project X onto the
# surviving columns (fit followed by transform, done as two explicit steps).
selector = RFE(estimator=model, n_features_to_select=2)
selector.fit(X, y)
X_selected = selector.transform(X)

# support_ is a boolean mask over the original columns (True = kept).
selected_mask = selector.support_
print(f'Selected features: {selected_mask}')

# Q5. What are some limitations and drawbacks of using dimensionality reduction techniques in machine learning?

# Example: PCA might lose important features

from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load iris dataset
X, y = load_iris(return_X_y=True)

# Split BEFORE fitting PCA. Fitting the projection on the full dataset lets
# the held-out rows influence the principal components (data leakage), which
# makes the reported test accuracy optimistic. The same test_size and
# random_state as before are used, so the train/test membership is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn the 2-component projection from the training rows only, then apply
# that same projection to the test rows.
pca = PCA(n_components=2)
X_train = pca.fit_transform(X_train_raw)
X_test = pca.transform(X_test_raw)

# Projection of the full dataset, kept so the name X_pca remains defined for
# any later code; it is NOT used for model evaluation below.
X_pca = pca.transform(X)

# Train a KNN model on the reduced training data
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Accuracy after dimensionality reduction
y_pred = knn.predict(X_test)
print(f'Accuracy with PCA: {accuracy_score(y_test, y_pred)}')

# Q6. How does the curse of dimensionality relate to overfitting and underfitting in machine learning?

# Example: High-dimensional data might lead to overfitting and underfitting with KNN

from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Synthetic dataset with 100 samples and 50 features.
X, y = make_classification(n_samples=100, n_features=50, random_state=42)

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Low-k model (k=1): each prediction follows the single closest training
# point. This is the "overfitting" configuration of the comparison.
knn_overfit = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
y_pred_overfit = knn_overfit.predict(X_test)
overfit_accuracy = accuracy_score(y_test, y_pred_overfit)
print(f'Accuracy (Overfitting): {overfit_accuracy}')

# High-k model (k=50): each prediction is a vote over 50 training points.
# This is the "underfitting" configuration of the comparison.
knn_underfit = KNeighborsClassifier(n_neighbors=50).fit(X_train, y_train)
y_pred_underfit = knn_underfit.predict(X_test)
underfit_accuracy = accuracy_score(y_test, y_pred_underfit)
print(f'Accuracy (Underfitting): {underfit_accuracy}')

# Q7. How can one determine the optimal number of dimensions to reduce data to when using dimensionality reduction techniques?

# Using explained variance to determine the optimal number of dimensions with PCA

from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt

# Iris features and labels as plain arrays (labels are unused by PCA).
X, y = load_iris(return_X_y=True)

# Fit PCA with no component limit so every component's variance share
# is available.
pca = PCA()
pca.fit(X)

# Running total of the per-component variance ratios, paired with the
# 1-based component count for the x-axis.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = range(1, len(cumulative_variance) + 1)

# Plot cumulative explained variance against the number of components kept.
plt.plot(component_counts, cumulative_variance)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance vs Number of Components')
plt.show()
