# In [None]:
# Q1. What is a projection and how is it used in PCA?
# In PCA, a projection is the process of mapping the original data points onto a lower-dimensional space.
# The data is projected onto the principal components (the eigenvectors of the covariance matrix),
# where the first component captures the most variance, and the last component captures the least variance.

from sklearn.decomposition import PCA
import numpy as np
from sklearn.datasets import load_iris

# Fetch the Iris data: X holds the feature matrix, y the class labels
# (y is used further down to colour the scatter plot).
iris = load_iris()
X, y = iris.data, iris.target

# Fit a two-component PCA on X and project the samples onto it in one call.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Report the dimensionality before and after the projection.
for label, data in (
    ("Original shape of data:", X),
    ("Shape after PCA projection:", X_pca),
):
    print(label, data.shape)

# Q2. How does the optimization problem in PCA work, and what is it trying to achieve?
# PCA seeks to solve the optimization problem where it tries to maximize the variance (spread of data)
# in the projection space by finding the principal components (the eigenvectors of the covariance matrix).
# The first principal component maximizes the variance, and each subsequent component is orthogonal and captures the next highest variance.

from sklearn.decomposition import PCA

# Applying PCA to reduce to 2 components.
# This repeats the Q1 fit with the same settings and rebinds pca / X_pca.
# fit_transform learns the components from X and returns X projected onto
# them; scikit-learn orders the components by decreasing explained variance.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Q3. What is the relationship between covariance matrices and PCA?
# In PCA, the covariance matrix represents how the features vary with respect to each other.
# The eigenvectors of the covariance matrix give the directions of maximum variance, and the eigenvalues give the magnitude of the variance in those directions.

# Covariance between features: np.cov treats rows as variables by default,
# so pass the transpose to make each column of X a variable.
cov_matrix = np.cov(X.T)
print("Covariance Matrix:\n", cov_matrix)

# Q4. How does the choice of number of principal components impact the performance of PCA?
# The number of components determines how much variance is retained in the reduced dimensionality.
# If too few components are chosen, important information may be lost. If too many are chosen,
# the data is barely reduced and low-variance (often noisy) directions are kept, which can contribute to overfitting in downstream models.

# Fit PCA without limiting n_components so the variance share of every
# component can be inspected. This rebinds pca; X_pca from the earlier
# two-component fit is left untouched.
pca = PCA().fit(X)

print("Explained Variance Ratio per component:", pca.explained_variance_ratio_)

# Q5. How can PCA be used in feature selection, and what are the benefits of using it for this purpose?
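# PCA can be used for feature selection by keeping only the top components that explain most of the variance in the data.
# Strictly speaking this is feature extraction: each component is a linear combination of all original features, not a subset of them.
# By reducing the number of features, we reduce dimensionality, computational cost, and potentially overfitting.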

# Q6. What are some common applications of PCA in data science and machine learning?
# PCA is used in dimensionality reduction, noise reduction, data visualization, and in improving the performance of machine learning models by reducing overfitting.

# Example of using PCA for data visualization
import matplotlib.pyplot as plt

# Plot the samples in the plane of the two principal components,
# one point per sample, coloured by its class label y.
pc1, pc2 = X_pca.T
plt.scatter(pc1, pc2, c=y)
plt.gca().set(
    title='PCA: Iris dataset projected to 2D',
    xlabel='Principal Component 1',
    ylabel='Principal Component 2',
)
plt.show()

# Q7. What is the relationship between spread and variance in PCA?
# In PCA, variance is a measure of the spread of the data. The more spread the data has along a particular direction,
# the higher the variance in that direction; the directions with the greatest variance are chosen as the principal components.

# Q8. How does PCA use the spread and variance of the data to identify principal components?
# PCA identifies principal components by finding the directions of maximum spread (variance) in the data.
# The first component captures the direction of the largest variance, the second captures the next largest, and so on.

# Q9. How does PCA handle data with high variance in some dimensions but low variance in others?
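# PCA handles this by giving more importance to dimensions with high variance (through the corresponding eigenvalues),
# while dimensions with low variance contribute less to the principal components.
# Because of this, features measured on larger scales can dominate the components; standardizing the features first is common when their units differ.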

# Visualize the cumulative explained variance ratio to observe how the
# variance is distributed across the principal components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Use explicit 1-based x values: plt.plot(y) on its own would draw the points
# at x = 0..N-1, placing the value for ONE component at x=0 and contradicting
# the "Number of Components" axis label.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.plot(component_counts, cumulative_variance)
plt.xticks(component_counts)  # component counts are integers; avoid fractional ticks
plt.title('Cumulative Explained Variance by Principal Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.show()
