# Exercise 1 - Week 5
## PCA on Wine Recognition Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the Wine Recognition dataset shipped with scikit-learn and wrap it in
# pandas objects so the features keep their column names.
wine = load_wine()
X = pd.DataFrame(wine.data, columns=wine.feature_names)
y = pd.Series(wine.target, name='wine_class')

# Basic info
print("Shape:", X.shape)
print("Feature names:", wine.feature_names)
# The "\n" escape keeps the string literal on a single source line (a raw
# line break inside the quotes is a SyntaxError) while still starting the
# per-class counts on a new output line.
print("Class distribution:\n", y.value_counts())


In [None]:
# Standardize every feature (zero mean, unit variance) before PCA: PCA is
# driven by variance, so features on larger scales would otherwise dominate.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)


In [None]:
# Keep only the two leading principal components of the standardized data.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Component loadings (one row per component), then each component's share
# of the total variance.
print("Principal Components:", pca.components_, sep="\n")
print("Explained Variance Ratio:", pca.explained_variance_ratio_)


In [None]:
# 2D scatter plot: samples in the PC1/PC2 plane, colored by wine class
fig_2d, ax_2d = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=y,
    palette='Set1',
    ax=ax_2d,
)
ax_2d.set_title('PCA - Wine Recognition (2D)')
ax_2d.set_xlabel('PC1')
ax_2d.set_ylabel('PC2')
plt.show()


In [None]:
# Fit PCA with every component retained to see how variance accumulates
pca_full = PCA().fit(X_scaled)
explained_var = pca_full.explained_variance_ratio_
cum_var = explained_var.cumsum()

# Share of total variance we want the retained components to reach
variance_target = 0.95

# Cumulative curve with a dashed reference line at the target
plt.figure(figsize=(8, 5))
plt.plot(range(1, len(cum_var) + 1), cum_var, marker='o')
plt.axhline(y=variance_target, color='r', linestyle='--')
plt.title('Cumulative Explained Variance')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Variance')
plt.grid(True)
plt.show()

# argmax on the boolean mask gives the index of the first component count
# that reaches the target; +1 turns the 0-based index into a count.
n_components_needed = np.argmax(cum_var >= variance_target) + 1
print("Number of components to retain 95% variance:",
      n_components_needed)


In [None]:
from mpl_toolkits.mplot3d import Axes3D

# Re-fit PCA keeping three components for a 3D view
pca_3d = PCA(n_components=3)
X_pca_3d = pca_3d.fit_transform(X_scaled)

# 3D scatter plot: one point per sample, colored by its class label
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(
    X_pca_3d[:, 0],
    X_pca_3d[:, 1],
    X_pca_3d[:, 2],
    c=y,
    cmap='Set1',
)
ax.set(
    title='PCA - Wine Recognition (3D)',
    xlabel='PC1',
    ylabel='PC2',
    zlabel='PC3',
)

# Build the legend from the scatter's own color mapping, one entry per class
handles, labels = scatter.legend_elements()
ax.legend(handles, labels, title="Class")
plt.show()


## Inference

- The PCA reduced the 13-dimensional dataset to 2 and 3 dimensions effectively.
- The first two components together explain roughly 55% of the total variance (see the explained variance ratio printed above).
- To retain at least 95% of the variance, 10 of the 13 components are needed (as printed above).
- In both the 2D and 3D scatter plots, the three wine classes form clearly separated clusters. Because PCA never sees the class labels, this indicates that the directions of greatest variance also preserve class separability after dimensionality reduction.
