In [None]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.express as px

In [None]:
# 1. Loading dataset
# Fetch the bundled wine dataset and unpack the pieces used by later cells.
wine = load_wine()
X, y = wine.data, wine.target
feature_names, target_names = wine.feature_names, wine.target_names

In [None]:
# 2. First look at the dataset
# Combine the feature matrix and the class labels into a single frame;
# the bare last expression renders the preview as a rich table.
df = pd.DataFrame(X, columns=feature_names).assign(target=y)
print("First 5 rows of the wine dataset:")
df.head()

In [None]:
# 3. Dataset statistics
# Summarize only the feature columns: `target` holds integer class ids, so its
# mean / std / quartiles carry no meaning and would clutter a feature overview.
# Transposed so that each feature is one row of the summary table.
print("Statistical overview of the features")
df[feature_names].describe().T

In [None]:
# 4. Standardizing the variables
# Learn the scaling parameters from the raw features, then apply them to the
# same data. The fitted `scaler` stays available for later reuse.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [None]:
# 5. Calculate principal components
# n_components=None is the PCA default, written out here to make explicit that
# no component count is requested at this stage.
pca_full = PCA(n_components=None)
X_pca_full = pca_full.fit_transform(X_scaled)

In [None]:
# 6. Get the variance of principal components

# Per-component explained-variance ratio and its running total
explained_variance = pca_full.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# Print one line per component, numbered from 1
print("Variance per principal components:")
for pc_number, (var_ratio, running_total) in enumerate(
    zip(explained_variance, cumulative_variance), start=1
):
    print(f"PC{pc_number}: {var_ratio:.4f} ({running_total:.4f} cumulated)")

In [None]:
# 7. Scree plot to visualize the explained variance per principal component
# Bars show each component's individual share; the red step line shows the
# running total across components.

# 1-based component numbers for the x axis. The cumulative array is a cumsum of
# the individual one, so both series have the same length and share this range.
component_numbers = range(1, len(explained_variance) + 1)

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(component_numbers, explained_variance, alpha=0.6, label='Individual variance')
ax.step(component_numbers, cumulative_variance, where='mid', label='Cumulated variance', color='red')

ax.set_title('Scree plot: Variance per principal component')
ax.set_xlabel('Principal component')
ax.set_ylabel('Variance')
ax.grid(True)
ax.legend()
plt.show()

In [None]:
# 8. Visualizing a dual component PCA

# Perform PCA with n_components=2
pca_2 = PCA(n_components=2)
X_pca_2 = pca_2.fit_transform(X_scaled)
df_pca = pd.DataFrame(data=X_pca_2, columns=['PC1', 'PC2'])
df_pca['target'] = y

# Create a 2D scatter plot of the projected samples, one color per wine class
fig, ax = plt.subplots(figsize=(10, 5))
colors = ['#d62728', '#2ca02c', '#1f77b4']
for i, target_name in enumerate(target_names):
    # Filter this class's rows once and reuse the selection for both axes
    class_points = df_pca[df_pca['target'] == i]
    ax.scatter(class_points['PC1'],
               class_points['PC2'],
               label=target_name,
               color=colors[i],
               alpha=0.75)
ax.set_xlabel('First principal component')
ax.set_ylabel('Second principal component')
ax.set_title('Wine dataset: Visualizing a 2D PCA')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# 9. Visualizing a triple component PCA

# Project the standardized features onto three principal components
pca_3 = PCA(n_components=3)
X_pca_3 = pca_3.fit_transform(X_scaled)

# Collect the projection together with the numeric class id and the readable
# class name; the name column drives the coloring and the legend below.
df_pca_3 = pd.DataFrame(data=X_pca_3, columns=['PC1', 'PC2', 'PC3']).assign(
    target=y,
    target_name=[target_names[i] for i in y],
)

# Interactive 3D scatter plot with one fixed color per wine class
class_colors = ['#d62728', '#2ca02c', '#1f77b4']
color_map = dict(zip(target_names, class_colors))

fig = px.scatter_3d(
    df_pca_3,
    x='PC1',
    y='PC2',
    z='PC3',
    color='target_name',
    color_discrete_map=color_map,
    labels={'target_name': 'Wine Class'},
    title='Wine dataset: Visualizing a 3D PCA',
    opacity=0.75,
    width=800,
    height=600,
)
fig.show()