# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn import datasets

# Load the Wine dataset from the UCI Machine Learning Repository.
# NOTE: wine.data is published under ".../machine-learning-databases/wine/";
# the numeric folder "00336" used previously is not the Wine dataset's folder,
# so the download could not succeed.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
columns = ["Class", "Alcohol", "Malic Acid", "Ash", "Alcalinity of Ash", "Magnesium",
           "Total Phenols", "Flavanoids", "Nonflavanoid Phenols", "Proanthocyanins",
           "Color Intensity", "Hue", "OD280/OD315 of Diluted Wines", "Proline"]

# Read the dataset. header=None because column names are supplied explicitly
# through `names` instead of being read from the first row of the file.
wine_data = pd.read_csv(url, header=None, names=columns)

# Show the first few rows of the dataset. A bare `wine_data.head()` is only
# rendered when it is the last expression of a notebook cell; printing makes
# the preview visible both mid-cell and when run as a plain script.
print(wine_data.head())

# Separate features (every column except "Class") and the target variable.
X = wine_data.drop(columns="Class")
y = wine_data["Class"]

# Step 1: Data Preprocessing - bring every feature to zero mean and unit
# variance so that no single feature dominates the principal components.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Step 2: Apply PCA - no n_components is given here, so the fitted model
# exposes the explained variance of every component for inspection below.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Step 3: Fraction of the total variance captured by each principal component
explained_variance_ratio = pca.explained_variance_ratio_

# Plot the per-component explained variance (components numbered from 1) to
# help choose the optimal number of components.
component_numbers = np.arange(1, explained_variance_ratio.size + 1)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(component_numbers, explained_variance_ratio, marker='o', linestyle='--')
ax.set(
    title='Explained Variance Ratio by Principal Component',
    xlabel='Principal Component',
    ylabel='Explained Variance Ratio',
)
plt.show()

# Step 4: Determine the optimal number of components, i.e. the smallest count
# whose cumulative explained variance reaches 95%.
cumulative_variance = explained_variance_ratio.cumsum()
reaches_target = cumulative_variance >= 0.95
# argmax on a boolean array gives the index of the first True; +1 turns that
# zero-based index into a component count.
optimal_components = reaches_target.argmax() + 1
print(f"Optimal number of components: {optimal_components}")

# Step 5: Refit PCA restricted to that many components and project the
# standardized data onto them.
pca = PCA(n_components=optimal_components)
X_pca_optimal = pca.fit_transform(X_scaled)

# Step 6: Visualize the PCA-transformed data - first two principal components,
# each point colored by its "Class" label.
first_pc = X_pca_optimal[:, 0]
second_pc = X_pca_optimal[:, 1]

fig, ax = plt.subplots(figsize=(8, 6))
class_points = ax.scatter(first_pc, second_pc, c=y, cmap='viridis', edgecolor='k', s=50)
ax.set_title('PCA of Wine Dataset')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
fig.colorbar(class_points, ax=ax, label='Class')
plt.show()

# Step 7: Perform K-Means clustering on the PCA-transformed data.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (and emits a FutureWarning in 1.2/1.3 when it is left
# unset), so relying on the default gives version-dependent clusterings.
# Ten restarts with a fixed random_state keeps the result reproducible.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
y_kmeans = kmeans.fit_predict(X_pca_optimal)

# Step 8: Visualize the clusters - same two-component projection as the class
# plot, but colored by the K-Means cluster assignment.
fig, ax = plt.subplots(figsize=(8, 6))
cluster_points = ax.scatter(
    X_pca_optimal[:, 0],
    X_pca_optimal[:, 1],
    c=y_kmeans,
    cmap='viridis',
    edgecolor='k',
    s=50,
)
ax.set(
    title='K-Means Clustering on PCA-transformed Wine Data',
    xlabel='Principal Component 1',
    ylabel='Principal Component 2',
)
fig.colorbar(cluster_points, ax=ax, label='Cluster')
plt.show()

# Step 9: Performance Metrics - score the K-Means labels against the
# PCA-reduced points they were fitted on, using the silhouette coefficient.
sil_score = silhouette_score(X=X_pca_optimal, labels=y_kmeans)
print(f"Silhouette Score for K-Means clustering: {sil_score:.4f}")
