# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load the dataset
# The file is read with header=None, so the column names are supplied here:
# the first column is named 'Target' and the remaining ones take the feature
# names listed below, in order.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
columns = [
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

data = pd.read_csv(url, names=['Target', *columns], header=None)
# Quick sanity check of the first rows.
print(data.head())

# Step 2: Split the dataset into features and target variables
# y is the 'Target' column; X is every other column of the frame.
y = data['Target']
X = data.drop(columns='Target')

# Step 3: Data preprocessing
# Fit the scaler on the feature matrix, then apply it to that same matrix.
# The fitted scaler is kept in `scaler` for later reuse.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Step 4: Implement PCA
# First pass: fit without limiting n_components so the explained-variance
# ratio of every component can be inspected.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Determine the optimal number of principal components to retain
# Running total of the explained-variance ratios, one entry per component.
explained_variance = pca.explained_variance_ratio_.cumsum()
# argmax on a boolean array gives the index of the first True, i.e. the first
# component at which the running total reaches 0.95; +1 turns that 0-based
# index into a component count.
optimal_components = (explained_variance >= 0.95).argmax() + 1

print(f"Optimal number of components to retain: {optimal_components}")

# Plot the explained variance ratio
# Cumulative explained variance against the number of components (1-based),
# with a red dashed vertical line at the selected component count.
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(
    np.arange(1, explained_variance.size + 1),
    explained_variance,
    marker='o',
    linestyle='--',
)
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('Explained Variance by Principal Components')
ax.axvline(x=optimal_components, color='r', linestyle='--')
plt.show()

# Retain optimal number of principal components
# Second pass: `pca` is rebound to a model limited to the selected number of
# components, and the scaled features are projected with it.
pca = PCA(n_components=optimal_components)
X_pca_optimal = pca.fit_transform(X_scaled)

# Step 5: Visualize PCA results
# Scatter of the first two projected columns, coloured by the 'Target' value.
# NOTE: indexing column 1 requires at least two retained components.
fig, ax = plt.subplots(figsize=(10, 7))
target_scatter = ax.scatter(
    X_pca_optimal[:, 0],
    X_pca_optimal[:, 1],
    c=y,
    cmap='viridis',
    edgecolor='k',
    s=100,
)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA Result: Wine Dataset')
fig.colorbar(target_scatter, ax=ax, label='Target')
plt.show()

# Step 6: Perform clustering on the PCA-transformed data using K-Means
# n_init is passed explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 and emits a FutureWarning in 1.2-1.3 when it is
# omitted. Pinning it keeps the number of restarts, and therefore the
# resulting clusters and inertia, the same across library versions.
# random_state fixes the centroid initialisation for reproducibility.
# NOTE(review): n_clusters=3 presumably mirrors the number of distinct
# 'Target' values in the dataset -- confirm.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_pca_optimal)

# Visualize the clusters
# Same two projected columns as the PCA scatter, but coloured by the cluster
# index assigned by K-Means instead of the 'Target' value.
fig, ax = plt.subplots(figsize=(10, 7))
cluster_scatter = ax.scatter(
    X_pca_optimal[:, 0],
    X_pca_optimal[:, 1],
    c=clusters,
    cmap='viridis',
    edgecolor='k',
    s=100,
)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('K-Means Clustering on PCA-Transformed Data')
fig.colorbar(cluster_scatter, ax=ax, label='Cluster')
plt.show()

# Step 7: Interpretation
# Per-component explained-variance ratios of the PCA model currently bound to
# `pca` (one entry per retained component).
print(
    "Explained Variance Ratio by Principal Components:",
    pca.explained_variance_ratio_,
    sep="\n",
)

# Summary of the fitted K-Means model: its inertia and its cluster centres,
# the latter expressed in the PCA-projected feature space it was fitted on.
print(
    "\nK-Means Clustering Performance:",
    f"Inertia: {kmeans.inertia_}",
    f"Cluster Centers: {kmeans.cluster_centers_}",
    sep="\n",
)
