In [None]:
# Step 1: Data Preparation and Exploration
import pandas as pd
import numpy as np
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Load dataset
# Features go into a DataFrame (keeping the dataset's feature names as column
# labels); the class labels go into a separate Series named 'target'.
wine = load_wine()
X = pd.DataFrame(wine.data, columns=wine.feature_names)
y = pd.Series(wine.target, name='target')

# Explore dataset
print("First 5 rows of the dataset:")
print(X.head(), "\n")

print("Dataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the report.
X.info()
print()

print("Dataset description:")
print(X.describe(), "\n")

# Standardize features
# The scaled array (not the raw DataFrame) is what the later steps consume.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("Features standardized.\n")


# Step 2: Hierarchical Clustering
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.decomposition import PCA

# PCA for visualization
# The 2-component projection only supplies plotting coordinates; the
# clustering itself is fitted on the full standardized feature matrix.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)


def _show_cluster_scatter(cluster_labels, title):
    """Draw the 2-D PCA projection coloured by *cluster_labels* and show it."""
    plt.figure(figsize=(6, 4))
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='rainbow')
    plt.title(title)
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.show()


# Agglomerative Clustering
# One fit and one figure per candidate cluster count.
n_clusters_list = [2, 3, 4]
for n_clusters in n_clusters_list:
    hier_clust = AgglomerativeClustering(n_clusters=n_clusters)
    labels_hier = hier_clust.fit_predict(X_scaled)
    _show_cluster_scatter(
        labels_hier,
        f'Hierarchical Clustering (n_clusters={n_clusters})',
    )

# Dendrogram
# Ward linkage over the standardized features; truncate_mode='level' with
# p=5 limits how many levels of the merge tree are drawn.
linked = linkage(X_scaled, 'ward')
plt.figure(figsize=(12, 6))
dendrogram(linked, p=5, truncate_mode='level')
# Title and axis labels are applied in one call on the current axes.
plt.gca().set(
    title='Dendrogram - Hierarchical Clustering',
    xlabel='Sample Index',
    ylabel='Distance',
)
plt.show()


# Step 3: DBSCAN Clustering
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, homogeneity_score, completeness_score

# DBSCAN parameter tuning
# Every (eps, min_samples) combination is fitted on the standardized
# features, plotted on the PCA projection and scored.
eps_values = [0.8, 1.5, 2.0]
min_samples_values = [3, 5, 7]

for eps in eps_values:
    for min_samples in min_samples_values:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels_db = dbscan.fit_predict(X_scaled)

        # Count clusters and noise (label -1 marks noise). The mask is built
        # once and reused for both counts and for the silhouette subset, so
        # the label array is never iterated element by element in Python.
        mask = labels_db != -1
        n_clusters = len(np.unique(labels_db[mask]))
        n_noise = int(np.count_nonzero(~mask))
        print(f"DBSCAN eps={eps}, min_samples={min_samples}:")
        print(f"Clusters: {n_clusters}, Noise points: {n_noise}")

        # Visualization
        plt.figure(figsize=(6, 4))
        plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels_db, cmap='rainbow')
        plt.title(f'DBSCAN Clustering (eps={eps}, min_samples={min_samples})')
        plt.xlabel('PCA Component 1')
        plt.ylabel('PCA Component 2')
        plt.show()

        # Evaluation Metrics (ignore noise for silhouette score)
        # Two or more non-noise clusters already imply two or more non-noise
        # samples, so the cluster count alone decides whether to score.
        if n_clusters > 1:
            sil_score = silhouette_score(X_scaled[mask], labels_db[mask])
            print(f"Silhouette Score: {sil_score:.3f}")
        else:
            print("Silhouette Score: Not defined (too few clusters)")

        # These two scores compare against the true classes using every
        # label, so noise points (-1) are included as their own group.
        hom_score = homogeneity_score(y, labels_db)
        comp_score = completeness_score(y, labels_db)
        print(f"Homogeneity Score: {hom_score:.3f}")
        print(f"Completeness Score: {comp_score:.3f}\n")


# Step 4: Analysis and Insights
# Single print call; sep="\n" puts each argument on its own line, so the
# output matches printing the lines one by one.
print(
    "### Analysis and Insights ###",
    "- Hierarchical Clustering allows clear visualization of nested clusters via dendrograms.",
    "- DBSCAN detects arbitrary-shaped clusters and identifies noise points.",
    "- DBSCAN results are highly sensitive to eps and min_samples parameters.",
    "- PCA was used for visualization to reduce the dimensionality to 2D for plotting purposes.",
    sep="\n",
)