In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Function to load and preprocess the data
def preprocess_data(file_path, drop_columns=('type', 'quality')):
    """Load a CSV file and return standardized numeric features for clustering.

    Args:
        file_path: Path of the CSV file; passed straight to ``pandas.read_csv``.
        drop_columns: Names of columns to leave out of the feature matrix.
            Names that do not occur in the file are skipped instead of
            raising ``KeyError``.

    Returns:
        A tuple ``(X_scaled, df, feature_names)``:
        ``X_scaled`` is the array produced by ``StandardScaler.fit_transform``,
        ``df`` is the unmodified DataFrame read from the file, and
        ``feature_names`` is the column index of the features that were scaled.

    Raises:
        ValueError: If no usable numeric feature column is left.
    """
    df = pd.read_csv(file_path)

    # Keep only columns the scaler can digest: drop the excluded columns,
    # then anything that is neither numeric nor boolean.
    X = df.drop(columns=list(drop_columns), errors='ignore')
    X = X.select_dtypes(include=['number', 'bool'])

    # A column without a single observed value cannot be imputed or scaled.
    X = X.dropna(axis=1, how='all')
    if X.shape[1] == 0:
        raise ValueError(
            f"No numeric feature columns found in {file_path!r} "
            f"after dropping {list(drop_columns)}"
        )

    # KMeans rejects NaN input. Fill gaps with the column mean (which maps to
    # 0 after standardization) rather than dropping rows, so that the scaled
    # matrix stays row-aligned with the returned DataFrame.
    X = X.fillna(X.mean())

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    return X_scaled, df, X.columns

# Function to perform elbow method
def _locate_elbow(ks, inertias):
    """Return the k at the sharpest bend of the (k, inertia) curve."""
    ks = np.asarray(ks, dtype=float)
    inertias = np.asarray(inertias, dtype=float)

    def _unit_scale(values):
        span = values.max() - values.min()
        if span == 0:
            return np.zeros_like(values)
        return (values - values.min()) / span

    # Put both axes on [0, 1]; otherwise the inertia scale dwarfs the k axis
    # and every interior angle collapses towards pi.
    points = np.column_stack((_unit_scale(ks), _unit_scale(inertias)))

    # Vectors from each interior point to its left and right neighbours.
    to_prev = points[:-2] - points[1:-1]
    to_next = points[2:] - points[1:-1]

    norms = np.linalg.norm(to_prev, axis=1) * np.linalg.norm(to_next, axis=1)
    cos_angles = (to_prev * to_next).sum(axis=1) / norms
    # Rounding can push the cosine marginally outside [-1, 1], which would
    # make arccos return NaN.
    angles = np.arccos(np.clip(cos_angles, -1.0, 1.0))

    # A straight stretch has an angle of pi; the elbow is the point where the
    # curve bends the most, i.e. where the interior angle is smallest.
    return int(ks[1 + int(np.argmin(angles))])


def elbow_method(X, max_k=10):
    """Fit k-means for k = 1..max_k, plot the inertia curve and pick the elbow.

    The elbow is the interior point of the normalized (k, inertia) curve with
    the smallest angle between the segments to its two neighbours. The curve
    and a vertical marker at the chosen k are drawn on a new matplotlib
    figure; the figure is not shown here.

    Args:
        X: Feature matrix passed to ``KMeans.fit``.
        max_k: Largest number of clusters to try. Must be at least 3, because
            an elbow needs a point with a neighbour on each side.

    Returns:
        The selected number of clusters as an ``int`` in ``2..max_k - 1``.

    Raises:
        ValueError: If ``max_k`` is smaller than 3.
    """
    if max_k < 3:
        raise ValueError("max_k must be at least 3 to locate an elbow")

    K = range(1, max_k + 1)
    inertias = [
        KMeans(n_clusters=k, random_state=42, n_init=10).fit(X).inertia_
        for k in K
    ]

    # Plot the elbow curve
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 1, 1)
    plt.plot(K, inertias, 'bx-', linewidth=2, markersize=8, label='Inertia')
    plt.xlabel('Number of Clusters (k)', fontsize=12)
    plt.ylabel('Inertia', fontsize=12)
    plt.title('Elbow Method for Optimal k', fontsize=14)
    plt.grid(True)

    optimal_k = _locate_elbow(K, inertias)

    plt.axvline(x=optimal_k, color='r', linestyle='--', label=f'Optimal k = {optimal_k}')
    plt.legend(fontsize=10)
    plt.tight_layout()

    return optimal_k

# Function to perform k-means clustering and analyze results
def perform_kmeans_analysis(X, n_clusters, df, feature_names):
    """Cluster ``X`` with k-means and chart the resulting clusters.

    A new matplotlib figure is created with two panels: a bar chart of the
    number of samples per cluster and a heatmap of the cluster centers. The
    figure is not shown here.

    Args:
        X: Feature matrix passed to ``KMeans.fit_predict``.
        n_clusters: Number of clusters to fit.
        df: DataFrame with one row per row of ``X``; it is copied, not
            modified.
        feature_names: Column labels for the cluster-center table.

    Returns:
        A tuple ``(clusters, df_with_clusters, cluster_centers)``: the label
        array, a copy of ``df`` with an added ``'Cluster'`` column, and a
        DataFrame holding one row per cluster center.
    """
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = model.fit_predict(X)

    # Attach the labels to a copy so the caller's frame stays untouched.
    labeled = df.assign(Cluster=labels)

    centers = pd.DataFrame(model.cluster_centers_, columns=feature_names)

    plt.figure(figsize=(15, 8))

    # Left panel: how many samples landed in each cluster.
    size_ax = plt.subplot(1, 2, 1)
    counts = labeled['Cluster'].value_counts().sort_index()
    sns.barplot(x=counts.index, y=counts.values, ax=size_ax)
    size_ax.set_title('Cluster Sizes', fontsize=12)
    size_ax.set_xlabel('Cluster', fontsize=10)
    size_ax.set_ylabel('Number of Samples', fontsize=10)

    # Right panel: center coordinates per feature, colour scale centred on 0.
    heat_ax = plt.subplot(1, 2, 2)
    sns.heatmap(centers, cmap='coolwarm', center=0, ax=heat_ax)
    heat_ax.set_title('Cluster Centers Heatmap', fontsize=12)
    heat_ax.set_xlabel('Features', fontsize=10)
    heat_ax.set_ylabel('Cluster', fontsize=10)

    plt.tight_layout()

    return labels, labeled, centers

# Main execution
try:
    # Stage 1: read the CSV and standardize its feature columns.
    # NOTE(review): the input is named 'diabeties.csv' while the dropped
    # columns and the output file name refer to wine data -- confirm the
    # intended input file.
    X_scaled, df, feature_names = preprocess_data('diabeties.csv')

    # Stage 2: let the elbow heuristic choose the number of clusters.
    print("Calculating optimal number of clusters...")
    optimal_k = elbow_method(X_scaled)
    print(f"\nOptimal number of clusters (k) = {optimal_k}")

    # Stage 3: cluster with the chosen k and build the summary figure.
    print("\nPerforming K-means clustering and analysis...")
    clusters, df_with_clusters, cluster_centers = perform_kmeans_analysis(
        X=X_scaled,
        n_clusters=optimal_k,
        df=df,
        feature_names=feature_names,
    )

    # Stage 4: textual report.
    print("\nCluster Summary:", "-" * 50, "\nCluster Sizes:", sep="\n")
    print(df_with_clusters['Cluster'].value_counts().sort_index())

    print("\nCluster Centers (standardized values):")
    print(cluster_centers)

    # Stage 5: persist the labelled rows, then display the pending figures.
    df_with_clusters.to_csv('wine_clusters_results.csv', index=False)
    print("\nResults have been saved to 'wine_clusters_results.csv'")

    plt.show()

except FileNotFoundError:
    # Raised when the input CSV cannot be opened.
    print("Error: Could not find 'diabeties.csv'. Please make sure the file is in the current directory.")
except Exception as e:
    # Top-level boundary: report any other failure instead of a traceback.
    print(f"An error occurred: {e}")