# In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage

# Load and preprocess data
# The encoding is pinned to latin-1 (presumably the CSV contains bytes that
# are not valid UTF-8 -- confirm against the data file).
df = pd.read_csv("sales_data_sample.csv", encoding='latin-1')

# Keep only the three numeric features used for clustering and drop every row
# that has a missing value in any of them. The explicit .copy() makes X an
# independent DataFrame: a chained column selection followed by dropna() can
# leave the result flagged as a copy of df, and adding a column to it later
# would then emit pandas' SettingWithCopyWarning.
X = df[['QUANTITYORDERED', 'PRICEEACH', 'SALES']].dropna().copy()

# Standardize each feature (zero mean, unit variance) so that columns with a
# large numeric range do not dominate the distance computations in K-Means.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Show head of DataFrame
print(X.head())

# Elbow Method for optimal k
# Fit K-Means once per candidate k and record the within-cluster sum of
# squares (the fitted model's inertia_). The "elbow" of the resulting curve
# is where adding more clusters stops reducing WCSS substantially.
#
# n_init is pinned explicitly: scikit-learn changed its default (10 restarts
# before 1.4, 'auto' afterwards, with a FutureWarning in 1.2-1.3), so leaving
# it implicit makes the curve depend on the installed version.
k_values = range(1, 11)
wcss = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_scaled).inertia_
    for k in k_values
]

plt.figure(figsize=(8, 4))
plt.plot(k_values, wcss, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS (Inertia)')
plt.title('Elbow Method for Optimal k')
plt.grid(True)
plt.show()

# Apply K-Means with optimal_k = 3 (as per Elbow plot observation)
# n_init is pinned explicitly because scikit-learn's default changed between
# releases (10 restarts before 1.4, 'auto' afterwards); an explicit value
# keeps the clustering reproducible across environments.
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
X['KMeans_Cluster'] = kmeans.fit_predict(X_scaled)

# Plot K-Means Clusters with Centroids
# The centroids live in standardized space; map them back to the original
# units so they can be drawn on the same axes as the raw data points.
centroids_scaled = kmeans.cluster_centers_
centroids_original = scaler.inverse_transform(centroids_scaled)

# Centroid columns follow the feature order the scaler was fitted on. The
# cluster-label column was appended after the features, so the positions of
# the feature columns in X still match that order; look them up by name
# instead of hard-coding the indices.
x_feature, y_feature = 'QUANTITYORDERED', 'SALES'
x_idx = X.columns.get_loc(x_feature)
y_idx = X.columns.get_loc(y_feature)

plt.figure(figsize=(15, 12))
plt.scatter(X[x_feature], X[y_feature], c=X['KMeans_Cluster'], cmap='viridis')
plt.scatter(centroids_original[:, x_idx], centroids_original[:, y_idx],
            s=300, c='red', marker='X', label='Centroids')

plt.xlabel(x_feature)
plt.ylabel(y_feature)
plt.title('K-Means Clustering with Centroids')
plt.legend()
plt.grid(True)
plt.show()


# Cell output:
# ModuleNotFoundError: No module named 'pandas'
# (pandas was not installed in the environment that executed this cell.)