# 04 — Stock Clustering

Applies K-Means and Gaussian Mixture Model clustering on PCA-reduced features.

In [None]:
import sys, os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.clustering import StockClusterer

sns.set_style('whitegrid')
pd.set_option('display.max_columns', 100)
%matplotlib inline

## 1. Load Data

In [None]:
# Read the three parquet inputs used by the rest of the notebook.
pca_data = pd.read_parquet("../data/processed/pca_data.parquet")
feature_matrix = pd.read_parquet("../data/processed/feature_matrix.parquet")
fundamentals_df = pd.read_parquet("../data/raw/fundamentals.parquet")

# Build one sector label per row of pca_data.
# NOTE(review): assumes fundamentals_df is indexed by the same labels as
# pca_data (e.g. tickers) — confirm against the data-preparation step.
if 'sector' in fundamentals_df.columns:
    sector_column = fundamentals_df['sector']
    # Keep only the first entry per label so the lookup below yields exactly
    # one sector per pca_data row (reindex refuses duplicate labels).
    sector_column = sector_column[~sector_column.index.duplicated(keep='first')]
    # reindex (not .loc) so that labels present in pca_data but absent from
    # fundamentals_df become NaN -> "Unknown" instead of raising KeyError.
    sectors = sector_column.reindex(pca_data.index).fillna("Unknown")
else:
    # No sector information at all: label every row "Unknown".
    sectors = pd.Series("Unknown", index=pca_data.index)

print(f"PCA data: {pca_data.shape}")
print(f"Feature matrix: {feature_matrix.shape}")

## 2. K-Means Clustering

In [None]:
# Create the project clusterer (max_clusters=10) and run its K-Means
# evaluation on the PCA-reduced data.
# NOTE(review): presumably this sweeps candidate cluster counts up to
# max_clusters — confirm in src.clustering.
clusterer = StockClusterer(
    max_clusters=10,
)
kmeans_metrics = clusterer.evaluate_kmeans(pca_data)

# Draw the elbow / silhouette diagnostics; save=True asks the clusterer to
# persist the figure as well.
clusterer.plot_elbow_and_silhouette(
    save=True,
)

print("K-Means evaluation complete")

## 3. Fit Optimal K-Means

In [None]:
# Fit K-Means on the PCA-reduced data and keep the per-row cluster labels.
kmeans_labels = clusterer.fit_kmeans(pca_data)

# Report how many rows landed in each cluster, ordered by cluster id.
kmeans_cluster_sizes = kmeans_labels.value_counts().sort_index()
print(f"Cluster distribution:\n{kmeans_cluster_sizes}")

## 4. Visualize K-Means Clusters

In [None]:
# Scatter the K-Means labels over the PCA coordinates; save=True asks the
# clusterer to persist the figure.
clusterer.plot_cluster_scatter(
    pca_data,
    kmeans_labels,
    title="K-Means Clusters in PCA Space",
    save=True,
)

## 5. Cluster Composition

In [None]:
# Plot the K-Means clusters against the sector labels (figure is saved).
clusterer.plot_cluster_composition(
    kmeans_labels,
    sectors,
    save=True,
)

# Build the per-cluster summary from the labels, the full feature matrix,
# and the sector labels, then display it.
cluster_summary = clusterer.get_cluster_summary(
    kmeans_labels,
    feature_matrix,
    sectors,
)
print(cluster_summary)

## 6. Gaussian Mixture Model Clustering

In [None]:
# Run the clusterer's Gaussian Mixture Model evaluation on the PCA data.
gmm_metrics = clusterer.evaluate_gmm(pca_data)

# Draw the GMM evaluation figure; save=True asks the clusterer to persist it.
clusterer.plot_gmm_evaluation(
    save=True,
)

print("GMM evaluation complete")

## 7. Fit GMM

In [None]:
# Fit the GMM; the clusterer returns the hard labels together with a second
# value bound here as gmm_probs.
# NOTE(review): presumably per-row membership probabilities — confirm in
# src.clustering.
gmm_labels, gmm_probs = clusterer.fit_gmm(pca_data)

# Report how many rows landed in each GMM cluster, ordered by cluster id.
gmm_cluster_sizes = gmm_labels.value_counts().sort_index()
print(f"GMM cluster distribution:\n{gmm_cluster_sizes}")

# Scatter the GMM labels over the PCA coordinates, saving under an explicit
# filename.
clusterer.plot_cluster_scatter(
    pca_data,
    gmm_labels,
    title="GMM Clusters in PCA Space",
    save=True,
    filename="gmm_cluster_scatter.png",
)

## 8. Compare K-Means vs GMM

In [None]:
# Put both label assignments side by side, one column per method.
comparison = pd.DataFrame(
    {
        "KMeans": kmeans_labels,
        "GMM": gmm_labels,
    }
)

# Contingency table: rows are K-Means clusters, columns are GMM clusters.
kmeans_vs_gmm = pd.crosstab(comparison["KMeans"], comparison["GMM"])

print("Cross-tabulation of K-Means vs GMM:")
print(kmeans_vs_gmm)

## 9. Save Results

In [None]:
# Collect both label assignments into one table indexed like pca_data.
label_columns = {
    "kmeans_cluster": kmeans_labels,
    "gmm_cluster": gmm_labels,
}
cluster_results = pd.DataFrame(label_columns, index=pca_data.index)

# Write the labels to the processed data directory as parquet.
cluster_results.to_parquet("../data/processed/cluster_labels.parquet")

print(f"Saved cluster labels: {cluster_results.shape}")