In [None]:
# Imports and plotting backend

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from covariance import Clustering, PCAAnalysis, SubGroupAnalysis
from data_pre_processing import DataLoader, Preprocessor

In [None]:
# Load and clean full data
# `data_dir` is the directory where all your csv files are stored; the value
# below is a placeholder and must be replaced before running.
data_dir = r"/path/to/data"

# `loader` produces `df_full`, the frame every later cell in this notebook works on.
loader = DataLoader(data_dir)
df_full = loader.load_and_clean_data()

# Quick check: print the shape, then preview the first rows as the cell output.
print(df_full.shape)
df_full.head()

In [None]:
# Scaling preview: Preprocessor.scale_data() hands back the scaled data and
# the matching column names; print both to eyeball the result.
pre = Preprocessor(df_full)
data_scaled, col_names = pre.scale_data()

print("Scaled shape:", data_scaled.shape)
print("Columns:", [*col_names])

In [None]:
# Run KMeans on full data
# `clusterer` is created here and reused by the elbow, AHC, dendrogram, PCA
# and per-segment cells further down, so this cell must run before them.
clusterer = Clustering()
df_kmeans_full = clusterer.run_kmeans(df_full, n_clusters=3)
df_kmeans_full

In [None]:
# K-means elbow curve for the full dataset.
# WCSS is requested for k = 1..max_k (raise max_k to scan further); the
# x-axis is derived from however many values come back.
wcss = clusterer.kmeans_elbow(df_full, max_k=10)
ks = np.arange(1, len(wcss) + 1)

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(ks, wcss, marker='o', linestyle='-')
ax.set_xticks(ks)
ax.set_xlabel("Number of clusters (k)")
ax.set_ylabel("WCSS / Inertia")
ax.set_title("K-means Elbow - Full Data")
ax.grid(axis='y', linestyle=':', linewidth=0.5)

# Label every point with its WCSS value, anchored just above the marker.
for k, inertia in zip(ks, wcss):
    ax.text(k, inertia, f"{inertia:.1f}", fontsize=8, va='bottom', ha='center')

plt.show()

In [None]:
# Run Hierarchical Clustering (AHC) on full data
# Reuses `clusterer` from the KMeans cell and asks for the same number of
# clusters (n_clusters=3); the result is shown as the cell output.
df_ahc_full = clusterer.run_ahc(df_full, n_clusters=3)
df_ahc_full

In [None]:
# Dendrogram (full data)
# Per the original author's note, plot_dendrogram computes the linkage with
# SciPy and calls plt.show() itself, so no plotting code is needed here.
# NOTE(review): that behaviour lives in the `covariance` module — not visible from this notebook.
clusterer.plot_dendrogram(df_full, title="Dendrogram - Full Data")

In [None]:
# PCA on the full dataset: prepare/scale the frame, fit a 2-component PCA,
# and show the scores as the cell output.
# NOTE(review): this reaches into the underscore-prefixed helper
# Clustering._prepare_and_scale; consider exposing a public equivalent.
df_edit, data_scaled_full = clusterer._prepare_and_scale(df_full)
feature_names = list(df_edit.columns)

pca_full = PCAAnalysis(data_scaled_full, feature_names)
pca_scores = pca_full.fit_pca(n_components=2)

pca_scores

In [None]:
# Plot PCA Scree Plot and Biplot
# Both plots come from `pca_full`, fitted with n_components=2 in the previous
# cell. No save_path is passed here, unlike the end-to-end cell at the bottom
# of the notebook, which passes one to the same methods.
pca_full.plot_scree()
pca_full.plot_biplot()

In [None]:
# Set segmentation rules

# `labels` name the three RAP ranges; `breakpoints` holds the two cut points
# (0 and 0.4) between them. Both are passed to
# subgroup.clustering_by_segments in the segmented cells below.
# NOTE(review): whether each boundary is inclusive exactly as the labels state
# is decided inside SubGroupAnalysis — confirm against that class.
labels = ["RAP < 0", "0 ≤ RAP ≤ 0.4", "RAP > 0.4"]
breakpoints = (0, 0.4)

subgroup = SubGroupAnalysis()

In [None]:
# Segmented K-means: hand the full frame plus the segment labels/breakpoints
# to clustering_by_segments and request 3 clusters through method_args.
kmeans_segment_request = {
    "df": df_full,
    "labels": labels,
    "breakpoints": breakpoints,
    "method": "kmeans",
    "method_args": {"n_clusters": 3},
}
df_seg_kmeans = subgroup.clustering_by_segments(**kmeans_segment_request)

df_seg_kmeans

In [None]:
# Segmented AHC: same segment definition as the K-means variant, with the
# method switched to "ahc" and 3 clusters requested through method_args.
ahc_segment_request = {
    "df": df_full,
    "labels": labels,
    "breakpoints": breakpoints,
    "method": "ahc",
    "method_args": {"n_clusters": 3},
}
df_seg_ahc = subgroup.clustering_by_segments(**ahc_segment_request)

df_seg_ahc

In [None]:
# KMeans elbow + dendrogram for each segment
#
# For every segment this cell draws a K-means elbow curve and then a
# dendrogram, using the same `clusterer` methods as the full-data cells.
# Requires `df_full`, `clusterer`, `labels` and `breakpoints` from earlier cells.

max_k = 8   # adjust as needed

# NOTE(review): `segments` was never assigned in any cell of this notebook, so
# the loop below raised NameError on Restart & Run All. It is rebuilt here from
# `df_full` using the bounds spelled out in `labels`. The column name is
# inferred from those labels — confirm it matches the cleaned data, and that
# the boundary handling agrees with SubGroupAnalysis.clustering_by_segments.
segment_col = "RAP"
lower, upper = breakpoints
segment_values = df_full[segment_col]
segments = [
    df_full[segment_values < lower],
    df_full[(segment_values >= lower) & (segment_values <= upper)],
    df_full[segment_values > upper],
]

# Pair each label with its segment directly instead of indexing labels[i].
for label, seg in zip(labels, segments):
    print("\n--- Segment:", label, "| rows:", seg.shape[0], "cols:", seg.shape[1], "---")
    if seg.empty:
        print("Segment empty. Skipping.")
        continue

    # 1) KMeans elbow for the segment.
    # Best effort: a failure is reported and the dendrogram is still attempted.
    try:
        wcss_seg = clusterer.kmeans_elbow(seg, max_k=max_k)
        # Separate name so the `ks` array from the full-data elbow cell is not overwritten.
        ks_seg = np.arange(1, len(wcss_seg) + 1)
        plt.figure(figsize=(7,3.5))
        plt.plot(ks_seg, wcss_seg, marker='o', linestyle='-')
        plt.xticks(ks_seg)
        plt.xlabel("k")
        plt.ylabel("WCSS")
        plt.title(f"K-means Elbow - {label}")
        plt.grid(axis='y', linestyle=':', linewidth=0.5)
        for k, val in zip(ks_seg, wcss_seg):
            plt.text(k, val, f"{val:.1f}", fontsize=8, va='bottom', ha='center')
        plt.show()
    except Exception as e:
        print("Elbow error:", e)

    # 2) Dendrogram for the segment (same best-effort handling).
    try:
        clusterer.plot_dendrogram(seg, title=f"Dendrogram - {label}")
    except Exception as e:
        print("Dendrogram error:", e)


In [None]:
# Segmented PCA: same segment definition as the clustering variants, with the
# method set to "pca" and no method_args supplied.
pca_segment_request = {
    "df": df_full,
    "labels": labels,
    "breakpoints": breakpoints,
    "method": "pca",
}
df_seg_pca = subgroup.clustering_by_segments(**pca_segment_request)

df_seg_pca

In [None]:
# Stand-alone end-to-end run:
# load -> scale -> hierarchical clustering -> K-means elbow -> PCA plots.
#
# NOTE(review): this cell constructs `Clustering(data_scaled, columns)` and
# calls `hierarchical_clustering()` / `kmeans_elbow(max_k=...)` without a
# frame, whereas the cells above use `Clustering()` and pass a DataFrame to
# each method. Confirm which API `covariance.Clustering` currently exposes;
# one of the two call styles is likely stale.
# It also rebinds `data_dir`, `loader`, `data_scaled` and `wcss` from earlier
# cells, so re-running those cells afterwards sees different values.

# 1. Load and clean data
data_dir = r"data_path_here"   # placeholder: set to the folder holding the csv files
loader = DataLoader(data_dir)
df_all = loader.load_and_clean_data()

# 2. Preprocess and scale
preprocessor = Preprocessor(df_all)
data_scaled, columns = preprocessor.scale_data()

# 3. Hierarchical clustering
cluster = Clustering(data_scaled, columns)
linkage_matrix, cop_corr = cluster.hierarchical_clustering()
print("Cophenetic Correlation:", cop_corr)

# 4. KMeans Elbow
wcss = cluster.kmeans_elbow(max_k=4)
# Derive the x-axis from the values returned instead of a hard-coded
# range(1, 5), so changing max_k cannot cause a length mismatch in plt.plot.
elbow_ks = range(1, len(wcss) + 1)
plt.figure()
plt.plot(elbow_ks, wcss, marker='o', linestyle='--')
plt.xlabel("k")
plt.ylabel("WCSS")
plt.title("Elbow Method")
plt.show()

# 5. PCA Analysis
pca_analysis = PCAAnalysis(data_scaled, columns)
pca_analysis.fit_pca()
# NOTE(review): presumably the plots/PCA directory must already exist unless
# the plot methods create it — verify before a fresh run.
pca_analysis.plot_scree(save_path=r"plots/PCA/Scree_plot")
pca_analysis.plot_biplot(save_path=r"plots/PCA/PCA_biplot")