In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go
import config

Plot the diversity of the source in a PCA plot

In [5]:
run = "case1_run2"

symbol_map = {"div0": "circle", "div5": "square", "div10": "diamond"}
color_map = {"div0": "blue", "div5": "red", "div10": "green"}

# load CONCAT features
feats = ['abs_energy','intermittency','mean','median','kurtosis','skewness',
         'standard_deviation','agg_autocorrelation_max','erraticness','agg_linear_trend_slope']
concat = pd.read_parquet(f"{config.path_to_data}/{run}/interim/CONCAT_feat.parquet")[['id_ts', 'data'] + feats]

all_sources = []

for div in ["10", "5", "0"]:
    source = pd.read_parquet(f"{config.path_to_data}/{run}/interim/source_div{div}{run}/source_div{div}{run}.parquet")
    source_feat = concat[concat.id_ts.isin(source.id_ts.unique())]
    source_feat["div"] = f"div{div}"
    all_sources.append(source_feat)

combined_source = pd.concat(all_sources, ignore_index=True)

# Sampling for efficient evaluation
sample_size_per_div = 4000 
sampled_data = []

for div_value in ["div0", "div5", "div10"]:
    subset = combined_source[combined_source["div"] == div_value]
    sample_n = min(sample_size_per_div, len(subset))
    sampled_subset = subset.sample(n=sample_n, random_state=42)
    sampled_data.append(sampled_subset)

sampled_combined = pd.concat(sampled_data, ignore_index=True)

# Scale on sampled data
scaler = StandardScaler()
features_scaled = pd.DataFrame(
    scaler.fit_transform(sampled_combined[feats]),
    columns=feats
)

pca_meth = PCA(n_components=3)
pca_result = pca_meth.fit_transform(features_scaled)
print("Explained PCA variance ratio is " + str(pca_meth.explained_variance_ratio_))

pca_df_3d = pd.DataFrame(pca_result, columns=["PC1", "PC2", "PC3"])
pca_df_3d["Div"] = sampled_combined["div"].values

legend_names = {"div0": "Least", "div5": "Median", "div10": "Most"}

# 3d pca
fig_pca_3d = go.Figure()
for div in ["div10", "div5", "div0"]:
    subset = pca_df_3d[pca_df_3d["Div"] == div]
    
    fig_pca_3d.add_trace(go.Scatter(
        x=subset["PC2"],
        y=subset["PC3"],
        mode='markers',
        marker=dict(
            symbol=symbol_map[div],
            color=color_map[div],
            size=8,
            opacity=0.6,
            line=dict(width=0.5, color='white')
        ),
        name=legend_names[div],
        legendgroup=div,
        showlegend=True
    ))

fig_pca_3d.update_layout(
    xaxis=dict(
        title=dict(
            text=f"PC2",
            font=dict(size=20)
        ),
        tickfont=dict(size=18)
    ),
    yaxis=dict(
        title=dict(
            text=f"PC3",
            font=dict(size=20)
        ),
        tickfont=dict(size=18)
    ),
    legend=dict(
        title=dict(text="Source Diversity", font=dict(size=20)),
        font=dict(size=18),
        itemsizing='constant'  
    ),
    width=900,
    height=700
)

# saving requires the package kaleido
#fig_pca_3d.write_image(config.path_to_evaluation + "/combined_source_pca_23.png")
#fig_pca_3d.write_image(config.path_to_evaluation + "/combined_source_pca_23.pdf")

# showing the pca plot requires nbformat
#fig_pca_3d.show()

print("\nExplained Variance (PCA):")
for i, ratio in enumerate(pca_meth.explained_variance_ratio_):
    print(f"PC{i+1}: {ratio:.3f}")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Explained PCA variance ratio is [0.4126743  0.29429556 0.0993607 ]

Explained Variance (PCA):
PC1: 0.413
PC2: 0.294
PC3: 0.099
