In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Task
Process each CSV in "/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/" by loading it, applying the frozen PCA model from "/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/00. Models & Cluster Assignments/pca_model.pkl" to the 12 normalized feature columns (keeping the first 3 components), applying the frozen KMeans model (K=2) from "/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/00. Models & Cluster Assignments/kmeans_k2_pca_model.pkl" to assign a new cluster, comparing the new cluster to the original cluster assignments from "/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/00. Models & Cluster Assignments/full_time_series_cluster_assignments_pca.csv" (using `lv_feeder_unique_id` and the `cluster_k2_pca` column) to create a `moved_cluster` flag, and calculating the `distance_to_centroid` for each feeder in the 3D PCA space.

## List csv files

### Subtask:
Get a list of all simulated feature matrix CSV files in the specified folder.


**Reasoning**:
Import the `os` module and define the directory path. Then list and filter the files to get only the CSV files.



In [2]:
import os

simulated_data_dir = "/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/"
all_files = os.listdir(simulated_data_dir)
csv_files = [f for f in all_files if f.endswith(".csv")]

print(csv_files)


import numpy as np

['features_unmetered_load_0.05.csv', 'features_unmetered_load_0.1.csv', 'features_unmetered_load_0.2.csv', 'features_offpeak_exploitation_0.05.csv', 'features_offpeak_exploitation_0.1.csv', 'features_offpeak_exploitation_0.2.csv', 'features_gradual_drift_0.05.csv', 'features_gradual_drift_0.1.csv', 'features_gradual_drift_0.2.csv', 'features_smoothed_profile_0.05.csv', 'features_smoothed_profile_0.1.csv', 'features_smoothed_profile_0.2.csv', 'features_flatline_consumption_5.csv', 'features_flatline_consumption_10.csv', 'features_flatline_consumption_20.csv']


## Iterate through csvs

### Subtask:
Loop through each CSV file found in the previous step.


**Reasoning**:
Start a loop to iterate through the list of CSV files and construct the full path for each file.



In [3]:
import os

for csv_file in csv_files:
    current_csv_path = os.path.join(simulated_data_dir, csv_file)
    print(f"Processing file: {current_csv_path}")

Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_unmetered_load_0.05.csv
Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_unmetered_load_0.1.csv
Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_unmetered_load_0.2.csv
Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_offpeak_exploitation_0.05.csv
Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_offpeak_exploitation_0.1.csv
Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_offpeak_exploitation_0.2.csv
Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_gradual_drift_0.05.csv


## Load data and models

### Subtask:
Inside the loop, load the current simulated feature matrix CSV, the frozen PCA model, the frozen KMeans model (K=2), and the original cluster assignments.


**Reasoning**:
Load the current simulated feature matrix CSV, the frozen PCA model, the frozen KMeans model (K=2), and the original cluster assignments inside the loop.



In [4]:
import pandas as pd
import pickle
import os
import numpy as np

for csv_file in csv_files:
    current_csv_path = os.path.join(simulated_data_dir, csv_file)
    print(f"Processing file: {current_csv_path}")

    # 1. Load the current simulated feature matrix CSV
    simulated_df = pd.read_csv(current_csv_path)
    print(f"Loaded {current_csv_path}")

    # 2. Load the frozen PCA model
    pca_model_path = "/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/00. Models & Cluster Assignments/pca_model.pkl"
    with open(pca_model_path, 'rb') as f:
        pca_model = pickle.load(f)

    # 3. Load the frozen KMeans model (K=2)
    kmeans_model_path = "/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/00. Models & Cluster Assignments/kmeans_k2_pca_model.pkl"
    with open(kmeans_model_path, 'rb') as f:
        kmeans_model = pickle.load(f)

    centroids = kmeans_model.cluster_centers_ # Centroids are from the KMeans model

    # 4. Load the original cluster assignments and calculate original distance to centroid
    original_clusters_path = "/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/00. Models & Cluster Assignments/full_time_series_cluster_assignments_pca.csv"
    original_clusters_df = pd.read_csv(original_clusters_path)

    # Calculate distance to centroid for the original data
    def calculate_distance_original(row):
        cluster_id = row['cluster_k2_pca']
        centroid = centroids[cluster_id]
        point = row[['PC1', 'PC2', 'PC3']].values
        return np.linalg.norm(point - centroid)

    # Ensure 'cluster_k2_pca', 'PC1', 'PC2', 'PC3' are in original_clusters_df before calculation
    # Assuming these columns are present in the baseline file based on the original task description
    # If not, these columns would need to be generated for the baseline data as well.
    if all(col in original_clusters_df.columns for col in ['cluster_k2_pca', 'PC1', 'PC2', 'PC3']):
         original_clusters_df['distance_to_centroid'] = original_clusters_df.apply(calculate_distance_original, axis=1)
    else:
         print("Warning: Original PCA and cluster columns not found in baseline file. Cannot calculate original distance to centroid.")
         original_clusters_df['distance_to_centroid'] = np.nan # Add column with NaNs if data is missing


    # Prepare original distance and PCA subsets with explicit column names for merging
    original_distance_subset = original_clusters_df[['lv_feeder_unique_id', 'distance_to_centroid']].rename(columns={'distance_to_centroid': 'distance_to_centroid_original'})
    original_pca_subset = original_clusters_df[['lv_feeder_unique_id', 'PC1', 'PC2', 'PC3']].rename(columns={'PC1': 'PC1_original', 'PC2': 'PC2_original', 'PC3': 'PC3_original'})

    # Combine original distance and PCA subsets
    original_combined_df = pd.merge(original_distance_subset, original_pca_subset, on='lv_feeder_unique_id', how='left')


    features_for_pca = simulated_df[[
        'avg_daily_consumption_per_device',
        'std_daily_consumption_per_device',
        'avg_total_consumption_daily',
        'std_total_consumption_daily',
        'avg_max_consumption_daily',
        'std_max_consumption_daily',
        'max_to_mean_ratio',
        'std_to_mean_ratio',
        'per_device_peak_ratio',
        'max_to_min_ratio',
        'weekday_weekend_ratio',
        'mean_daily_time_at_peak'
    ]]

    # Convert features to numpy array to avoid UserWarning
    features_for_pca_np = features_for_pca.values

    pca_components = pca_model.transform(features_for_pca_np)
    pca_df = pd.DataFrame(pca_components[:, :3], columns=['PC1', 'PC2', 'PC3'])

    new_cluster_labels = kmeans_model.predict(pca_df)
    pca_df['new_cluster_k2_pca'] = new_cluster_labels

    # Add lv_feeder_unique_id to pca_df for merging
    pca_df['lv_feeder_unique_id'] = simulated_df['lv_feeder_unique_id']

    # Calculate the new distance to centroid for the current data
    def calculate_distance_current(row):
        cluster_id = row['new_cluster_k2_pca']
        centroid = centroids[cluster_id]
        point = row[['PC1', 'PC2', 'PC3']].values
        return np.linalg.norm(point - centroid)

    pca_df['distance_to_centroid_current'] = pca_df.apply(calculate_distance_current, axis=1)


    # Merge current data (pca_df) with original cluster assignments and the combined original data
    # Merge current PCA, new cluster, current distance with original cluster assignments
    merged_clusters_df = pd.merge(pca_df, original_clusters_df[['lv_feeder_unique_id', 'cluster_k2_pca']], on='lv_feeder_unique_id', how='left')

    # Now merge with the combined original distance and PCA data
    merged_clusters_df = pd.merge(merged_clusters_df, original_combined_df, on='lv_feeder_unique_id', how='left')


    # Calculate moved_cluster, delta_distance_to_centroid, and pca_shift_distance
    merged_clusters_df['moved_cluster'] = (merged_clusters_df['new_cluster_k2_pca'] != merged_clusters_df['cluster_k2_pca']).fillna(False)
    merged_clusters_df['delta_distance_to_centroid'] = merged_clusters_df['distance_to_centroid_current'] - merged_clusters_df['distance_to_centroid_original']

    merged_clusters_df['pca_shift_distance'] = merged_clusters_df.apply(
        lambda row: np.linalg.norm(row[['PC1', 'PC2', 'PC3']].values - row[['PC1_original', 'PC2_original', 'PC3_original']].values)
        if pd.notnull(row['PC1_original']) else np.nan, axis=1
    )

    # Keep the original PCA columns as requested
    # merged_clusters_df = merged_clusters_df.drop(columns=['PC1_original', 'PC2_original', 'PC3_original', 'distance_to_centroid_original'])


    # Append metadata columns
    metadata_cols = simulated_df[['lv_feeder_unique_id', 'is_anomalous', 'anomaly_type', 'anomaly_intensity', 'anomaly_id']]
    merged_clusters_df = pd.merge(merged_clusters_df, metadata_cols, on='lv_feeder_unique_id', how='left')

    # Define final columns including original PCA components
    final_columns = [
        'lv_feeder_unique_id',
        'PC1', 'PC2', 'PC3', # Current PCA values
        'PC1_original', 'PC2_original', 'PC3_original', # Original PCA values
        'cluster_k2_pca', 'new_cluster_k2_pca', 'moved_cluster',
        'distance_to_centroid_current', 'pca_shift_distance', 'delta_distance_to_centroid',
        'is_anomalous', 'anomaly_type', 'anomaly_intensity', 'anomaly_id'
    ]

    # Rename 'distance_to_centroid_current' to 'distance_to_centroid'
    merged_clusters_df = merged_clusters_df.rename(columns={'distance_to_centroid_current': 'distance_to_centroid'})

    # Update final_columns list with the renamed column name
    final_columns[final_columns.index('distance_to_centroid_current')] = 'distance_to_centroid'


    # Select and reorder columns, handling potential missing columns if any are added later
    merged_clusters_df = merged_clusters_df[final_columns]


    display(merged_clusters_df.head())

    output_dir = "/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Construct output filename and path
    output_filename = os.path.splitext(csv_file)[0] + '_projected_clustered.csv'
    output_filepath = os.path.join(output_dir, output_filename)

    # Save the processed DataFrame
    merged_clusters_df.to_csv(output_filepath, index=False)

    # Print file name, anomaly type and intensity, and moved feeder count
    print(f"Saved processed data to {output_filepath}")

    # Assuming anomaly_type and anomaly_intensity are consistent across rows for a given file
    # Taking the first non-null value for printing
    anomaly_type = merged_clusters_df['anomaly_type'].dropna().iloc[0] if not merged_clusters_df['anomaly_type'].dropna().empty else 'N/A'
    anomaly_intensity = merged_clusters_df['anomaly_intensity'].dropna().iloc[0] if not merged_clusters_df['anomaly_intensity'].dropna().empty else 'N/A'
    moved_feeders_count = merged_clusters_df['moved_cluster'].sum()

    print(f"Anomaly Type: {anomaly_type}, Anomaly Intensity: {anomaly_intensity}, Moved Feeders: {moved_feeders_count}")

    print("-" * 30) # Separator for clarity

Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_unmetered_load_0.05.csv
Loaded /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_unmetered_load_0.05.csv


Unnamed: 0,lv_feeder_unique_id,PC1,PC2,PC3,PC1_original,PC2_original,PC3_original,cluster_k2_pca,new_cluster_k2_pca,moved_cluster,distance_to_centroid,pca_shift_distance,delta_distance_to_centroid,is_anomalous,anomaly_type,anomaly_intensity,anomaly_id
0,SSEN-400600301501,-1.413686,-0.875913,0.916709,-1.413646,-0.876046,0.916995,0,0,False,1.199777,0.000318,-0.00027,0,,,
1,SSEN-400600301502,2.892979,-0.114451,1.195274,2.89153,-0.113339,1.191758,1,1,False,1.545768,0.003962,0.003528,1,unmetered_load,0.05,SSEN-400600301502-unmetered_load-0.05
2,SSEN-400600301503,-1.912485,-0.694745,0.922288,-1.912449,-0.694876,0.92257,0,0,False,1.421911,0.000313,-0.000197,0,,,
3,SSEN-400600301504,2.452913,-0.900514,0.523761,2.452982,-0.900628,0.524049,1,1,False,1.884124,0.000317,-0.000172,0,,,
4,SSEN-400600301505,-2.267568,-0.784923,1.448992,-2.267532,-0.785065,1.449287,0,0,False,2.050233,0.000329,-0.00022,0,,,


Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_unmetered_load_0.05_projected_clustered.csv
Anomaly Type: unmetered_load, Anomaly Intensity: 0.05, Moved Feeders: 0
------------------------------
Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_unmetered_load_0.1.csv
Loaded /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_unmetered_load_0.1.csv


Unnamed: 0,lv_feeder_unique_id,PC1,PC2,PC3,PC1_original,PC2_original,PC3_original,cluster_k2_pca,new_cluster_k2_pca,moved_cluster,distance_to_centroid,pca_shift_distance,delta_distance_to_centroid,is_anomalous,anomaly_type,anomaly_intensity,anomaly_id
0,SSEN-400600301501,-1.413726,-0.87578,0.916422,-1.413646,-0.876046,0.916995,0,0,False,1.199507,0.000637,-0.000541,0,,,
1,SSEN-400600301502,2.894428,-0.115559,1.198784,2.89153,-0.113339,1.191758,1,1,False,1.549292,0.007918,0.007052,1,unmetered_load,0.1,SSEN-400600301502-unmetered_load-0.1
2,SSEN-400600301503,-1.912521,-0.694615,0.922006,-1.912449,-0.694876,0.92257,0,0,False,1.421714,0.000626,-0.000394,0,,,
3,SSEN-400600301504,2.452844,-0.900402,0.523475,2.452982,-0.900628,0.524049,1,1,False,1.883954,0.000632,-0.000342,0,,,
4,SSEN-400600301505,-2.267603,-0.78478,1.448696,-2.267532,-0.785065,1.449287,0,0,False,2.050011,0.000659,-0.000443,0,,,


Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_unmetered_load_0.1_projected_clustered.csv
Anomaly Type: unmetered_load, Anomaly Intensity: 0.1, Moved Feeders: 0
------------------------------
Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_unmetered_load_0.2.csv
Loaded /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_unmetered_load_0.2.csv


Unnamed: 0,lv_feeder_unique_id,PC1,PC2,PC3,PC1_original,PC2_original,PC3_original,cluster_k2_pca,new_cluster_k2_pca,moved_cluster,distance_to_centroid,pca_shift_distance,delta_distance_to_centroid,is_anomalous,anomaly_type,anomaly_intensity,anomaly_id
0,SSEN-400600301501,-1.413805,-0.875514,0.915846,-1.413646,-0.876046,0.916995,0,0,False,1.198964,0.001276,-0.001083,0,,,
1,SSEN-400600301502,2.897327,-0.11776,1.20579,2.89153,-0.113339,1.191758,1,1,False,1.556324,0.015812,0.014085,1,unmetered_load,0.2,SSEN-400600301502-unmetered_load-0.2
2,SSEN-400600301503,-1.912594,-0.694354,0.921441,-1.912449,-0.694876,0.92257,0,0,False,1.421319,0.001252,-0.000788,0,,,
3,SSEN-400600301504,2.452706,-0.900181,0.522907,2.452982,-0.900628,0.524049,1,1,False,1.883619,0.001257,-0.000677,0,,,
4,SSEN-400600301505,-2.267674,-0.78449,1.448098,-2.267532,-0.785065,1.449287,0,0,False,2.049562,0.001328,-0.000892,0,,,


Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_unmetered_load_0.2_projected_clustered.csv
Anomaly Type: unmetered_load, Anomaly Intensity: 0.2, Moved Feeders: 0
------------------------------
Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_offpeak_exploitation_0.05.csv
Loaded /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_offpeak_exploitation_0.05.csv


Unnamed: 0,lv_feeder_unique_id,PC1,PC2,PC3,PC1_original,PC2_original,PC3_original,cluster_k2_pca,new_cluster_k2_pca,moved_cluster,distance_to_centroid,pca_shift_distance,delta_distance_to_centroid,is_anomalous,anomaly_type,anomaly_intensity,anomaly_id
0,SSEN-400600301501,-1.413654,-0.87602,0.916985,-1.413646,-0.876046,0.916995,0,0,False,1.20003,2.9e-05,-1.7e-05,0,,,
1,SSEN-400600301502,2.891569,-0.113648,1.191716,2.89153,-0.113339,1.191758,1,1,False,1.542422,0.000315,0.000182,1,offpeak_exploitation,0.05,SSEN-400600301502-offpeak_exploitation-0.05
2,SSEN-400600301503,-1.912456,-0.694851,0.922558,-1.912449,-0.694876,0.92257,0,0,False,1.422099,2.8e-05,-9e-06,0,,,
3,SSEN-400600301504,2.45297,-0.900596,0.524048,2.452982,-0.900628,0.524049,1,1,False,1.884266,3.4e-05,-3e-05,0,,,
4,SSEN-400600301505,-2.267539,-0.78504,1.449273,-2.267532,-0.785065,1.449287,0,0,False,2.050442,2.9e-05,-1.1e-05,0,,,


Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_offpeak_exploitation_0.05_projected_clustered.csv
Anomaly Type: offpeak_exploitation, Anomaly Intensity: 0.05, Moved Feeders: 0
------------------------------
Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_offpeak_exploitation_0.1.csv
Loaded /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_offpeak_exploitation_0.1.csv


Unnamed: 0,lv_feeder_unique_id,PC1,PC2,PC3,PC1_original,PC2_original,PC3_original,cluster_k2_pca,new_cluster_k2_pca,moved_cluster,distance_to_centroid,pca_shift_distance,delta_distance_to_centroid,is_anomalous,anomaly_type,anomaly_intensity,anomaly_id
0,SSEN-400600301501,-1.413657,-0.875998,0.916967,-1.413646,-0.876046,0.916995,0,0,False,1.200008,5.7e-05,-4e-05,0,,,
1,SSEN-400600301502,2.891604,-0.113946,1.19169,2.89153,-0.113339,1.191758,1,1,False,1.542609,0.000616,0.000369,1,offpeak_exploitation,0.1,SSEN-400600301502-offpeak_exploitation-0.1
2,SSEN-400600301503,-1.912458,-0.694831,0.922538,-1.912449,-0.694876,0.92257,0,0,False,1.422082,5.5e-05,-2.6e-05,0,,,
3,SSEN-400600301504,2.452957,-0.900565,0.524045,2.452982,-0.900628,0.524049,1,1,False,1.884237,6.7e-05,-5.8e-05,0,,,
4,SSEN-400600301505,-2.267539,-0.78502,1.449249,-2.267532,-0.785065,1.449287,0,0,False,2.050421,5.9e-05,-3.3e-05,0,,,


Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_offpeak_exploitation_0.1_projected_clustered.csv
Anomaly Type: offpeak_exploitation, Anomaly Intensity: 0.1, Moved Feeders: 0
------------------------------
Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_offpeak_exploitation_0.2.csv
Loaded /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_offpeak_exploitation_0.2.csv


Unnamed: 0,lv_feeder_unique_id,PC1,PC2,PC3,PC1_original,PC2_original,PC3_original,cluster_k2_pca,new_cluster_k2_pca,moved_cluster,distance_to_centroid,pca_shift_distance,delta_distance_to_centroid,is_anomalous,anomaly_type,anomaly_intensity,anomaly_id
0,SSEN-400600301501,-1.413661,-0.875959,0.916925,-1.413646,-0.876046,0.916995,0,0,False,1.199959,0.000112,-8.9e-05,0,,,
1,SSEN-400600301502,2.891682,-0.114517,1.191663,2.89153,-0.113339,1.191758,1,1,False,1.542984,0.001192,0.000744,1,offpeak_exploitation,0.2,SSEN-400600301502-offpeak_exploitation-0.2
2,SSEN-400600301503,-1.912459,-0.694797,0.922492,-1.912449,-0.694876,0.92257,0,0,False,1.422043,0.000111,-6.5e-05,0,,,
3,SSEN-400600301504,2.452934,-0.90051,0.524033,2.452982,-0.900628,0.524049,1,1,False,1.884183,0.000128,-0.000112,0,,,
4,SSEN-400600301505,-2.267535,-0.784985,1.449196,-2.267532,-0.785065,1.449287,0,0,False,2.050372,0.00012,-8.1e-05,0,,,


Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_offpeak_exploitation_0.2_projected_clustered.csv
Anomaly Type: offpeak_exploitation, Anomaly Intensity: 0.2, Moved Feeders: 0
------------------------------
Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_gradual_drift_0.05.csv
Loaded /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_gradual_drift_0.05.csv


Unnamed: 0,lv_feeder_unique_id,PC1,PC2,PC3,PC1_original,PC2_original,PC3_original,cluster_k2_pca,new_cluster_k2_pca,moved_cluster,distance_to_centroid,pca_shift_distance,delta_distance_to_centroid,is_anomalous,anomaly_type,anomaly_intensity,anomaly_id
0,SSEN-400600301501,-1.413662,-0.87592,0.916758,-1.413646,-0.876046,0.916995,0,0,False,1.199809,0.000269,-0.000238,0,,,
1,SSEN-400600301502,2.89228,-0.114414,1.19467,2.89153,-0.113339,1.191758,1,1,False,1.545193,0.003194,0.002953,1,gradual_drift,0.05,SSEN-400600301502-gradual_drift-0.05
2,SSEN-400600301503,-1.912463,-0.694751,0.922336,-1.912449,-0.694876,0.92257,0,0,False,1.421929,0.000265,-0.000178,0,,,
3,SSEN-400600301504,2.452949,-0.900518,0.52381,2.452982,-0.900628,0.524049,1,1,False,1.884136,0.000265,-0.00016,0,,,
4,SSEN-400600301505,-2.267547,-0.784931,1.449044,-2.267532,-0.785065,1.449287,0,0,False,2.050258,0.000277,-0.000195,0,,,


Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_gradual_drift_0.05_projected_clustered.csv
Anomaly Type: gradual_drift, Anomaly Intensity: 0.05, Moved Feeders: 0
------------------------------
Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_gradual_drift_0.1.csv
Loaded /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_gradual_drift_0.1.csv


Unnamed: 0,lv_feeder_unique_id,PC1,PC2,PC3,PC1_original,PC2_original,PC3_original,cluster_k2_pca,new_cluster_k2_pca,moved_cluster,distance_to_centroid,pca_shift_distance,delta_distance_to_centroid,is_anomalous,anomaly_type,anomaly_intensity,anomaly_id
0,SSEN-400600301501,-1.413679,-0.875794,0.91652,-1.413646,-0.876046,0.916995,0,0,False,1.199571,0.000539,-0.000476,0,,,
1,SSEN-400600301502,2.893037,-0.115489,1.197573,2.89153,-0.113339,1.191758,1,1,False,1.548142,0.006381,0.005902,1,gradual_drift,0.1,SSEN-400600301502-gradual_drift-0.1
2,SSEN-400600301503,-1.912478,-0.694627,0.922102,-1.912449,-0.694876,0.92257,0,0,False,1.421751,0.00053,-0.000357,0,,,
3,SSEN-400600301504,2.452916,-0.900409,0.523572,2.452982,-0.900628,0.524049,1,1,False,1.883977,0.000528,-0.000319,0,,,
4,SSEN-400600301505,-2.267561,-0.784797,1.4488,-2.267532,-0.785065,1.449287,0,0,False,2.050062,0.000556,-0.000392,0,,,


Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_gradual_drift_0.1_projected_clustered.csv
Anomaly Type: gradual_drift, Anomaly Intensity: 0.1, Moved Feeders: 0
------------------------------
Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_gradual_drift_0.2.csv
Loaded /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_gradual_drift_0.2.csv


Unnamed: 0,lv_feeder_unique_id,PC1,PC2,PC3,PC1_original,PC2_original,PC3_original,cluster_k2_pca,new_cluster_k2_pca,moved_cluster,distance_to_centroid,pca_shift_distance,delta_distance_to_centroid,is_anomalous,anomaly_type,anomaly_intensity,anomaly_id
0,SSEN-400600301501,-1.413712,-0.875542,0.916044,-1.413646,-0.876046,0.916995,0,0,False,1.199094,0.001078,-0.000953,0,,,
1,SSEN-400600301502,2.894571,-0.117638,1.203354,2.89153,-0.113339,1.191758,1,1,False,1.554026,0.012735,0.011786,1,gradual_drift,0.2,SSEN-400600301502-gradual_drift-0.2
2,SSEN-400600301503,-1.912507,-0.694378,0.921634,-1.912449,-0.694876,0.92257,0,0,False,1.421394,0.001061,-0.000713,0,,,
3,SSEN-400600301504,2.452848,-0.900193,0.5231,2.452982,-0.900628,0.524049,1,1,False,1.883662,0.001052,-0.000633,0,,,
4,SSEN-400600301505,-2.26759,-0.784527,1.448308,-2.267532,-0.785065,1.449287,0,0,False,2.049666,0.001118,-0.000788,0,,,


Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_gradual_drift_0.2_projected_clustered.csv
Anomaly Type: gradual_drift, Anomaly Intensity: 0.2, Moved Feeders: 0
------------------------------
Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_smoothed_profile_0.05.csv
Loaded /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_smoothed_profile_0.05.csv


Unnamed: 0,lv_feeder_unique_id,PC1,PC2,PC3,PC1_original,PC2_original,PC3_original,cluster_k2_pca,new_cluster_k2_pca,moved_cluster,distance_to_centroid,pca_shift_distance,delta_distance_to_centroid,is_anomalous,anomaly_type,anomaly_intensity,anomaly_id
0,SSEN-400600301501,-1.413685,-0.875971,0.917083,-1.413646,-0.876046,0.916995,0,0,False,1.200098,0.000122,5.1e-05,0,,,
1,SSEN-400600301502,2.89165,-0.114912,1.189753,2.89153,-0.113339,1.191758,1,1,False,1.541859,0.002552,-0.000381,1,smoothed_profile,0.05,SSEN-400600301502-smoothed_profile-0.05
2,SSEN-400600301503,-1.912492,-0.694803,0.922655,-1.912449,-0.694876,0.92257,0,0,False,1.422176,0.000121,6.8e-05,0,,,
3,SSEN-400600301504,2.452975,-0.900539,0.524152,2.452982,-0.900628,0.524049,1,1,False,1.884236,0.000136,-5.9e-05,0,,,
4,SSEN-400600301505,-2.267578,-0.784991,1.449374,-2.267532,-0.785065,1.449287,0,0,False,2.050529,0.000122,7.6e-05,0,,,


Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_smoothed_profile_0.05_projected_clustered.csv
Anomaly Type: smoothed_profile, Anomaly Intensity: 0.05, Moved Feeders: 0
------------------------------
Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_smoothed_profile_0.1.csv
Loaded /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_smoothed_profile_0.1.csv


Unnamed: 0,lv_feeder_unique_id,PC1,PC2,PC3,PC1_original,PC2_original,PC3_original,cluster_k2_pca,new_cluster_k2_pca,moved_cluster,distance_to_centroid,pca_shift_distance,delta_distance_to_centroid,is_anomalous,anomaly_type,anomaly_intensity,anomaly_id
0,SSEN-400600301501,-1.413695,-0.875924,0.917123,-1.413646,-0.876046,0.916995,0,0,False,1.200112,0.000183,6.5e-05,0,,,
1,SSEN-400600301502,2.891551,-0.116236,1.188191,2.89153,-0.113339,1.191758,1,1,False,1.5416,0.004596,-0.00064,1,smoothed_profile,0.1,SSEN-400600301502-smoothed_profile-0.1
2,SSEN-400600301503,-1.912503,-0.694761,0.922689,-1.912449,-0.694876,0.92257,0,0,False,1.422195,0.000175,8.7e-05,0,,,
3,SSEN-400600301504,2.452972,-0.900465,0.524239,2.452982,-0.900628,0.524049,1,1,False,1.884185,0.000251,-0.00011,0,,,
4,SSEN-400600301505,-2.267582,-0.784953,1.449398,-2.267532,-0.785065,1.449287,0,0,False,2.050541,0.000166,8.7e-05,0,,,


Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_smoothed_profile_0.1_projected_clustered.csv
Anomaly Type: smoothed_profile, Anomaly Intensity: 0.1, Moved Feeders: 0
------------------------------
Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_smoothed_profile_0.2.csv
Loaded /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_smoothed_profile_0.2.csv


Unnamed: 0,lv_feeder_unique_id,PC1,PC2,PC3,PC1_original,PC2_original,PC3_original,cluster_k2_pca,new_cluster_k2_pca,moved_cluster,distance_to_centroid,pca_shift_distance,delta_distance_to_centroid,is_anomalous,anomaly_type,anomaly_intensity,anomaly_id
0,SSEN-400600301501,-1.413614,-0.875929,0.91704,-1.413646,-0.876046,0.916995,0,0,False,1.200016,0.000129,-3.2e-05,0,,,
1,SSEN-400600301502,2.891377,-0.118538,1.185485,2.89153,-0.113339,1.191758,1,1,False,1.541162,0.008149,-0.001078,1,smoothed_profile,0.2,SSEN-400600301502-smoothed_profile-0.2
2,SSEN-400600301503,-1.912414,-0.694779,0.922582,-1.912449,-0.694876,0.92257,0,0,False,1.422066,0.000104,-4.1e-05,0,,,
3,SSEN-400600301504,2.452984,-0.900366,0.524347,2.452982,-0.900628,0.524049,1,1,False,1.884115,0.000397,-0.000181,0,,,
4,SSEN-400600301505,-2.267457,-0.784996,1.449237,-2.267532,-0.785065,1.449287,0,0,False,2.050352,0.000114,-0.000101,0,,,


Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_smoothed_profile_0.2_projected_clustered.csv
Anomaly Type: smoothed_profile, Anomaly Intensity: 0.2, Moved Feeders: 0
------------------------------
Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_flatline_consumption_5.csv
Loaded /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_flatline_consumption_5.csv


Unnamed: 0,lv_feeder_unique_id,PC1,PC2,PC3,PC1_original,PC2_original,PC3_original,cluster_k2_pca,new_cluster_k2_pca,moved_cluster,distance_to_centroid,pca_shift_distance,delta_distance_to_centroid,is_anomalous,anomaly_type,anomaly_intensity,anomaly_id
0,SSEN-400600301501,-1.414723,-0.87405,0.921543,-1.413646,-0.876046,0.916995,0,0,False,1.203142,0.005081,0.003094,0,,,
1,SSEN-400600301502,2.887728,-0.140585,1.135475,2.89153,-0.113339,1.191758,1,1,False,1.520126,0.062646,-0.022114,1,flatline_consumption,5.0,SSEN-400600301502-flatline_consumption-5
2,SSEN-400600301503,-1.913632,-0.692864,0.927164,-1.912449,-0.694876,0.92257,0,0,False,1.42547,0.005153,0.003362,0,,,
3,SSEN-400600301504,2.452718,-0.898805,0.528278,2.452982,-0.900628,0.524049,1,1,False,1.883582,0.004613,-0.000713,0,,,
4,SSEN-400600301505,-2.268825,-0.782983,1.453978,-2.267532,-0.785065,1.449287,0,0,False,2.054207,0.005293,0.003753,0,,,


Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_flatline_consumption_5_projected_clustered.csv
Anomaly Type: flatline_consumption, Anomaly Intensity: 5.0, Moved Feeders: 0
------------------------------
Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_flatline_consumption_10.csv
Loaded /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_flatline_consumption_10.csv


Unnamed: 0,lv_feeder_unique_id,PC1,PC2,PC3,PC1_original,PC2_original,PC3_original,cluster_k2_pca,new_cluster_k2_pca,moved_cluster,distance_to_centroid,pca_shift_distance,delta_distance_to_centroid,is_anomalous,anomaly_type,anomaly_intensity,anomaly_id
0,SSEN-400600301501,-1.414723,-0.87405,0.921543,-1.413646,-0.876046,0.916995,0,0,False,1.203142,0.005081,0.003094,0,,,
1,SSEN-400600301502,2.887728,-0.140585,1.135475,2.89153,-0.113339,1.191758,1,1,False,1.520126,0.062646,-0.022114,1,flatline_consumption,10.0,SSEN-400600301502-flatline_consumption-10
2,SSEN-400600301503,-1.913632,-0.692864,0.927164,-1.912449,-0.694876,0.92257,0,0,False,1.42547,0.005153,0.003362,0,,,
3,SSEN-400600301504,2.452718,-0.898805,0.528278,2.452982,-0.900628,0.524049,1,1,False,1.883582,0.004613,-0.000713,0,,,
4,SSEN-400600301505,-2.268825,-0.782983,1.453978,-2.267532,-0.785065,1.449287,0,0,False,2.054207,0.005293,0.003753,0,,,


Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_flatline_consumption_10_projected_clustered.csv
Anomaly Type: flatline_consumption, Anomaly Intensity: 10.0, Moved Feeders: 0
------------------------------
Processing file: /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_flatline_consumption_20.csv
Loaded /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/features_flatline_consumption_20.csv


Unnamed: 0,lv_feeder_unique_id,PC1,PC2,PC3,PC1_original,PC2_original,PC3_original,cluster_k2_pca,new_cluster_k2_pca,moved_cluster,distance_to_centroid,pca_shift_distance,delta_distance_to_centroid,is_anomalous,anomaly_type,anomaly_intensity,anomaly_id
0,SSEN-400600301501,-1.414723,-0.87405,0.921543,-1.413646,-0.876046,0.916995,0,0,False,1.203142,0.005081,0.003094,0,,,
1,SSEN-400600301502,2.887728,-0.140585,1.135475,2.89153,-0.113339,1.191758,1,1,False,1.520126,0.062646,-0.022114,1,flatline_consumption,20.0,SSEN-400600301502-flatline_consumption-20
2,SSEN-400600301503,-1.913632,-0.692864,0.927164,-1.912449,-0.694876,0.92257,0,0,False,1.42547,0.005153,0.003362,0,,,
3,SSEN-400600301504,2.452718,-0.898805,0.528278,2.452982,-0.900628,0.524049,1,1,False,1.883582,0.004613,-0.000713,0,,,
4,SSEN-400600301505,-2.268825,-0.782983,1.453978,-2.267532,-0.785065,1.449287,0,0,False,2.054207,0.005293,0.003753,0,,,


Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_flatline_consumption_20_projected_clustered.csv
Anomaly Type: flatline_consumption, Anomaly Intensity: 20.0, Moved Feeders: 0
------------------------------


## Prepare data for pca

### Subtask:
Prepare the simulated data for PCA transformation by selecting the 12 normalized feature columns.


**Reasoning**:
I will select the 12 specified feature columns from the `simulated_df` and store them in a new DataFrame called `features_for_pca` as instructed by the subtask.



In [None]:
features_for_pca = simulated_df[[
    'avg_daily_consumption_per_device',
    'std_daily_consumption_per_device',
    'avg_total_consumption_daily',
    'std_total_consumption_daily',
    'avg_max_consumption_daily',
    'std_max_consumption_daily',
    'max_to_mean_ratio',
    'std_to_mean_ratio',
    'per_device_peak_ratio',
    'max_to_min_ratio',
    'weekday_weekend_ratio',
    'mean_daily_time_at_peak'
]]
display(features_for_pca.head())

Unnamed: 0,avg_daily_consumption_per_device,std_daily_consumption_per_device,avg_total_consumption_daily,std_total_consumption_daily,avg_max_consumption_daily,std_max_consumption_daily,max_to_mean_ratio,std_to_mean_ratio,per_device_peak_ratio,max_to_min_ratio,weekday_weekend_ratio,mean_daily_time_at_peak
0,-0.827331,-0.379248,-0.579241,-0.421154,-0.510509,-0.420581,0.661995,0.886974,-0.578056,0.620394,-0.285701,0.161642
1,-0.583338,-0.493753,1.249189,1.63611,1.454633,1.63601,-0.415931,-0.255803,-0.593288,-0.446118,-0.099695,0.184021
2,-0.718357,-0.301199,-0.705913,-0.63321,-0.653227,-0.633162,1.039041,0.97584,-0.415895,0.758406,-0.192835,0.146723
3,-0.85584,-0.76776,0.978409,1.042522,1.004555,1.042926,-0.724716,-0.669562,-0.875431,-0.588679,0.235086,0.176561
4,-0.848854,-0.337756,-0.761177,-0.657335,-0.702802,-0.656844,1.269677,1.272838,-0.498435,1.188158,-0.660332,0.161642


## Apply pca

### Subtask:
Apply the loaded PCA model to the selected 12 normalized feature columns and keep the first 3 principal components.


**Reasoning**:
Apply the loaded PCA model to the selected features and keep the first 3 components, then store them in a new DataFrame.



In [None]:
pca_components = pca_model.transform(features_for_pca)
pca_df = pd.DataFrame(pca_components[:, :3], columns=['PC1', 'PC2', 'PC3'])
display(pca_df.head())



Unnamed: 0,PC1,PC2,PC3
0,-1.413646,-0.876046,0.916995
1,2.89153,-0.113339,1.191758
2,-1.912449,-0.694876,0.92257
3,2.452982,-0.900628,0.524049
4,-2.267532,-0.785065,1.449287


## Apply kmeans

### Subtask:
Assign new cluster labels to the PCA-transformed data using the loaded KMeans model.


**Reasoning**:
Apply the loaded KMeans model to the PCA-transformed data to get the new cluster assignments and add them to the pca_df DataFrame.



In [None]:
new_cluster_labels = kmeans_model.predict(pca_df)
pca_df['new_cluster_k2_pca'] = new_cluster_labels
display(pca_df.head())

Unnamed: 0,PC1,PC2,PC3,new_cluster_k2_pca
0,-1.413646,-0.876046,0.916995,0
1,2.89153,-0.113339,1.191758,1
2,-1.912449,-0.694876,0.92257,0
3,2.452982,-0.900628,0.524049,1
4,-2.267532,-0.785065,1.449287,0


## Compare cluster assignments

### Subtask:
Merge the new cluster assignments with the original cluster assignments based on `lv_feeder_unique_id` and add a `moved_cluster` flag.


**Reasoning**:
Merge the new cluster assignments with the original cluster assignments and create the moved_cluster flag.



In [None]:
original_clusters_subset = original_clusters_df[['lv_feeder_unique_id', 'cluster_k2_pca']]
merged_clusters_df = pd.merge(pca_df, original_clusters_subset, left_index=True, right_index=True, how='left')
merged_clusters_df['moved_cluster'] = (merged_clusters_df['new_cluster_k2_pca'] != merged_clusters_df['cluster_k2_pca']).fillna(False)
display(merged_clusters_df.head())

Unnamed: 0,PC1,PC2,PC3,new_cluster_k2_pca,lv_feeder_unique_id,cluster_k2_pca,moved_cluster
0,-1.413646,-0.876046,0.916995,0,SSEN-400600301501,0,False
1,2.89153,-0.113339,1.191758,1,SSEN-400600301502,1,False
2,-1.912449,-0.694876,0.92257,0,SSEN-400600301503,0,False
3,2.452982,-0.900628,0.524049,1,SSEN-400600301504,1,False
4,-2.267532,-0.785065,1.449287,0,SSEN-400600301505,0,False


## Calculate distance to centroid

### Subtask:
Compute the Euclidean distance between each feeder's 3D PCA point and its assigned cluster centroid.


**Reasoning**:
Compute the Euclidean distance between each feeder's 3D PCA point and its assigned cluster centroid and store it in a new column.



In [None]:
import numpy as np

centroids = kmeans_model.cluster_centers_

def calculate_distance(row):
    cluster_id = row['new_cluster_k2_pca']
    centroid = centroids[cluster_id]
    point = row[['PC1', 'PC2', 'PC3']].values
    return np.linalg.norm(point - centroid)

merged_clusters_df['distance_to_centroid'] = merged_clusters_df.apply(calculate_distance, axis=1)
display(merged_clusters_df.head())

Unnamed: 0,PC1,PC2,PC3,new_cluster_k2_pca,lv_feeder_unique_id,cluster_k2_pca,moved_cluster,distance_to_centroid
0,-1.413646,-0.876046,0.916995,0,SSEN-400600301501,0,False,1.200047
1,2.89153,-0.113339,1.191758,1,SSEN-400600301502,1,False,1.54224
2,-1.912449,-0.694876,0.92257,0,SSEN-400600301503,0,False,1.422108
3,2.452982,-0.900628,0.524049,1,SSEN-400600301504,1,False,1.884296
4,-2.267532,-0.785065,1.449287,0,SSEN-400600301505,0,False,2.050454


## Store results (optional but recommended)

### Subtask:
Save the processed data (including new cluster assignments, moved cluster flag, and distance to centroid) for each CSV.


**Reasoning**:
To save the processed data, I will first define the output directory. Then, inside the loop, I will construct the output filename based on the input filename and save the `merged_clusters_df` DataFrame to a CSV file in the specified directory, excluding the index. I will also add a print statement to confirm that the file has been saved.



In [None]:
output_dir = "/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

for csv_file in csv_files:
    current_csv_path = os.path.join(simulated_data_dir, csv_file)
    simulated_df = pd.read_csv(current_csv_path)
    features_for_pca = simulated_df.iloc[:, 1:13]
    pca_components = pca_model.transform(features_for_pca)
    pca_df = pd.DataFrame(pca_components[:, :3], columns=['PC1', 'PC2', 'PC3'])
    new_cluster_labels = kmeans_model.predict(pca_df)
    pca_df['new_cluster_k2_pca'] = new_cluster_labels

    # Add lv_feeder_unique_id to pca_df for merging
    pca_df['lv_feeder_unique_id'] = simulated_df['lv_feeder_unique_id']

    original_clusters_subset = original_clusters_df[['lv_feeder_unique_id', 'cluster_k2_pca']]
    merged_clusters_df = pd.merge(pca_df, original_clusters_subset, on='lv_feeder_unique_id', how='left')

    merged_clusters_df['moved_cluster'] = (merged_clusters_df['new_cluster_k2_pca'] != merged_clusters_df['cluster_k2_pca'])

    centroids = kmeans_model.cluster_centers_
    def calculate_distance(row):
        cluster_id = row['new_cluster_k2_pca']
        centroid = centroids[cluster_id]
        point = row[['PC1', 'PC2', 'PC3']].values
        return np.linalg.norm(point - centroid)

    merged_clusters_df['distance_to_centroid'] = merged_clusters_df.apply(calculate_distance, axis=1)

    output_filename = os.path.splitext(csv_file)[0] + '_processed.csv'
    output_filepath = os.path.join(output_dir, output_filename)
    merged_clusters_df.to_csv(output_filepath, index=False)
    print(f"Saved processed data to {output_filepath}")




Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_unmetered_load_0.05_processed.csv




Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_unmetered_load_0.1_processed.csv
Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_unmetered_load_0.2_processed.csv




Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_offpeak_exploitation_0.05_processed.csv
Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_offpeak_exploitation_0.1_processed.csv




Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_offpeak_exploitation_0.2_processed.csv
Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_gradual_drift_0.05_processed.csv




Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_gradual_drift_0.1_processed.csv
Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_gradual_drift_0.2_processed.csv




Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_smoothed_profile_0.05_processed.csv
Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_smoothed_profile_0.1_processed.csv




Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_smoothed_profile_0.2_processed.csv
Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_flatline_consumption_5_processed.csv




Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_flatline_consumption_10_processed.csv
Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_flatline_consumption_20_processed.csv




## Summary:

### Data Analysis Key Findings

*   The process successfully loaded 15 simulated feature matrix CSV files from the specified directory.
*   For each file, pre-trained PCA and KMeans (K=2) models were successfully loaded from pickle files.
*   The original cluster assignments were loaded from a CSV file.
*   A specific set of 12 normalized feature columns were selected from each simulated feature matrix for PCA transformation.
*   The loaded PCA model was applied to the selected features, and the first 3 principal components were retained.
*   The loaded KMeans model was applied to the 3D PCA data to assign new cluster labels.
*   The new cluster assignments were successfully merged with the original cluster assignments using `lv_feeder_unique_id`.
*   A `moved_cluster` flag was successfully created, indicating whether a feeder's cluster assignment changed compared to the original.
*   The Euclidean distance from each feeder's 3D PCA point to its assigned cluster centroid was calculated and added as a new column.
*   The processed data for each simulated feature matrix, including the new cluster, moved cluster flag, and distance to centroid, was successfully saved to a new CSV file in a designated output directory.

### Insights or Next Steps

*   The processed files containing cluster assignments, moved cluster flags, and distances to centroids are now available for further analysis, such as identifying which simulated scenarios cause feeders to change clusters or exhibit larger distances from their assigned centroids.
*   These processed results can be used to evaluate the impact of different simulated conditions on feeder clustering and to potentially identify scenarios that represent anomalous behavior based on cluster changes or distance from centroids.


# Task
Process each CSV file in "/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/" by loading the data, applying PCA using the model from "/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/00. Models & Cluster Assignments/pca_model.pkl", applying KMeans (K=2) using the model from "/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/00. Models & Cluster Assignments/kmeans_k2_pca_model.pkl", comparing new cluster assignments with original assignments from "/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/00. Models & Cluster Assignments/full_time_series_cluster_assignments_pca.csv", calculating distance to centroid and PCA shift distance, preserving metadata columns (`is_anomalous`, `anomaly_type`, `anomaly_intensity`, `anomaly_id`), saving the resulting DataFrame to the same folder with the suffix "_projected_clustered.csv", and printing the file name, anomaly type and intensity, and the number of feeders flagged as moved.

## Compute pca shift distance

### Subtask:
If available, compare the current (PC1, PC2, PC3) with the original PCA projection for the same feeder from the baseline dataset and calculate the Euclidean distance as `pca_shift_distance`.


**Reasoning**:
Calculate the Euclidean distance between the current PCA components and the original PCA components for each feeder.



In [None]:
import numpy as np

# Ensure original_clusters_df has the required PCA columns
original_pca_subset = original_clusters_df[['lv_feeder_unique_id', 'PC1', 'PC2', 'PC3']]

# Merge the current data with the original PCA data
merged_clusters_df = pd.merge(merged_clusters_df, original_pca_subset, on='lv_feeder_unique_id', how='left', suffixes=('_current', '_original'))

# Calculate the Euclidean distance between current and original PCA points
merged_clusters_df['pca_shift_distance'] = merged_clusters_df.apply(
    lambda row: np.linalg.norm(row[['PC1_current', 'PC2_current', 'PC3_current']].values - row[['PC1_original', 'PC2_original', 'PC3_original']].values)
    if pd.notnull(row['PC1_original']) else np.nan, axis=1
)

# Drop the original PCA columns after calculation
merged_clusters_df = merged_clusters_df.drop(columns=['PC1_original', 'PC2_original', 'PC3_original'])

display(merged_clusters_df.head())

Unnamed: 0,PC1_current,PC2_current,PC3_current,new_cluster_k2_pca,lv_feeder_unique_id,cluster_k2_pca,moved_cluster,distance_to_centroid,pca_shift_distance
0,-1.413646,-0.876046,0.916995,0,SSEN-400600301501,0,False,1.200047,1.110223e-16
1,2.89153,-0.113339,1.191758,1,SSEN-400600301502,1,False,1.54224,4.1633360000000003e-17
2,-1.912449,-0.694876,0.92257,0,SSEN-400600301503,0,False,1.422108,0.0
3,2.452982,-0.900628,0.524049,1,SSEN-400600301504,1,False,1.884296,1.110223e-16
4,-2.267532,-0.785065,1.449287,0,SSEN-400600301505,0,False,2.050454,2.482534e-16


## Append metadata

### Subtask:
Ensure the original metadata columns (`is_anomalous`, `anomaly_type`, `anomaly_intensity`, `anomaly_id`) from the input file are preserved in the final DataFrame.


**Reasoning**:
Select the metadata columns from the simulated_df and merge them into merged_clusters_df.



In [None]:
metadata_cols = simulated_df[['lv_feeder_unique_id', 'is_anomalous', 'anomaly_type', 'anomaly_intensity', 'anomaly_id']]
merged_clusters_df = pd.merge(merged_clusters_df, metadata_cols, on='lv_feeder_unique_id', how='left')
display(merged_clusters_df.head())

Unnamed: 0,PC1_current,PC2_current,PC3_current,new_cluster_k2_pca,lv_feeder_unique_id,cluster_k2_pca,moved_cluster,distance_to_centroid,pca_shift_distance,is_anomalous,anomaly_type,anomaly_intensity,anomaly_id
0,-1.413646,-0.876046,0.916995,0,SSEN-400600301501,0,False,1.200047,1.110223e-16,0,,,
1,2.89153,-0.113339,1.191758,1,SSEN-400600301502,1,False,1.54224,4.1633360000000003e-17,1,flatline_consumption,20.0,SSEN-400600301502-flatline_consumption-20
2,-1.912449,-0.694876,0.92257,0,SSEN-400600301503,0,False,1.422108,0.0,0,,,
3,2.452982,-0.900628,0.524049,1,SSEN-400600301504,1,False,1.884296,1.110223e-16,0,,,
4,-2.267532,-0.785065,1.449287,0,SSEN-400600301505,0,False,2.050454,2.482534e-16,0,,,


## Save & logging

### Subtask:
Save the final DataFrame to the specified folder with the required naming convention and print the file name, anomaly type and intensity, and the number of feeders flagged as moved.


**Reasoning**:
Save the final DataFrame to the specified folder with the required naming convention and print the file name, anomaly type and intensity, and the number of feeders flagged as moved.



In [None]:
output_dir = "/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Construct output filename and path
output_filename = os.path.splitext(csv_file)[0] + '_projected_clustered.csv'
output_filepath = os.path.join(output_dir, output_filename)

# Save the processed DataFrame
merged_clusters_df.to_csv(output_filepath, index=False)

# Print file name, anomaly type and intensity, and moved feeder count
print(f"Saved processed data to {output_filepath}")

# Assuming anomaly_type and anomaly_intensity are consistent across rows for a given file
# Taking the first non-null value for printing
anomaly_type = merged_clusters_df['anomaly_type'].dropna().iloc[0] if not merged_clusters_df['anomaly_type'].dropna().empty else 'N/A'
anomaly_intensity = merged_clusters_df['anomaly_intensity'].dropna().iloc[0] if not merged_clusters_df['anomaly_intensity'].dropna().empty else 'N/A'
moved_feeders_count = merged_clusters_df['moved_cluster'].sum()

print(f"Anomaly Type: {anomaly_type}, Anomaly Intensity: {anomaly_intensity}, Moved Feeders: {moved_feeders_count}")

Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_flatline_consumption_20_projected_clustered.csv
Anomaly Type: flatline_consumption, Anomaly Intensity: 20.0, Moved Feeders: 0


## Summary:

### Data Analysis Key Findings

*   The Euclidean distance between the current and original 3-component PCA projections was calculated for each feeder and stored in the `pca_shift_distance` column.
*   The original metadata columns (`is_anomalous`, `anomaly_type`, `anomaly_intensity`, `anomaly_id`) from the input CSV files were successfully preserved and added to the processed DataFrame.
*   For the processed file `features_flatline_consumption_20.csv`, the anomaly type was `flatline_consumption` with an intensity of `20.0`.
*   In the file `features_flatline_consumption_20.csv`, zero feeders were flagged as having moved clusters compared to their original assignments.

### Insights or Next Steps

*   The `pca_shift_distance` provides a quantitative measure of how much a feeder's behavior has changed in the PCA space, which could be used as an anomaly indicator, especially when combined with a threshold.
*   Investigate the cases where feeders are flagged as having moved clusters or have high PCA shift distances to understand the characteristics of these changes and their relationship to the simulated anomalies.


# Task
Process each CSV file in "/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/01. Simulated Feature Matrices/", applying pre-trained PCA ("/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/00. Models & Cluster Assignments/pca_model.pkl") and KMeans (K=2, "/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/00. Models & Cluster Assignments/kmeans_k2_pca_model.pkl") models to the 12 normalized feature columns. For each file, project the data to 3 principal components (PC1, PC2, PC3), assign new cluster labels, compare with original cluster assignments ("/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/00. Models & Cluster Assignments/full_time_series_cluster_assignments_pca.csv") to identify 'moved_cluster' (1 if different, 0 otherwise), calculate 'distance_to_centroid' in PCA space, compute 'pca_shift_distance' by comparing with original PCA projections from the baseline dataset (if available), calculate 'delta_distance_to_centroid' by subtracting the original distance to centroid from the new distance, preserve original metadata ('is_anomalous', 'anomaly_type', 'anomaly_intensity', 'anomaly_id'), and save the resulting DataFrame to the same folder with the suffix "_projected_clustered.csv". Log the processed file name, anomaly type and intensity, and the number of moved feeders.

## Load original distance to centroid

### Subtask:
Load the original distance to centroid data from the baseline dataset.


**Reasoning**:
Load the original cluster assignments and PCA data and select the required columns for the original distance to centroid.



In [None]:
original_clusters_df = pd.read_csv("/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/00. Models & Cluster Assignments/full_time_series_cluster_assignments_pca.csv")
original_distance_subset = original_clusters_df[['lv_feeder_unique_id', 'distance_to_centroid']]
display(original_distance_subset.head())

KeyError: "['distance_to_centroid'] not in index"

**Reasoning**:
The previous code failed because the 'distance_to_centroid' column does not exist in the `original_clusters_df`. I need to calculate it first before selecting it.



In [None]:
import numpy as np

# Load the original cluster assignments and PCA data
original_clusters_df = pd.read_csv("/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/00. Models & Cluster Assignments/full_time_series_cluster_assignments_pca.csv")

# Load the KMeans model to get centroids
kmeans_model_path = "/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/00. Models & Cluster Assignments/kmeans_k2_pca_model.pkl"
with open(kmeans_model_path, 'rb') as f:
    kmeans_model = pickle.load(f)

centroids = kmeans_model.cluster_centers_

# Calculate distance to centroid for the original data
def calculate_distance(row):
    cluster_id = row['cluster_k2_pca']
    centroid = centroids[cluster_id]
    point = row[['PC1', 'PC2', 'PC3']].values
    return np.linalg.norm(point - centroid)

original_clusters_df['distance_to_centroid'] = original_clusters_df.apply(calculate_distance, axis=1)

# Select the required columns
original_distance_subset = original_clusters_df[['lv_feeder_unique_id', 'distance_to_centroid']]
display(original_distance_subset.head())


Unnamed: 0,lv_feeder_unique_id,distance_to_centroid
0,SSEN-400600301501,1.200047
1,SSEN-400600301502,1.54224
2,SSEN-400600301503,1.422108
3,SSEN-400600301504,1.884296
4,SSEN-400600301505,2.050454


## Calculate delta distance to centroid

### Subtask:
Compute the change in distance to centroid by subtracting the original distance from the new distance.


**Reasoning**:
Merge the current data with the original distance to centroid data and calculate the change in distance.



In [None]:
# Merge the current data with the original distance to centroid data
merged_clusters_df = pd.merge(merged_clusters_df, original_distance_subset, on='lv_feeder_unique_id', how='left', suffixes=('_current', '_original'))

# Calculate the change in distance to centroid
merged_clusters_df['delta_distance_to_centroid'] = merged_clusters_df['distance_to_centroid_current'] - merged_clusters_df['distance_to_centroid_original']

# Drop the original distance column after calculation
merged_clusters_df = merged_clusters_df.drop(columns=['distance_to_centroid_original'])

display(merged_clusters_df.head())

Unnamed: 0,PC1_current,PC2_current,PC3_current,new_cluster_k2_pca,lv_feeder_unique_id,cluster_k2_pca,moved_cluster,distance_to_centroid_current,pca_shift_distance,is_anomalous,anomaly_type,anomaly_intensity,anomaly_id,delta_distance_to_centroid
0,-1.413646,-0.876046,0.916995,0,SSEN-400600301501,0,False,1.200047,1.110223e-16,0,,,,2.220446e-16
1,2.89153,-0.113339,1.191758,1,SSEN-400600301502,1,False,1.54224,4.1633360000000003e-17,1,flatline_consumption,20.0,SSEN-400600301502-flatline_consumption-20,0.0
2,-1.912449,-0.694876,0.92257,0,SSEN-400600301503,0,False,1.422108,0.0,0,,,,0.0
3,2.452982,-0.900628,0.524049,1,SSEN-400600301504,1,False,1.884296,1.110223e-16,0,,,,0.0
4,-2.267532,-0.785065,1.449287,0,SSEN-400600301505,0,False,2.050454,2.482534e-16,0,,,,4.440892e-16


## Save & logging

### Subtask:
Save & logging


**Reasoning**:
Save the final DataFrame to the specified folder with the required naming convention and print the file name, anomaly type and intensity, and the number of feeders flagged as moved.



In [None]:
output_dir = "/content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Construct output filename and path
output_filename = os.path.splitext(csv_file)[0] + '_projected_clustered.csv'
output_filepath = os.path.join(output_dir, output_filename)

# Save the processed DataFrame
merged_clusters_df.to_csv(output_filepath, index=False)

# Print file name, anomaly type and intensity, and moved feeder count
print(f"Saved processed data to {output_filepath}")

# Assuming anomaly_type and anomaly_intensity are consistent across rows for a given file
# Taking the first non-null value for printing
anomaly_type = merged_clusters_df['anomaly_type'].dropna().iloc[0] if not merged_clusters_df['anomaly_type'].dropna().empty else 'N/A'
anomaly_intensity = merged_clusters_df['anomaly_intensity'].dropna().iloc[0] if not merged_clusters_df['anomaly_intensity'].dropna().empty else 'N/A'
moved_feeders_count = merged_clusters_df['moved_cluster'].sum()

print(f"Anomaly Type: {anomaly_type}, Anomaly Intensity: {anomaly_intensity}, Moved Feeders: {moved_feeders_count}")

Saved processed data to /content/drive/MyDrive/Anomaly Detection Work/1. Clustering/02. Processed Simulated Feature Matrices/features_flatline_consumption_20_projected_clustered.csv
Anomaly Type: flatline_consumption, Anomaly Intensity: 20.0, Moved Feeders: 0


## Summary:

### Data Analysis Key Findings

*   The original baseline dataset did not contain a pre-calculated 'distance\_to\_centroid' column; this metric needed to be computed based on the PCA projections and KMeans centroids from the baseline data.
*   For the processed file `features_flatline_consumption_20_projected_clustered.csv`, the anomaly type was 'flatline\_consumption' with an intensity of 20.0.
*   For the `features_flatline_consumption_20_projected_clustered.csv` file, zero feeders changed their cluster assignment compared to the baseline clustering.

### Insights or Next Steps

*   The 'delta\_distance\_to\_centroid' and 'moved\_cluster' metrics provide valuable information for identifying how anomalies affect the position of data points within the PCA space and their cluster assignments, which can be used for anomaly detection.
*   Analyze the distribution of 'delta\_distance\_to\_centroid' and 'moved\_cluster' across different anomaly types and intensities to understand their impact on the clustering structure.
