In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.fft import fft, fftfreq
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# 1. Data Loading and Preprocessing
def load_and_preprocess_data(folder_path):
    all_data = []
    filenames = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".csv"):
            try:
                df = pd.read_csv(os.path.join(folder_path, filename))
                # Assuming 'VRM' column contains the fault voltage data
                if 'VRM' in df.columns:  
                    vrm_data = df['VRM'].values
                    # Noise Reduction (Simple Moving Average) - adjust window as needed
                    window_size = 5  
                    vrm_data = np.convolve(vrm_data, np.ones(window_size), 'valid') / window_size

                    # Feature extraction
                    features = {
                      'mean': np.mean(vrm_data),
                      'std': np.std(vrm_data),
                      'variance': np.var(vrm_data),
                    }
                    
                    # Fourier Transformation
                    N = len(vrm_data)
                    yf = fft(vrm_data)
                    xf = fftfreq(N, 1)  # Assuming a sampling rate of 1
                    dominant_frequency_index = np.argmax(np.abs(yf[1:N//2])) + 1 #Ignore DC component
                    features['dominant_frequency'] = xf[dominant_frequency_index]

                    all_data.append(list(features.values()))
                    filenames.append(filename)
                else:
                    print(f"Warning: 'VRM' column not found in {filename}. Skipping.")
            except pd.errors.ParserError:
                print(f"Warning: Could not parse {filename}. Skipping.")

    return np.array(all_data), filenames

# 2. LSTM Feature Engineering (Optional, if you want to use LSTM features)
def extract_lstm_features(data):
    #Reshape data
    X = np.array(data).reshape(len(data), 1, len(data[0]))
    model = Sequential()
    model.add(LSTM(50, activation='relu', input_shape=(1, len(data[0]))))  # Adjust input_shape
    model.add(Dense(len(data[0])))  # Output size matches input features
    model.compile(optimizer='adam', loss='mse')
    model.fit(X, X, epochs=10, verbose=0)  # Train on the same data to get LSTM representations
    lstm_features = model.predict(X)
    return lstm_features.reshape(len(data), len(data[0]))

# 3. Clustering
def perform_clustering(data, n_clusters=10):
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    clusters = kmeans.fit_predict(scaled_data)
    return clusters

# 4. Organize Files
def organize_files(folder_path, filenames, clusters):
    for i, cluster_label in enumerate(clusters):
        cluster_folder = os.path.join(folder_path, f"cluster_{cluster_label}")
        os.makedirs(cluster_folder, exist_ok=True)
        source_file = os.path.join(folder_path, filenames[i])
        destination_file = os.path.join(cluster_folder, filenames[i])
        os.rename(source_file, destination_file)

# Main execution
folder_path = 'testing_data_trial_2_with_features'  # Replace with the actual folder path
data, filenames = load_and_preprocess_data(folder_path)

#Optional LSTM Feature extraction. Comment out if not needed.
lstm_data = extract_lstm_features(data)
clusters = perform_clustering(lstm_data)

clusters = perform_clustering(data)
organize_files(folder_path, filenames, clusters)

  super().__init__(**kwargs)


[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


In [None]:
import numpy as np