In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from scipy.fft import fft, fftfreq
import pywt

# Function to preprocess and extract features from a CSV file
def preprocess_and_extract_features(file_path):
    try:
        df = pd.read_csv(file_path)
        
        # Extract VRM column and drop NaN values
        vrm_data = df['VRM'].dropna().values  # Drop NaN values
        
        if len(vrm_data) == 0:
            print(f"No valid VRM data in {file_path}. Skipping this file.")
            return None
        
        # Normalize the data
        scaler = MinMaxScaler()
        normalized_vrm = scaler.fit_transform(vrm_data.reshape(-1, 1))

        # Feature extraction using Fourier Transformation
        N = len(normalized_vrm)
        yf = fft(normalized_vrm.flatten())
        xf = fftfreq(N, 1)  # Assuming a sampling rate of 1 for now
        positive_frequencies = xf[:N//2]
        magnitudes = 2/N * np.abs(yf[:N//2])
        fft_features = magnitudes

        # Feature extraction using Wavelet Transformation
        coeffs = pywt.wavedec(normalized_vrm.flatten(), 'db4', level=4)  # Example wavelet and level
        wavelet_features = np.concatenate([np.array(c).flatten() for c in coeffs])

        # Combine Features
        all_features = np.concatenate((fft_features, wavelet_features))
        return all_features

    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Folder containing the CSV files
folder_path = 'testing_data_trial_2'  # Replace with your folder path

# List to store features for all files
all_features = []

# Loop through all CSV files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)
        features = preprocess_and_extract_features(file_path)
        if features is not None:
            all_features.append(features)

# Convert features to a numpy array
features_matrix = np.array(all_features)

# Apply KMeans clustering
n_clusters = 10  # Adjust the number of clusters as needed
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(features_matrix)
labels = kmeans.labels_

# Create cluster folders
output_folder = 'Signature Fault Clusters Version 3'
os.makedirs(output_folder, exist_ok=True)

for i in range(n_clusters):
    cluster_folder = os.path.join(output_folder, f'VRM/VRM Cluster {i}')
    os.makedirs(cluster_folder, exist_ok=True)

# Move files to respective cluster folders
file_list = [filename for filename in os.listdir(folder_path) if filename.endswith(".csv")]
for i, file in enumerate(file_list):
    source_path = os.path.join(folder_path, file)
    # Check if the file has valid features before moving
    if i < len(labels):  # Ensure we only access labels for valid features
        destination_path = os.path.join(output_folder, f'VRM/VRM Cluster {labels[i]}', file)
        os.rename(source_path, destination_path)

print("Clustering and file organization completed.")

  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))


ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values