# LSTM MODEL CREATION FOR FEATURE EXTRACTION

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from keras._tf_keras.keras.models import Sequential 
from keras._tf_keras.keras.layers import Dense, LSTM, Dropout

def preprocess_csv(file_path):
    try:
        df = pd.read_csv(file_path)
        vrm_data = df['VRM'].values.reshape(-1, 1)
        
        # Check for NaN values
        if np.any(np.isnan(vrm_data)):
            print(f"NaN values found in {file_path}.")
            # Check if there are any valid values to compute the mean
            if np.count_nonzero(~np.isnan(vrm_data)) > 0:
                mean_value = np.nanmean(vrm_data)
                print(f"Filling NaNs with the mean: {mean_value}")
                vrm_data = np.nan_to_num(vrm_data, nan=mean_value)
            else:
                print(f"All values are NaN in {file_path}. Skipping this file.")
                return None  # Skip this file if all values are NaN
        
        scaler = MinMaxScaler()
        vrm_data_scaled = scaler.fit_transform(vrm_data)
        return vrm_data_scaled
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

def fourier_transform(data):
    fft_data = np.fft.fft(data)
    fft_data = np.abs(fft_data)
    return fft_data

folder_path = 'VRM_data'

processed_data = []
file_names = []
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)
        data = preprocess_csv(file_path)
        if data is not None:
            processed_data.append(data)
            file_names.append(filename)



sequence_length = 30
# Preparing Data for LSTM
sequence_length = 30
X = []
for data in processed_data:
    for i in range(len(data) - sequence_length + 1):
        X.append(data[i:i + sequence_length])

# Convert to NumPy array and reshape
X = np.array(X)
X = X.reshape(X.shape[0], sequence_length, 1)  # Reshape to (number_of_samples, sequence_length, num_features)

# Check for NaN values in X
if np.any(np.isnan(X)):
    print("NaN values found in X. Exiting.")
    exit()

# Create a more complex LSTM model
model = Sequential()
model.add(LSTM(100, return_sequences=True, input_shape=(sequence_length, 1)))  # First LSTM layer
model.add(Dropout(0.2))  # Dropout layer to prevent overfitting
model.add(LSTM(50, return_sequences=True))  # Second LSTM layer
model.add(Dropout(0.2))  # Another Dropout layer
model.add(LSTM(25))  # Third LSTM layer
model.add(Dropout(0.2))  # Dropout layer
model.add(Dense(30))  # Output layer
model.compile(optimizer='adam', loss='mse')

# Fit the model
model.fit(X, X, epochs=30, batch_size=32)  # Increased epochs for better training

  super().__init__(**kwargs)


Epoch 1/30
[1m11789/11789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m299s[0m 25ms/step - loss: 0.0263
Epoch 2/30
[1m11789/11789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m281s[0m 24ms/step - loss: 0.0034
Epoch 3/30
[1m11789/11789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m254s[0m 22ms/step - loss: 0.0032
Epoch 4/30
[1m11789/11789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1483s[0m 126ms/step - loss: 0.0030
Epoch 5/30
[1m11789/11789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m233s[0m 20ms/step - loss: 0.0030
Epoch 6/30
[1m11789/11789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m227s[0m 19ms/step - loss: 0.0029
Epoch 7/30
[1m11789/11789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m255s[0m 22ms/step - loss: 0.0028
Epoch 8/30
[1m11789/11789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m275s[0m 23ms/step - loss: 0.0028
Epoch 9/30
[1m11789/11789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m271s[0m 23ms/step - loss: 0.0028
Epoch 10/30
[1m11789/1178

<keras.src.callbacks.history.History at 0x1c298890d10>

# FEATURE EXTRACTION
## Two Features
### LSTM & Fourier Transform

In [3]:
# Predict LSTM features

lstm_features = model.predict(X)
lstm_features = lstm_features.reshape(lstm_features.shape[0], -1)
fourier_features = [fourier_transform(data.flatten()) for data in processed_data]

# Combine features for clustering
combined_features = []
for lstm_f, fourier_f in zip(lstm_features, fourier_features):
    combined_features.append(np.concatenate((lstm_f, fourier_f[:10])))


[1m11789/11789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 5ms/step


# K Means Clustering
### 10 Clusters

In [4]:
# KMeans clustering
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(combined_features)
labels = kmeans.labels_

# Create directories for clusters
os.makedirs('Signature Fault Clusters Version 16 Final/VRM', exist_ok=True)
for i in range(10):
    os.makedirs(os.path.join('Signature Fault Clusters Version 16 Final/VRM', f'VRM Cluster {i}'), exist_ok=True)

# Move files to their respective clusters
for i, filename in enumerate(file_names):
    cluster_label = labels[i]
    source_path = os.path.join(folder_path, filename)
    destination_path = os.path.join('Signature Fault Clusters Version 16 Final/VRM', f'VRM Cluster {cluster_label}', filename)
    os.rename(source_path, destination_path)
print(f"Files successfully clustered.")

Files successfully clustered.


# To Find Accuracy of K Means Clustering

In [5]:
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Evaluate clustering without ground truth

# Silhouette Score: Measures how similar a data point is to its cluster vs other clusters
silhouette_avg = silhouette_score(combined_features, labels)
print(f"Silhouette Score: {silhouette_avg:.4f}")

# Davies-Bouldin Index: Measures the ratio of within-cluster distances to between-cluster distances
db_index = davies_bouldin_score(combined_features, labels)
print(f"Davies-Bouldin Index: {db_index:.4f}")

Silhouette Score: 0.3460
Davies-Bouldin Index: 0.9572


Rating for Clustering Performance
The clustering performance can be evaluated as follows:

1. Silhouette Score: 0.3460
Interpretation:
The Silhouette Score measures how similar data points are within their own cluster compared to other clusters.
The score ranges from -1 to 1, where:
> 0.5 indicates well-defined clusters.
0.2 to 0.5 indicates moderately defined clusters.
< 0.2 suggests overlapping or poorly defined clusters.
A score of 0.3460 suggests that the clustering is moderately effective, but there is room for improvement in cluster separation.
2. Davies-Bouldin Index (DBI): 0.9572
Interpretation:
DBI measures the average similarity between clusters, with lower values indicating better clustering.
Typical ranges for DBI:
< 1.0: Excellent clustering.
1.0–1.5: Good clustering.
> 1.5: Poor clustering.
A DBI of 0.9572 indicates excellent clustering, where the clusters are compact and well-separated.
Overall Rating
Considering the moderate Silhouette Score and the excellent DBI, the clustering performance can be rated as "Good".
While the DBI reflects strong inter-cluster separation, the moderate Silhouette Score suggests some clusters might still overlap or have less cohesion.
Recommendation:
To further improve performance:
Tune the number of clusters (k): Experiment with different values for better cluster definitions.
Feature Engineering: Explore additional or more relevant features for the dataset.
Clustering Algorithm: Compare KMeans with other algorithms like DBSCAN or Gaussian Mixture Models for potentially better results.

# CSV Files to Graphs 

In [15]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Path to the folder containing CSV files
csv_folder = 'Signature Fault Clusters Version 16 Final/VRM/VRM Cluster 9'
output_folder = 'Signature Fault Clusters Version 16 Final/VRM Graph/VRM Cluster 9'

# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# Iterate over each CSV file in the folder
for filename in os.listdir(csv_folder):
    if filename.endswith('.csv'):
        file_path = os.path.join(csv_folder, filename)

        # Read the CSV file
        data = pd.read_csv(file_path)

        # Extract the relevant columns
        if {'VRM'}.issubset(data.columns):
            plt.figure(figsize=(10, 6))

            # Plot each voltage series
            plt.plot(data.index, data['VRM'], label='VRM (Red Phase Voltage)', color='red')
            # plt.plot(data.index, data['VYM'], label='VYM (Yellow Phase Voltage)', color='yellow')
            # plt.plot(data.index, data['VBM'], label='VBM (Blue Phase Voltage)', color='blue')

            # Adding labels and title
            plt.xlabel('Time (Index)')
            plt.ylabel('Voltage')
            #plt.title(f'Voltage Measurements Over Time - {filename}')
            #plt.legend()
            plt.grid(True)

            # Save the plot to the output folder
            plot_filename = os.path.splitext(filename)[0] + '_voltage_plot.png'
            plt.savefig(os.path.join(output_folder, plot_filename))
            plt.close()

        else:
            print(f"Columns VYM, VBM, VRM not found in {filename}")

print("Graphs have been generated and stored in the output folder.")

Graphs have been generated and stored in the output folder.
