In [None]:
import os
import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Flatten
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import Sequence
from keras.callbacks import ModelCheckpoint


In [None]:
# Load a single CSV file; no additional preprocessing is applied at this stage
def preprocess_csv(filename):
    """Read the CSV file at ``filename`` and return it as a DataFrame.

    Args:
        filename: Path (or any other input accepted by ``pd.read_csv``) of
            the CSV file to load.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` with its
        default settings.
    """
    return pd.read_csv(filename)

# Directory containing the CSV files (each holds just time and V)
csv_directory = "/home/arutkeerthi/SolarProject/HV_NN/csv_1"

# Get a list of all CSV files in the directory.
# os.listdir() returns entries in arbitrary order, so the list is sorted:
# without a stable order the seeded split below would put different files in
# train/test from one run or machine to the next.
csv_files = sorted(
    os.path.join(csv_directory, filename)
    for filename in os.listdir(csv_directory)
    if filename.endswith(".csv")
)

# Extract labels from the trailing "_<label>" of each file name:
# 0 = CME event not present, 1 = partial halo present.
# Only the base name is parsed, so underscores in the directory part of the
# path (e.g. "csv_1") can never end up inside the label text.
labels = [
    int(os.path.basename(file_path).split("_")[-1].split(".")[0])
    for file_path in csv_files
]

# Split filenames and labels for training and testing (80% / 20%, fixed seed)
X_train_files, X_test_files, y_train, y_test = train_test_split(
    csv_files, labels, test_size=0.2, random_state=42
)

In [None]:
# Standardize velocity data (mean=0, std=1)
# NOTE(review): this module-level scaler is not referenced by any other code
# visible in this part of the notebook; DataGenerator creates its own
# StandardScaler in __init__. Confirm whether this instance is still needed.
scaler = StandardScaler()

class DataGenerator(Sequence):
    """Keras ``Sequence`` that builds batches of velocity series from CSV files.

    Batches are produced on demand. For every file in a batch the CSV is read
    with ``preprocess_csv``, its ``'V'`` column is standardized on its own
    (the scaler is re-fitted for each file, so every series is scaled with
    its own mean and standard deviation), and the series are brought to a
    common length with trailing zero padding.

    Args:
        files: Paths of the CSV files, one per sample.
        labels: Labels aligned with ``files`` (same length, same order).
        batch_size: Maximum number of samples per batch; the final batch may
            be smaller.
        seq_length: Number of time steps every sample is padded/truncated to.
            When ``None``, each batch is padded to the length of its own
            longest series.

    Raises:
        ValueError: If ``files`` and ``labels`` differ in length, or if
            ``batch_size`` is not positive.
    """

    def __init__(self, files, labels, batch_size=32, seq_length=None):
        # Newer Keras releases expect Sequence subclasses to run the base
        # initializer; calling it without arguments is also safe on older ones.
        super().__init__()
        if len(files) != len(labels):
            raise ValueError(
                f"files and labels must have the same length, "
                f"got {len(files)} and {len(labels)}"
            )
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")
        self.files = files
        self.labels = labels
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.scaler = StandardScaler()

    def __len__(self):
        """Return the number of batches per epoch (last one may be partial)."""
        return int(np.ceil(len(self.files) / self.batch_size))

    def __getitem__(self, index):
        """Load, standardize and pad the batch at position ``index``.

        Returns:
            A tuple ``(batch_data, batch_labels)`` where ``batch_data`` is a
            float32 array of shape ``(batch, steps, 1)`` and ``batch_labels``
            is an array with one label per sample.

        Raises:
            IndexError: If ``index`` is outside ``[0, len(self))``.
        """
        # Fail loudly instead of silently returning an empty batch.
        if not 0 <= index < len(self):
            raise IndexError(
                f"batch index {index} out of range for {len(self)} batches"
            )

        start = index * self.batch_size
        stop = start + self.batch_size
        batch_files = self.files[start:stop]
        batch_labels = self.labels[start:stop]

        batch_data = []
        for file_path in batch_files:
            data = preprocess_csv(file_path)
            # fit_transform re-fits the scaler on this file only, so each
            # series is standardized independently; result has shape (n, 1).
            velocities = self.scaler.fit_transform(data['V'].values.reshape(-1, 1))
            batch_data.append(velocities)

        # Without a fixed length, fall back to the longest series in the batch
        # so series of different lengths can still be stacked into one array.
        target_length = self.seq_length
        if target_length is None:
            target_length = max(len(series) for series in batch_data)

        # Zeros are appended after the data ('post'); series longer than
        # target_length are cut using pad_sequences' default truncation mode.
        padded = pad_sequences(
            batch_data,
            dtype='float32',
            maxlen=target_length,
            padding='post',
            value=0.0,
        )
        return padded, np.array(batch_labels)


In [None]:
# Fixed number of time steps fed to the LSTM for every sample, so each series
# has the same length regardless of when the event occurs within it.
seq_length = 100  # roughly 100 frames per video

# Build the training and test generators; both use the same batch size and
# the same fixed sequence length.
train_generator = DataGenerator(
    X_train_files,
    y_train,
    batch_size=32,
    seq_length=seq_length,
)
test_generator = DataGenerator(
    X_test_files,
    y_test,
    batch_size=32,
    seq_length=seq_length,
)


# ModelCheckpoint callback: write the model to 'best_model.h5' only when the
# monitored validation accuracy reaches a new maximum.
model_checkpoint = ModelCheckpoint(
    'best_model.h5',
    save_best_only=True,
    monitor='val_accuracy',
    mode='max',
    verbose=1,
)

# Build the LSTM model: two stacked LSTM layers that both return the full
# sequence, flattened into a small dense head with a single sigmoid output
# for the binary label. L2 regularization is applied to the kernels of the
# LSTM layers and of the hidden dense layer.
model = Sequential()
model.add(LSTM(64, input_shape=(seq_length, 1), return_sequences=True, kernel_regularizer=keras.regularizers.l2(0.01)))
model.add(LSTM(64, return_sequences=True, kernel_regularizer=keras.regularizers.l2(0.01)))
model.add(Flatten())
model.add(Dense(32, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)))
model.add(Dense(1, activation='sigmoid'))

# Compile the model.
# The step size is passed as `learning_rate`: the legacy `lr` alias is
# deprecated and no longer accepted by recent Keras releases.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

# Train the model from the data generators, saving the best model through the
# checkpoint callback. No early-stopping callback is registered, so all
# epochs are always run.
# Model.fit accepts Sequence objects directly; fit_generator is deprecated
# and has been removed from recent Keras releases.
# NOTE(review): the test generator doubles as validation data here, so the
# checkpoint chosen on val_accuracy has been selected on the test set and any
# test metric reported afterwards is optimistic. Consider a separate
# validation split.
history = model.fit(
    train_generator,
    validation_data=test_generator,
    epochs=100,  # You can adjust the number of epochs
    steps_per_epoch=len(train_generator),
    validation_steps=len(test_generator),
    callbacks=[model_checkpoint],
)

# Load the best model (as saved by the checkpoint callback) from disk
best_model = keras.models.load_model('best_model.h5')

# Evaluate the best model on training and test data.
# Model.evaluate accepts Sequence objects directly; evaluate_generator is
# deprecated and has been removed from recent Keras releases. verbose=0 keeps
# the evaluation silent, as evaluate_generator was by default.
train_loss, train_accuracy = best_model.evaluate(
    train_generator, steps=len(train_generator), verbose=0
)
test_loss, test_accuracy = best_model.evaluate(
    test_generator, steps=len(test_generator), verbose=0
)

# Print the statistics
print("Best Model Statistics:")
print(f"Training Loss: {train_loss:.4f}, Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy * 100:.2f}%")

# Write the loaded model back to the same path it was loaded from.
# The saved model can later be applied to other velocity time series, provided
# the input is standardized and padded the same way as in the data generator.
best_model.save('best_model.h5')