In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

def predict_celestial_body_name_lstm(csv_file_path):
    """
    Trains an LSTM model to predict the 'name' of celestial bodies from their
    positional and velocity data.

    Args:
        csv_file_path (str): The path to the CSV file containing the data.
    """
    try:
        # Load the dataset
        df = pd.read_csv(csv_file_path)
    except FileNotFoundError:
        print(f"Error: The file '{csv_file_path}' was not found.")
        print("Please ensure the CSV file is in the same directory as the script or provide the full path.")
        return
    except Exception as e:
        print(f"Error loading CSV file: {e}")
        return

    # Define features (X) and target (y)
    feature_columns = ['x_au', 'y_au', 'z_au', 'vx_au_per_day', 'vy_au_per_day', 'vz_au_per_day']
    target_column = 'name'

    # Check if all required columns are present
    required_columns = feature_columns + [target_column]
    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        print(f"Error: The following required columns are missing from the CSV: {', '.join(missing_cols)}")
        return

    # Drop rows with missing values in features or target
    df.dropna(subset=required_columns, inplace=True)

    if df.empty:
        print("Error: No data remaining after dropping rows with missing values. Cannot proceed.")
        return
        
    X = df[feature_columns]
    y = df[target_column]

    # Encode the target variable 'name' as it's categorical
    label_encoder = LabelEncoder()
    y_integer_encoded = label_encoder.fit_transform(y)
    num_classes = len(label_encoder.classes_)
    
    # One-hot encode the integer encoded target variable for Keras
    y_one_hot_encoded = to_categorical(y_integer_encoded, num_classes=num_classes)

    # Scale the features (important for neural networks)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Reshape X for LSTM: (samples, timesteps, features)
    # Here, we treat each observation as a sequence of 1 timestep.
    X_scaled_reshaped = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))

    # Split data into training and testing sets
    # Stratify by y_integer_encoded to ensure balanced classes in splits
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled_reshaped, y_one_hot_encoded, test_size=0.2, random_state=42, stratify=y_integer_encoded
    )
    
    # Also keep a copy of y_test in integer encoded format for classification_report
    _, _, y_train_int, y_test_int = train_test_split(
        X_scaled_reshaped, y_integer_encoded, test_size=0.2, random_state=42, stratify=y_integer_encoded
    )


    if len(X_train) == 0 or len(X_test) == 0:
        print("Error: Not enough data to split into training and testing sets after preprocessing.")
        print(f"Original data size: {len(df)}, Training data size: {len(X_train)}, Test data size: {len(X_test)}")
        return

    # Define the LSTM model
    model = Sequential()
    model.add(LSTM(units=64, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=False)) # units can be tuned
    # model.add(LSTM(units=64, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True)) # If adding more LSTM layers
    # model.add(LSTM(units=32))
    model.add(Dropout(0.3)) # Dropout for regularization
    model.add(Dense(units=32, activation='relu')) # Hidden dense layer
    model.add(Dropout(0.3))
    model.add(Dense(num_classes, activation='softmax')) # Output layer

    # Compile the model
    # Optimizer, loss function, and metrics can be tuned
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    print("\nLSTM Model Summary:")
    model.summary()

    print("\nTraining the LSTM model...")
    try:
        # Train the model
        # Epochs and batch_size can be tuned
        history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1, verbose=1)
    except Exception as e:
        print(f"Error during model training: {e}")
        return
    print("Model training complete.")

    # Evaluate the model on the test set
    print("\nEvaluating the model on the test set...")
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Loss: {loss:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")

    # Make predictions on the test set
    print("\nMaking predictions on the test set...")
    y_pred_probabilities = model.predict(X_test)
    y_pred_encoded = np.argmax(y_pred_probabilities, axis=1) # Convert probabilities to class labels

    # Generate classification report
    # y_test_int contains the original integer labels for the test set
    report = classification_report(y_test_int, y_pred_encoded, target_names=label_encoder.classes_, zero_division=0)

    print("\nClassification Report:")
    print(report)

    # Example of how to predict on new, unseen data:
    # Create a dummy new data point (ensure it has the same features as training data)
    # new_data_point_df = pd.DataFrame([{
    #     'x_au': 0.0, 'y_au': 1.0, 'z_au': 0.0,
    #     'vx_au_per_day': -0.01, 'vy_au_per_day': 0.0, 'vz_au_per_day': 0.0
    # }])
    # # Scale the new data using the SAME scaler fitted on the training data
    # new_data_scaled = scaler.transform(new_data_point_df[feature_columns])
    # # Reshape for LSTM
    # new_data_reshaped = new_data_scaled.reshape((new_data_scaled.shape[0], 1, new_data_scaled.shape[1]))
    # # Predict
    # prediction_probabilities = model.predict(new_data_reshaped)
    # predicted_class_encoded = np.argmax(prediction_probabilities, axis=1)
    # # Convert encoded prediction back to original name
    # predicted_name = label_encoder.inverse_transform(predicted_class_encoded)
    # print(f"\nExample prediction for new data: {predicted_name[0]}")


if __name__ == "__main__":
    # IMPORTANT: Replace this with the actual path to your CSV file
    # if it's not in the same directory as the script.
    csv_file = './dataset/solar_system_positions_with_velocity.csv' 
    # Ensure you have TensorFlow installed: pip install tensorflow
    predict_celestial_body_name_lstm(csv_file)



LSTM Model Summary:


  super().__init__(**kwargs)



Training the LSTM model...
Epoch 1/50
[1m1439/1439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.1389 - loss: 2.7986 - val_accuracy: 0.3092 - val_loss: 1.7489
Epoch 2/50
[1m1439/1439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.2909 - loss: 1.8102 - val_accuracy: 0.3870 - val_loss: 1.4656
Epoch 3/50
[1m1439/1439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.3417 - loss: 1.5786 - val_accuracy: 0.4562 - val_loss: 1.3127
Epoch 4/50
[1m1439/1439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.3668 - loss: 1.4752 - val_accuracy: 0.4461 - val_loss: 1.2465
Epoch 5/50
[1m1439/1439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.3810 - loss: 1.4107 - val_accuracy: 0.4879 - val_loss: 1.1900
Epoch 6/50
[1m1439/1439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.3901 - loss: 1.3811 - val_accuracy: 0.4861 - val_l