In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

def predict_celestial_body_name(csv_file_path):
    """
    Train and evaluate a classifier that predicts a celestial body's 'name'.

    The CSV is loaded with pandas, rows lacking any of the six position /
    velocity feature columns or the 'name' column are dropped, the names are
    label-encoded, and a RandomForestClassifier is fitted on a stratified
    80/20 train/test split. Accuracy and a per-class classification report
    for the held-out 20% are printed.

    Load errors, missing columns, an empty dataset, an impossible split and
    training errors are reported with ``print`` and cause an early return
    instead of propagating an exception.

    Args:
        csv_file_path (str): The path to the CSV file containing the data.

    Returns:
        None. All results and error messages are written to stdout.
    """
    try:
        # Load the dataset
        df = pd.read_csv(csv_file_path)
    except FileNotFoundError:
        print(f"Error: The file '{csv_file_path}' was not found.")
        print("Please ensure the CSV file is in the same directory as the script or provide the full path.")
        return
    except Exception as e:
        # Script-level boundary: any other load failure (parse error,
        # permissions, encoding, ...) is reported rather than raised.
        print(f"Error loading CSV file: {e}")
        return

    # Define features (X) and target (y)
    feature_columns = ['x_au', 'y_au', 'z_au', 'vx_au_per_day', 'vy_au_per_day', 'vz_au_per_day']
    target_column = 'name'

    # Check if all required columns are present
    required_columns = feature_columns + [target_column]
    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        print(f"Error: The following required columns are missing from the CSV: {', '.join(missing_cols)}")
        return

    # Drop rows with missing values in features or target.
    # This is a simple way to handle missing data; more sophisticated methods exist.
    df = df.dropna(subset=required_columns)

    if df.empty:
        print("Error: No data remaining after dropping rows with missing values. Cannot proceed.")
        return

    X = df[feature_columns]
    # Cast to str so that numeric or mixed-type names can be encoded and can
    # later be used as report row labels (which must be strings).
    y = df[target_column].astype(str)

    # Encode the target variable 'name' as it's categorical
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # Split data into training and testing sets.
    # test_size=0.2 means 20% of data is used for testing;
    # random_state ensures reproducibility of the split.
    # A stratified split raises ValueError when a class has fewer than two
    # rows, or when either side of the split would be empty or smaller than
    # the number of classes, so the failure is reported here instead of
    # escaping as a traceback.
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
        )
    except ValueError as e:
        print("Error: Not enough data to split into training and testing sets after preprocessing.")
        print(f"Rows available: {len(df)}, distinct names: {len(label_encoder.classes_)}")
        print("This might be due to a very small dataset or too many missing values.")
        print(f"Details: {e}")
        return

    # Initialize and train the Random Forest Classifier
    # n_estimators is the number of trees in the forest
    # random_state ensures reproducibility of the model training
    model = RandomForestClassifier(n_estimators=100, random_state=42)

    print("\nTraining the model...")
    try:
        model.fit(X_train, y_train)
    except Exception as e:
        print(f"Error during model training: {e}")
        return
    print("Model training complete.")

    # Make predictions on the test set
    print("\nMaking predictions on the test set...")
    y_pred = model.predict(X_test)

    # Evaluate the model.
    # Passing the full list of encoded labels keeps the report aligned with
    # target_names even if some class is absent from both y_test and y_pred;
    # without it classification_report rejects the length mismatch.
    accuracy = accuracy_score(y_test, y_pred)
    class_indices = list(range(len(label_encoder.classes_)))
    report = classification_report(
        y_test,
        y_pred,
        labels=class_indices,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    print("\nModel Evaluation:")
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(report)

    # To predict on new data (example; `model` and `label_encoder` are local,
    # so this must run inside this function):
    # new_data_point = pd.DataFrame([{
    #     'x_au': 0.0, 'y_au': 1.0, 'z_au': 0.0,
    #     'vx_au_per_day': -0.01, 'vy_au_per_day': 0.0, 'vz_au_per_day': 0.0
    # }])
    # prediction_encoded = model.predict(new_data_point)
    # prediction_name = label_encoder.inverse_transform(prediction_encoded)
    # print(f"\nExample prediction for new data: {prediction_name[0]}")

if __name__ == "__main__":
    # Script entry point: train and evaluate on the bundled dataset.
    # The path is relative to the current working directory; point it at
    # your own CSV if the file lives somewhere else.
    csv_file = "./dataset/solar_system_positions_with_velocity.csv"
    predict_celestial_body_name(csv_file_path=csv_file)



Training the model...
Model training complete.

Making predictions on the test set...

Model Evaluation:
Accuracy: 0.6280

Classification Report:
                      precision    recall  f1-score   support

1 MERCURY BARYCENTER       0.08      0.08      0.08       365
              10 SUN       1.00      1.00      1.00       365
         199 MERCURY       0.12      0.12      0.12       365
  2 VENUS BARYCENTER       0.10      0.09      0.09       366
           299 VENUS       0.11      0.12      0.11       365
  3 EARTH BARYCENTER       0.88      1.00      0.93       365
            301 MOON       1.00      1.00      1.00       366
           399 EARTH       1.00      0.86      0.92       366
   4 MARS BARYCENTER       1.00      1.00      1.00       365
5 JUPITER BARYCENTER       1.00      1.00      1.00       365
 6 SATURN BARYCENTER       0.45      0.48      0.46       366
           601 MIMAS       0.61      0.43      0.50       365
       602 ENCELADUS       0.94      0.94     