# Retrieve data

In [1]:
try:
    from google.colab import drive
    is_running_on_colab = True
except ImportError:
    is_running_on_colab = False

if is_running_on_colab:
    # Mount Google Drive
    drive.mount('/content/drive')

In [2]:
import time
start_notebook = time.time()

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

KeyboardInterrupt: 

In [4]:
# Option to generate mock data
generate_mock_data = False # Set to True to generate mock data
max_mock_depth_ft = 2000 # Define maximum depth for mock data (if used)

In [6]:
# well_name = "LLB-10" # Original well name, no longer used with the new dataset
if not generate_mock_data:
  if is_running_on_colab:
    # Load data from Google Drive, if running on Google Colab
    # IMPORTANT: Update this path if solar_system_positions_with_velocity.csv is in a different location
    colab_repo_dir = "/content/drive/MyDrive/riset-fttm-gdrive/cuml-tf-model-hydrocarbon-prediction"
    data_path = f"{colab_repo_dir}/data/solar_system_positions_with_velocity.csv"
    # If the file is uploaded directly to Colab session, use its direct path e.g., "/content/solar_system_positions_with_velocity.csv"
    # data_path = "/content/solar_system_positions_with_velocity.csv" 
    print(f"Loading data from: {data_path}")
    data = pd.read_csv(data_path, sep=',')
  else:
    # Load data from local directory
    # IMPORTANT: Ensure solar_system_positions_with_velocity.csv is in this path or update it
    local_data_path = "./dataset/solar_system_positions_with_velocity.csv"
    print(f"Loading data from: {local_data_path}")
    data = pd.read_csv(local_data_path, sep=',')
  print("Data loaded successfully. Shape:", data.shape)
  print("Data head:\n", data.head())

Loading data from: ./dataset/solar_system_positions_with_velocity.csv
Data loaded successfully. Shape: (63945, 9)
Data head:
          date                  name  naif_id      x_au      y_au      z_au  \
0  2020-01-01  1 MERCURY BARYCENTER        1 -0.063377 -0.460841 -0.031843   
1  2020-01-02  1 MERCURY BARYCENTER        1 -0.041067 -0.462569 -0.034031   
2  2020-01-03  1 MERCURY BARYCENTER        1 -0.018637 -0.462941 -0.036119   
3  2020-01-04  1 MERCURY BARYCENTER        1  0.003848 -0.461948 -0.038101   
4  2020-01-05  1 MERCURY BARYCENTER        1  0.026321 -0.459583 -0.039969   

   vx_au_per_day  vy_au_per_day  vz_au_per_day  
0       0.022228      -0.002402      -0.002235  
1       0.022381      -0.001052      -0.002139  
2       0.022468       0.000309      -0.002036  
3       0.022490       0.001678      -0.001926  
4       0.022445       0.003053      -0.001810  


In [7]:
if generate_mock_data:
    # This section is for generating mock data if the primary data source is not used.
    # It's retained for flexibility but won't run if generate_mock_data is False.
    print(f"Generating mock data up to {max_mock_depth_ft} ft...") # Adjusted print message
    mock_depth_step = 0.5
    mock_y_values = np.arange(0, max_mock_depth_ft, mock_depth_step) # Changed DEPT to y_values for clarity
    num_mock_rows = len(mock_y_values)

    mock_data_dict = {'Y_AXIS_MOCK': mock_y_values} # Using a generic y-axis name

    # Using generic feature names for mock data, as original features are not relevant
    feature_cols_for_mock = [f'MockFeature_{i}' for i in range(8)] 

    for col in feature_cols_for_mock:
        mock_data_dict[col] = np.random.rand(num_mock_rows) * 100

    # Assuming a binary classification task for mock data, as in the original notebook
    mock_data_dict['mock_target_class'] = np.random.randint(0, 2, num_mock_rows)

    data = pd.DataFrame(mock_data_dict)

    print(f"Mock data generated with {num_mock_rows} rows and columns: {list(data.columns)}.")
    print("Mock data head:")
    print(data.head())

In [8]:
# df=data[['CALI','DRHO','GR','MR','NPHI_corr','PEF','RHOB_CORR','ROP']] # Original features
# TODO: Update feature_cols based on the columns in solar_system_positions_with_velocity.csv
# Assuming columns like 'x', 'y', 'z', 'vx', 'vy', 'vz' are relevant numerical features.
# Please inspect your CSV and adjust these column names accordingly.
# For example, if your CSV has these columns:
feature_cols = ['x', 'y', 'z', 'vx', 'vy', 'vz'] # Example feature columns

# Ensure these columns exist in your 'data' DataFrame and are numeric.
# Handle non-numeric columns (e.g., 'Object', 'Date', 'Time') appropriately if they are part of feature_cols
# For instance, 'Object' might need encoding if used as a feature.
# 'Date' and 'Time' might need conversion to numerical representations (e.g., timestamps, cyclical features).

# Drop rows with NaN in feature_cols to avoid issues with scalers/models
if not generate_mock_data: # Only apply if not using mock data
    print(f"Original data shape before dropping NaNs from features: {data.shape}")
    data.dropna(subset=feature_cols, inplace=True)
    print(f"Data shape after dropping NaNs from features: {data.shape}")
    df = data[feature_cols].copy() # Use .copy() to avoid SettingWithCopyWarning
else: # If using mock data, df is based on mock_data_dict feature_cols_for_mock
    df = data[[col for col in data.columns if col not in ['Y_AXIS_MOCK', 'mock_target_class']]].copy()

print("Selected features (df) head:\n", df.head())

Original data shape before dropping NaNs from features: (63945, 9)


KeyError: ['x', 'y', 'z', 'vx', 'vy', 'vz']

# Data Preparation

## Train/Test Splitting

In [None]:
# Misalkan 'data' adalah DataFrame Anda dan 'df' adalah fitur yang telah Anda ekstrak
X = df  # Fitur
# y = data['hydrocarbon_formation_class']  # Original Label

# TODO: Define your target variable 'y' based on the new dataset.
# For example, if 'Object' is a categorical target in your solar_system_positions_with_velocity.csv:
# from sklearn.preprocessing import LabelEncoder
# label_encoder = LabelEncoder()
# y = pd.Series(label_encoder.fit_transform(data['Object']), index=X.index) # Ensure index alignment with X
# Or if you have another target column:
# y = data['your_target_column_name'].loc[X.index] # Ensure index alignment

# For now, y, y_train, y_test will be undefined or based on mock data if generate_mock_data is True.
# Model training and evaluation cells below will need to be adapted once 'y' is defined for the solar system data.
if generate_mock_data:
    y = data['mock_target_class']
else:
    y = None # Placeholder, user needs to define this for the solar system data

if y is not None:
    # Ensure X and y have the same index before splitting
    common_index = X.index.intersection(y.index)
    X = X.loc[common_index]
    y = y.loc[common_index]
    
    # Split data menjadi training dan testing set
    # Removed stratify=y as y might not be suitable for stratification or is undefined for the new data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
else:
    # If y is not defined, split only X for unsupervised tasks or if y will be defined later
    X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)
    y_train, y_test = None, None # Explicitly set to None
    print("Target variable 'y' is not defined for the solar system data. y_train and y_test are None.")
    print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
    print("Model training and evaluation cells requiring 'y' will need adaptation.")

In [None]:
if X_train is not None:
    print("X_train description:")
    print(X_train.describe())
else:
    print("X_train is None.")

## Data Imbalance Handling (SMOTE - Commented out as target 'y' is not defined for solar system data)

In [None]:
from imblearn.over_sampling import SVMSMOTE

In [None]:
# This cell is commented out because SMOTE requires a defined 'y_train' for classification.
# If you define 'y_train' for a classification task with the solar system data and it's imbalanced,
# you can uncomment and adapt this cell.

# print("SMOTE section (commented out):")
# if y_train is not None and X_train is not None and not y_train.empty:
#     # Initialize SVMSMOTE
#     smote = SVMSMOTE(random_state=42)
# 
#     # Apply SMOTE to the training data
#     print("Original training data shape:", X_train.shape, y_train.shape)
#     print("Original training class distribution:\n", y_train.value_counts())
# 
#     try:
#         X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
# 
#         print("\nShape of training data after SMOTE:", X_train_smote.shape, y_train_smote.shape)
#         print("Class distribution after SMOTE:\n", y_train_smote.value_counts())
# 
#         # Update X_train and y_train to be the oversampled versions
#         X_train = X_train_smote
#         y_train = y_train_smote
#     except ValueError as e:
#         print(f"\nSMOTE could not be applied: {e}")
#         print("This might happen if a class has too few samples for SVMSMOTE.")
#         print("Original X_train and y_train will be used.")
# else:
#     print("\nSMOTE was not applied as y_train or X_train is None or y_train is empty.")
#     print("Original X_train and y_train (if defined) will be used.")
print("SMOTE is commented out as 'y_train' is likely undefined for the solar system data or task is not classification.")

## Apply Quantile Transformation

In [None]:
from sklearn.preprocessing import QuantileTransformer
def transform_quantile(X_train_in, X_test_in, X_in):
    qt_transformer = QuantileTransformer(output_distribution='normal', random_state=42) # Added random_state for reproducibility
    # Ensure inputs are not None before proceeding
    if X_train_in is None or X_test_in is None or X_in is None:
        print("Quantile transformation skipped as one or more input DataFrames (X_train, X_test, X) are None.")
        return X_train_in, X_test_in, X_in
        
    # Ensure inputs are not empty
    if X_train_in.empty or X_test_in.empty or X_in.empty:
        print("Quantile transformation skipped as one or more input DataFrames are empty.")
        return X_train_in, X_test_in, X_in

    X_train_qt = pd.DataFrame(qt_transformer.fit_transform(X_train_in), columns=X_train_in.columns, index=X_train_in.index)
    X_test_qt = pd.DataFrame(qt_transformer.transform(X_test_in), columns=X_test_in.columns, index=X_test_in.index)
    X_qt = pd.DataFrame(qt_transformer.transform(X_in), columns=X_in.columns, index=X_in.index) # Transform the whole X for consistency if needed elsewhere
    
    return X_train_qt, X_test_qt, X_qt

In [None]:
if X_train is not None and X_test is not None and X is not None:
    X_train, X_test, X = transform_quantile(X_train, X_test, X)
    print("Quantile transformation applied.")
else:
    print("Skipping quantile transformation as X_train, X_test, or X is None.")

## Feature Scaling

karena menggunakan Quatile transformation dengan output gaussian, masing masing kolom secara otomatis ditransformasi ke distribusi normal baku, atau distribusi normal dengan rataan nol dan standar deviasi 1, oleh karena itu tidak diperlukan tambahan scaling.

In [None]:
if X_train is not None:
    print("X_train description after quantile transformation:")
    print(X_train.describe())
else:
    print("X_train is None, cannot describe.")

# Training setup

In [None]:
import time
import numpy as np
from sklearn.model_selection import GridSearchCV


sk_train_accuracy={}
sk_test_accuracy={}

cu_train_accuracy={}
cu_test_accuracy={}

sk_crossValidation_accuracy={}
cu_crossValidation_accuracy={}

sk_models = {} #sklearn models
cu_models = {} #cuml models

sk_times = {}
cu_times = {}

sk_pred = {}
cu_pred = {}

sk_pred_times = {}
cu_pred_times = {}

In [None]:
# test CuML availability & is working
try:
    import cuml
    # Check if X_train is not None and not empty before creating dummy data for KMeans
    if X_train is not None and not X_train.empty:
        # Create minimal dummy data based on X_train's structure if possible, or generic
        if X_train.shape[1] >= 2:
            dummy_data_np = X_train.iloc[:4, :2].to_numpy() # Use a small slice of actual data
        else: # Fallback if X_train has fewer than 2 columns
            dummy_data_np = np.array([[1, 2], [1.5, 1.8], [5, 8], [8, 8]], dtype=np.float32)
        kmeans = cuml.KMeans(n_clusters=2, random_state=42)
        kmeans.fit(dummy_data_np)
        has_cuml = True
        print("cuML is found and working")
    else:
        has_cuml = False
        print("X_train is None or empty, cannot perform cuML KMeans test. Assuming cuML is not fully usable.")
except ImportError:
    has_cuml = False
    print("cuML not found. Please ensure cuML is installed.")
except Exception as e:
    has_cuml = False
    print(f"cuML couldn't be initialized or used. Error: {e}")

# Models

## SVM

In [None]:
model_name = "SVM"

In [None]:
from sklearn.svm import SVC as SklearnSVC
# Attempt to import cuML's SVC
try:
    from cuml.svm import SVC as cuMLSVC
    cuml_svc_available = True
except ImportError:
    cuml_svc_available = False
    print("cuML SVC not available. Please ensure cuML is installed and compatible with your environment.")

In [None]:
# Parameter grid for both models
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'random_state': [42] # Added for reproducibility
}

# 1) scikit-learn SVM with GridSearchCV
sk_models[model_name] = GridSearchCV(
    estimator=SklearnSVC(kernel='rbf', probability=True), # probability=True for predict_proba if needed later
    param_grid=param_grid,
    cv=5,
    verbose=1, # Reduced verbosity for cleaner output
    n_jobs=-1
)

time_start_sk = time.time()
if X_train is not None and y_train is not None:
    print(f"Fitting scikit-learn {model_name}...")
    sk_models[model_name].fit(X_train, y_train)
    sk_times[model_name] = time.time() - time_start_sk
    print(f"scikit-learn GridSearchCV training time ({model_name}) : {sk_times[model_name]:.2f} seconds")
    print(f"scikit-learn Best parameters ({model_name}): {sk_models[model_name].best_params_}")
else:
    sk_times[model_name] = 0
    print(f"Skipping scikit-learn {model_name} fitting as X_train or y_train is not defined.")

# 2) cuML SVM with the same GridSearchCV
if has_cuml and cuml_svc_available:
    cu_models[model_name] = GridSearchCV(
        estimator=cuMLSVC(kernel='rbf', probability=True, random_state=42),
        param_grid=param_grid,
        cv=5,
        verbose=1,
        n_jobs=1 
    )

    time_start_cu = time.time()
    if X_train is not None and y_train is not None:
        print(f"Fitting cuML {model_name}...")
        cu_models[model_name].fit(X_train, y_train)
        cu_times[model_name] = time.time() - time_start_cu
        print(f"cuml GridSearchCV training time ({model_name}) : {cu_times[model_name]:.2f} seconds")
        print(f"cuml Best parameters ({model_name}): {cu_models[model_name].best_params_}")
    else:
        cu_times[model_name] = 0
        print(f"Skipping cuML {model_name} fitting as X_train or y_train is not defined.")
elif has_cuml and not cuml_svc_available:
    print(f"cuML is available, but cuML {model_name} (SVC) is not. Skipping cuML {model_name} benchmark.")
else:
    print(f"cuML is not installed or GPU not available. Skipping cuML {model_name} benchmark.")

if X_train is None or y_train is None: # Print this only if fitting was skipped for sklearn
   print(f"scikit-learn GridSearchCV training time ({model_name}) : N/A (X_train or y_train not defined)")
   print(f"scikit-learn Best parameters ({model_name}): N/A (X_train or y_train not defined)")

## K-Nearest Neighbors

In [None]:
model_name = "KNN"

In [None]:
from sklearn.neighbors import KNeighborsClassifier as SklearnKNeighborsClassifier
# Attempt to import cuML's KNeighborsClassifier
try:
    from cuml.neighbors import KNeighborsClassifier as cuMLKNeighborsClassifier
    cuml_knn_available = True
except ImportError:
    cuml_knn_available = False
    print("cuML KNeighborsClassifier not available.")

In [None]:
# Parameter grid for KNN
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# 1) scikit-learn KNN with GridSearchCV
sk_models[model_name] = GridSearchCV(
    estimator=SklearnKNeighborsClassifier(),
    param_grid=param_grid_knn,
    cv=5,
    verbose=1,
    n_jobs=-1
)

time_start_sk = time.time()
if X_train is not None and y_train is not None:
    print(f"Fitting scikit-learn {model_name}...")
    sk_models[model_name].fit(X_train, y_train)
    sk_times[model_name] = time.time() - time_start_sk
    print(f"scikit-learn GridSearchCV training time ({model_name}) : {sk_times[model_name]:.2f} seconds")
    print(f"scikit-learn Best parameters ({model_name}): {sk_models[model_name].best_params_}")
else:
    sk_times[model_name] = 0
    print(f"Skipping scikit-learn {model_name} fitting as X_train or y_train is not defined.")

# 2) cuML KNN with the same GridSearchCV (adapted for cuML)
if has_cuml and cuml_knn_available:
    cu_models[model_name] = GridSearchCV(
        estimator=cuMLKNeighborsClassifier(), 
        param_grid=param_grid_knn, 
        cv=5,
        verbose=1,
        n_jobs=1 
    )
    time_start_cu = time.time()
    if X_train is not None and y_train is not None:
        print(f"Fitting cuML {model_name}...")
        cu_models[model_name].fit(X_train, y_train)
        cu_times[model_name] = time.time() - time_start_cu
        print(f"cuml GridSearchCV training time ({model_name}) : {cu_times[model_name]:.2f} seconds")
        print(f"cuml Best parameters ({model_name}): {cu_models[model_name].best_params_}")
    else:
        cu_times[model_name] = 0
        print(f"Skipping cuML {model_name} fitting as X_train or y_train is not defined.")
elif has_cuml and not cuml_knn_available:
    print(f"cuML is available, but cuML {model_name} (KNeighborsClassifier) is not. Skipping cuML {model_name} benchmark.")
else:
    print(f"cuML is not installed or GPU not available. Skipping cuML {model_name} benchmark.")

if X_train is None or y_train is None:
   print(f"scikit-learn GridSearchCV training time ({model_name}) : N/A (X_train or y_train not defined)")
   print(f"scikit-learn Best parameters ({model_name}): N/A (X_train or y_train not defined)")

## Random Forest

In [None]:
model_name = "RF"

In [None]:
from sklearn.ensemble import RandomForestClassifier as SklearnRandomForestClassifier
# Attempt to import cuML's RandomForestClassifier
try:
    from cuml.ensemble import RandomForestClassifier as cuMLRandomForestClassifier
    cuml_rf_available = True
except ImportError:
    cuml_rf_available = False
    print("cuML RandomForestClassifier not available.")

In [None]:
# Parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200], 
    'max_depth': [None, 10, 20],    
    'min_samples_split': [2, 5], 
    'min_samples_leaf': [1, 2],   
    'random_state': [42]
}

# 1) scikit-learn RandomForest with GridSearchCV
sk_models[model_name] = GridSearchCV(
    estimator=SklearnRandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    cv=5, 
    verbose=1,
    n_jobs=-1
)

time_start_sk = time.time()
if X_train is not None and y_train is not None:
    print(f"Fitting scikit-learn {model_name}...")
    sk_models[model_name].fit(X_train, y_train)
    sk_times[model_name] = time.time() - time_start_sk
    print(f"scikit-learn GridSearchCV training time ({model_name}) : {sk_times[model_name]:.2f} seconds")
    print(f"scikit-learn Best parameters ({model_name}): {sk_models[model_name].best_params_}")
else:
    sk_times[model_name] = 0
    print(f"Skipping scikit-learn {model_name} fitting as X_train or y_train is not defined.")


# 2) cuML RandomForest with GridSearchCV (adapted for cuML)
if has_cuml and cuml_rf_available:
    # cuML RandomForestClassifier might have slightly different parameter names or accepted values.
    # n_estimators, max_depth, min_samples_split, min_samples_leaf, random_state are generally compatible.
    cu_models[model_name] = GridSearchCV(
        estimator=cuMLRandomForestClassifier(random_state=42), 
        param_grid=param_grid_rf, 
        cv=5, 
        verbose=1,
        n_jobs=1 
    )

    time_start_cu = time.time()
    if X_train is not None and y_train is not None:
        print(f"Fitting cuML {model_name}...")
        cu_models[model_name].fit(X_train, y_train) 
        cu_times[model_name] = time.time() - time_start_cu
        print(f"cuml GridSearchCV training time ({model_name}) : {cu_times[model_name]:.2f} seconds")
        print(f"cuml Best parameters ({model_name}): {cu_models[model_name].best_params_}")
    else:
        cu_times[model_name] = 0
        print(f"Skipping cuML {model_name} fitting as X_train or y_train is not defined.")
elif has_cuml and not cuml_rf_available:
    print(f"cuML is available, but cuML {model_name} (RandomForestClassifier) is not. Skipping cuML {model_name} benchmark.")
else:
    print(f"cuML is not installed or GPU not available. Skipping cuML {model_name} benchmark.")

if X_train is None or y_train is None:
   print(f"scikit-learn GridSearchCV training time ({model_name}) : N/A (X_train or y_train not defined)")
   print(f"scikit-learn Best parameters ({model_name}): N/A (X_train or y_train not defined)")


# Prediction & Model Evaluation 
# (Commented out/Conditional as target 'y' is not defined for solar system data)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# --- Comparison and Evaluation ---
model_names_fitted = list(sk_models.keys()) 

for model_name in model_names_fitted:
    print(f"--- Evaluating: {model_name} ---")

    # --- scikit-learn model ---
    print("\n-- scikit-learn --")
    if model_name in sk_models and hasattr(sk_models[model_name], 'best_estimator_') and y_train is not None and y_test is not None:
        best_sk_model = sk_models[model_name].best_estimator_
        # Predictions
        y_pred_train_sk = best_sk_model.predict(X_train)
        y_pred_test_sk = best_sk_model.predict(X_test)
        start_time_pred_sk = time.time()
        sk_pred[model_name] = best_sk_model.predict(X) # Predict on whole X for plotting
        sk_pred_times[model_name] = time.time() - start_time_pred_sk
        print(f"Prediction Duration: {sk_pred_times[model_name]:.2f} seconds")
        # Accuracy
        train_accuracy_sk = accuracy_score(y_train, y_pred_train_sk)
        test_accuracy_sk = accuracy_score(y_test, y_pred_test_sk)
        sk_train_accuracy[model_name] = train_accuracy_sk
        sk_test_accuracy[model_name] = test_accuracy_sk
        print(f"Train Accuracy: {train_accuracy_sk:.4f}")
        print(f"Test Accuracy: {test_accuracy_sk:.4f}")
        # Cross-validation score
        cv_score_sk = sk_models[model_name].best_score_
        sk_crossValidation_accuracy[model_name] = cv_score_sk
        print(f"Best Cross-Validation Score: {cv_score_sk:.4f}")
        # Duration
        print(f"Training Duration: {sk_times.get(model_name, 'N/A'):.2f} seconds")
        # Confusion Matrix
        print("Confusion Matrix (Test Set):")
        cm_sk = confusion_matrix(y_test, y_pred_test_sk)
        disp_sk = ConfusionMatrixDisplay(confusion_matrix=cm_sk, display_labels=best_sk_model.classes_)
        disp_sk.plot()
        plt.title(f"scikit-learn {model_name} - Confusion Matrix")
        plt.show()
    elif y_train is None or y_test is None:
        print(f"Skipping evaluation for scikit-learn {model_name} as y_train or y_test is not defined.")
        sk_pred[model_name] = None # Ensure key exists but is None
        sk_pred_times[model_name] = 0
    else:
        print(f"scikit-learn model {model_name} not trained or available for evaluation.")
        sk_pred[model_name] = None
        sk_pred_times[model_name] = 0

    # --- cuML model ---
    print("\n-- cuML --")
    if has_cuml and model_name in cu_models and hasattr(cu_models[model_name], 'best_estimator_') and y_train is not None and y_test is not None:
        best_cu_model = cu_models[model_name].best_estimator_
        # Predictions
        X_train_np = X_train.to_numpy() if hasattr(X_train, 'to_numpy') else X_train
        X_test_np = X_test.to_numpy() if hasattr(X_test, 'to_numpy') else X_test
        X_np = X.to_numpy() if hasattr(X, 'to_numpy') else X

        y_pred_train_cu_gpu = best_cu_model.predict(X_train_np)
        y_pred_test_cu_gpu = best_cu_model.predict(X_test_np)
        start_time_pred_cu = time.time()
        cu_pred_gpu = best_cu_model.predict(X_np)
        cu_pred_times[model_name] = time.time() - start_time_pred_cu
        print(f"Prediction Duration: {cu_pred_times[model_name]:.2f} seconds")

        y_pred_train_cu = y_pred_train_cu_gpu.get() if hasattr(y_pred_train_cu_gpu, 'get') else y_pred_train_cu_gpu
        y_pred_test_cu = y_pred_test_cu_gpu.get() if hasattr(y_pred_test_cu_gpu, 'get') else y_pred_test_cu_gpu
        cu_pred[model_name] = cu_pred_gpu.get() if hasattr(cu_pred_gpu, 'get') else cu_pred_gpu
        
        # Accuracy
        train_accuracy_cu = accuracy_score(y_train, y_pred_train_cu) 
        test_accuracy_cu = accuracy_score(y_test, y_pred_test_cu)   
        cu_train_accuracy[model_name] = train_accuracy_cu
        cu_test_accuracy[model_name] = test_accuracy_cu
        print(f"Train Accuracy: {train_accuracy_cu:.4f}")
        print(f"Test Accuracy: {test_accuracy_cu:.4f}")
        # Cross-validation score
        cv_score_cu = cu_models[model_name].best_score_
        # cu_crossValidation_accuracy[f"cu_{model_name}"] = cv_score_cu # Original had f-string key
        cu_crossValidation_accuracy[model_name] = cv_score_cu # Corrected key
        print(f"Best Cross-Validation Score: {cv_score_cu:.4f}")
        # Duration
        print(f"Training Duration: {cu_times.get(model_name, 'N/A'):.2f} seconds")
        # Confusion Matrix
        print("Confusion Matrix (Test Set):")
        cm_cu = confusion_matrix(y_test, y_pred_test_cu)
        disp_cu = ConfusionMatrixDisplay(confusion_matrix=cm_cu, display_labels=best_cu_model.classes_)
        disp_cu.plot()
        plt.title(f"cuML {model_name} - Confusion Matrix")
        plt.show()
    elif has_cuml and (y_train is None or y_test is None):
        print(f"Skipping evaluation for cuML {model_name} as y_train or y_test is not defined.")
        cu_pred[model_name] = None
        cu_pred_times[model_name] = 0
    elif has_cuml:
        print(f"cuML model {model_name} not trained or available for evaluation.")
        cu_pred[model_name] = None
        cu_pred_times[model_name] = 0
    else:
        print(f"cuML {model_name} was not run as cuML is not available.")
        cu_pred[model_name] = None
        cu_pred_times[model_name] = 0

    print("\n" + "="*40 + "\n")

print("\nSummary of Accuracies and Times (if models were trained and y was defined):")
print("scikit-learn Train Accuracies:", sk_train_accuracy)
print("CuML Train Accuracies:", cu_train_accuracy)
print("scikit-learn Test Accuracies:", sk_test_accuracy)
print("CuML Test Accuracies:", cu_test_accuracy)
print("scikit-learn Cross-Validation Accuracies:", sk_crossValidation_accuracy)
print("CuML Cross-Validation Accuracies:", cu_crossValidation_accuracy)
print("scikit-learn Training Times:", sk_times)
print("cuML Training Times:", cu_times)
print("scikit-learn Prediction Times:", sk_pred_times)
print("cuML Prediction Times:", cu_pred_times)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import pandas as pd
import numpy as np

# This cell handles plotting. It will attempt to use a 'Date'/'Time' derived axis or fallback to index.
# Classification tracks are commented out as 'y' is not defined for the solar system data.

# TODO: Define a suitable 'y_axis_plot' and 'y_axis_label' for the new dataset.
# Example: If 'Date' and 'Time' columns exist and can form a datetime index:
if not generate_mock_data and 'Date' in data.columns and 'Time' in data.columns:
    try:
        # Attempt to parse Date and Time into a single datetime series
        datetime_series = pd.to_datetime(data['Date'] + ' ' + data['Time'], errors='coerce')
        data_for_plot = data.loc[datetime_series.sort_values().index].copy() # Use .copy() for a new DataFrame
        X_for_plot = X.loc[datetime_series.sort_values().index].copy()
        # For plotting, a simple numerical sequence based on sorted time might be best if time intervals are not uniform
        y_axis_plot = np.arange(len(data_for_plot))
        y_axis_label = "Time Sequence Index"
        print("Using Time Sequence Index for y-axis of plots.")
    except Exception as e:
        print(f"Could not create datetime series from 'Date' and 'Time': {e}. Using DataFrame index for y-axis.")
        data_for_plot = data.copy()
        X_for_plot = X.copy()
        y_axis_plot = data_for_plot.index # Fallback to index
        y_axis_label = "Index"
elif generate_mock_data and 'Y_AXIS_MOCK' in data.columns:
    data_for_plot = data.copy()
    X_for_plot = X.copy()
    y_axis_plot = data_for_plot['Y_AXIS_MOCK']
    y_axis_label = "Mock Y Axis"
    print("Using Mock Y Axis for y-axis of plots.")
else:
    print("Warning: 'Date' and 'Time' columns not found or using mock data without Y_AXIS_MOCK. Using DataFrame index for y-axis of feature tracks.")
    data_for_plot = data.copy()
    X_for_plot = X.copy()
    y_axis_plot = data_for_plot.index # Fallback to index
    y_axis_label = "Index"

feature_names_plot = X_for_plot.columns.tolist()

# --- Figure 1: Feature Tracks ---
num_feature_plots = len(feature_names_plot)
if num_feature_plots > 0:
    fig_features, axes_features = plt.subplots(1, num_feature_plots, figsize=(num_feature_plots * 2.5, 10), sharey=True)
    if num_feature_plots == 1: axes_features = [axes_features] # Ensure axes_features is always a list
    fig_features.suptitle("Feature Tracks", fontsize=16, y=0.98)

    for i, feature_name in enumerate(feature_names_plot):
        ax = axes_features[i]
        ax.plot(data_for_plot[feature_name], y_axis_plot)
        ax.set_title(feature_name, fontsize=10)
        ax.set_xlabel("Value", fontsize=8)
        if i == 0:
            ax.set_ylabel(y_axis_label, fontsize=10)
        ax.grid(True, linestyle='--', alpha=0.7)
        ax.tick_params(axis='x', labelsize=8)
        ax.tick_params(axis='y', labelsize=8)

    if num_feature_plots > 0 : axes_features[0].invert_yaxis() # Invert if it makes sense (e.g. time sequence or depth)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
else:
    print("No features to plot for feature tracks.")

# --- Figure 2: Scaled Feature Tracks ---
if num_feature_plots > 0:
    fig_scaled_features, axes_scaled_features = plt.subplots(1, num_feature_plots, figsize=(num_feature_plots * 2.5, 10), sharey=True)
    if num_feature_plots == 1: axes_scaled_features = [axes_scaled_features]
    fig_scaled_features.suptitle("Scaled Feature Tracks (After Quantile Transformation)", fontsize=16, y=0.98)

    for i, feature_name in enumerate(feature_names_plot):
        ax = axes_scaled_features[i]
        ax.plot(X_for_plot[feature_name], y_axis_plot)
        ax.set_title(feature_name, fontsize=10)
        ax.set_xlabel("Value", fontsize=8)
        if i == 0:
            ax.set_ylabel(y_axis_label, fontsize=10)
        ax.grid(True, linestyle='--', alpha=0.7)
        ax.tick_params(axis='x', labelsize=8)
        ax.tick_params(axis='y', labelsize=8)

    if num_feature_plots > 0 : axes_scaled_features[0].invert_yaxis()
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
else:
    print("No features to plot for scaled feature tracks.")

# --- Figure 3: Classification Tracks (Commented out) ---
print("\nClassification Tracks plotting is skipped as 'y' is not defined for the solar system data or task is not classification.")
# if y is not None and not y.empty: # Check if y is defined and not empty
#     all_predictions_to_plot = {}
#     all_predictions_to_plot['True Labels'] = y.loc[X_for_plot.index] # Align with X_for_plot for consistent y-axis
#     for model_name, preds_array in sk_pred.items():
#         if preds_array is not None:
#             all_predictions_to_plot[f'SK: {model_name}'] = pd.Series(preds_array, index=X.index).loc[X_for_plot.index]
#     if has_cuml and cu_pred:
#         for model_name, preds_cu_array in cu_pred.items():
#             if preds_cu_array is not None:
#                 all_predictions_to_plot[f'CU: {model_name}'] = pd.Series(preds_cu_array, index=X.index).loc[X_for_plot.index]
#
#     num_classification_plots = len(all_predictions_to_plot)
#     if num_classification_plots > 0:
#         fig_class, axes_class = plt.subplots(1, num_classification_plots, figsize=(num_classification_plots * 1.2, 10), sharey=True)
#         if num_classification_plots == 1: axes_class = [axes_class]
#         fig_class.suptitle("Classification Tracks", fontsize=16, y=0.98)
#
#         plot_order = ['True Labels'] + [k for k in all_predictions_to_plot.keys() if k != 'True Labels']
#
#         unique_classes = np.sort(y.unique())
#         if len(unique_classes) == 1: plot_colors = ['gold']
#         elif len(unique_classes) == 2: plot_colors = ['gold', 'darkorange']
#         else: 
#             cmap_tab10 = plt.cm.get_cmap('tab10', len(unique_classes))
#             plot_colors = [mcolors.to_hex(cmap_tab10(i)) for i in range(len(unique_classes))]
#         class_to_int = {cls: i for i, cls in enumerate(unique_classes)}
#         cmap_listed = mcolors.ListedColormap(plot_colors[:len(unique_classes)])
#
#         for i, title in enumerate(plot_order):
#             predictions_data = all_predictions_to_plot[title]
#             ax = axes_class[i]
#             mapped_preds = predictions_data.map(class_to_int).fillna(-1) # Use aligned predictions_data
#             labels_int = mapped_preds.values.reshape(-1, 1)
#             vmin = 0
#             vmax = len(plot_colors) - 1
#
#             ax.imshow(labels_int, aspect='auto', cmap=cmap_listed,
#                       extent=[0, 1, y_axis_plot.max(), y_axis_plot.min()], # Use y_axis_plot for extent
#                       interpolation='none', vmin=vmin, vmax=vmax)
#
#             ax.set_title(title, fontsize=10)
#             ax.set_xticks([])
#             ax.set_xlabel("")
#             ax.tick_params(axis='y', labelsize=8)
#             if i == 0: ax.set_ylabel(y_axis_label, fontsize=10)
#
#         if num_classification_plots > 0: axes_class[0].invert_yaxis()
#
#         if len(unique_classes) > 0:
#             patches = [plt.Rectangle((0,0),1,1, color=plot_colors[class_to_int[cls]]) for cls in unique_classes]
#             legend_labels = [str(cls) for cls in unique_classes]
#             fig_class.legend(patches, legend_labels, loc='lower center', ncol=len(unique_classes), bbox_to_anchor=(0.5, -0.02), title="Classes")
#
#         plt.tight_layout(rect=[0, 0.03, 1, 0.93])
#         plt.show()
# else:
#     print("No classification data to plot (y is None or empty).")


# Shutdown colab runtime

In [None]:
# Execution time
end_notebook = time.time()
print(f"Total notebook execution time: {end_notebook - start_notebook:.2f} seconds")

In [None]:
if is_running_on_colab:
    from google.colab import runtime
    runtime.unassign()