# Retrieve data

In [1]:
# Detect Google Colab by probing for its `google.colab` package.
try:
    from google.colab import drive
except ImportError:
    # Package missing: the notebook is running outside Colab.
    is_running_on_colab = False
else:
    is_running_on_colab = True

# On Colab the data is read from Google Drive, so mount it first.
if is_running_on_colab:
    drive.mount('/content/drive')


# Data Preparation

In [2]:

import time
start_notebook = time.time()
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [3]:
# Data-source configuration.
generate_mock_data = False  # Set to True to generate mock data instead of reading the CSV
max_mock_depth_ft = 2000  # Maximum depth (ft) covered by the mock data
mock_random_seed = 42  # Fixed seed so the mock data is identical on every run
well_name = "LLB-10"

if generate_mock_data:
    print(f"Generating mock data up to {max_mock_depth_ft} ft for well {well_name}...")
    mock_depth_step = 0.5
    mock_dept_values = np.arange(0, max_mock_depth_ft, mock_depth_step)
    num_mock_rows = len(mock_dept_values)

    # Dedicated seeded generator: reproducible and independent of NumPy's global random state.
    mock_rng = np.random.RandomState(mock_random_seed)

    mock_data_dict = {'DEPT': mock_dept_values}

    feature_cols_for_mock = ['CALI','DRHO','GR','MR','NPHI_corr','PEF','RHOB_CORR','ROP']

    # Every mock feature is uniform noise in [0, 100); the label is a random 0/1 value.
    for col in feature_cols_for_mock:
        mock_data_dict[col] = mock_rng.rand(num_mock_rows) * 100

    mock_data_dict['hydrocarbon_formation_class'] = mock_rng.randint(0, 2, num_mock_rows)

    data = pd.DataFrame(mock_data_dict)

    print(f"Mock data generated for well {well_name} with {num_mock_rows} rows and columns: {list(data.columns)}.")
    print("Mock data head:")
    print(data.head())
else:
    if is_running_on_colab:
        # Load data from Google Drive, if running on Google Colab
        colab_repo_dir = "/content/drive/MyDrive/riset-fttm-gdrive/cuml-tf-model-hydrocarbon-prediction"
        data = pd.read_csv(f"{colab_repo_dir}/data/interpreted/interpreted_{well_name}.csv", sep=',')
    else:
        # Load data from local directory
        data = pd.read_csv(f"./data/interpreted/interpreted_{well_name}.csv", sep=',')

In [4]:
# Select the six columns that are used as model features.
df=data[['DRHO','GR','MR','NPHI_corr','PEF','RHOB_CORR']]

In [5]:
# Display the selected feature columns.
df

Unnamed: 0,DRHO,GR,MR,NPHI_corr,PEF,RHOB_CORR
0,0.051,88.200,0.875,0.4841,2.570,2.103
1,0.050,85.650,0.874,0.4744,2.582,2.130
2,0.064,79.358,0.900,0.4845,2.594,2.177
3,0.077,74.004,0.917,0.5475,2.613,2.184
4,0.081,78.938,0.973,0.6065,2.660,2.142
...,...,...,...,...,...,...
7103,0.206,48.022,1.943,0.3222,4.039,2.595
7104,0.158,51.742,1.981,0.3199,3.960,2.555
7105,0.089,54.041,2.015,0.3278,3.849,2.477
7106,0.052,52.710,1.962,0.3315,3.949,2.432


## Train/Test Splitting

In [6]:
# `data` is the full well DataFrame and `df` holds the feature columns extracted from it.
X, y = df, data['hydrocarbon_formation_class']  # features, label

# Hold out 20% of the rows as the test set, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [7]:
# Summary statistics of the training features before oversampling and transformation.
X_train.describe()

Unnamed: 0,DRHO,GR,MR,NPHI_corr,PEF,RHOB_CORR
count,5686.0,5686.0,5686.0,5686.0,5686.0,5686.0
mean,0.153765,60.130491,1.433918,0.410088,2.969691,2.346557
std,0.103196,11.080574,1.067113,0.052479,0.702442,0.109542
min,-0.196,17.78,0.561,0.1367,-0.202,1.6204
25%,0.091,52.84475,1.09,0.3829,2.71825,2.288
50%,0.134,62.1435,1.226,0.4085,2.928,2.353
75%,0.187,68.1295,1.433,0.442,3.14975,2.41
max,0.93,90.688,24.142,0.6239,16.001,3.433


## Data Imbalance Handling

In [8]:
from imblearn.over_sampling import SMOTE

In [9]:
# Oversample the training split with SMOTE; the test split is left untouched.
# X_train and y_train come from the train/test split cell above.
smote = SMOTE(random_state=42)

print("Original training data shape:", X_train.shape, y_train.shape)
print("Original training class distribution:\n", y_train.value_counts())

X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("\nShape of training data after SMOTE:", X_train_smote.shape, y_train_smote.shape)
print("Class distribution after SMOTE:\n", y_train_smote.value_counts())

# From here on the training set is the oversampled one.
X_train, y_train = X_train_smote, y_train_smote

Original training data shape: (5686, 6) (5686,)
Original training class distribution:
 hydrocarbon_formation_class
0    5326
1     360
Name: count, dtype: int64

Shape of training data after SMOTE: (10652, 6) (10652,)
Class distribution after SMOTE:
 hydrocarbon_formation_class
0    5326
1    5326
Name: count, dtype: int64


## Apply Quantile Transformation

In [10]:
from sklearn.preprocessing import QuantileTransformer
def transform_quantile(X_train, X_test, X, random_state=None, return_transformer=False):
    """
    Quantile-transform three feature frames to a normal output distribution.

    The transformer is fitted on ``X_train`` only and then applied to all three
    frames, so no information from ``X_test`` or ``X`` enters the fit.

    Args:
        X_train: Training features (DataFrame); used to fit the transformer.
        X_test: Test features (DataFrame).
        X: Full feature set (DataFrame).
        random_state: Optional seed forwarded to ``QuantileTransformer``. The default
            ``None`` keeps the previous (unseeded) behaviour.
        return_transformer: When True, the fitted transformer is returned as a fourth
            element so it can be reused on other data (e.g. another well).

    Returns:
        ``(X_train, X_test, X)`` as transformed DataFrames that keep the original
        column names and index; with ``return_transformer=True`` the fitted
        ``QuantileTransformer`` is appended.
    """
    qt_transformer = QuantileTransformer(output_distribution='normal', random_state=random_state)
    qt_transformer.fit(X_train)

    def _transform(frame):
        # transform() yields a bare array, so restore the frame's labels explicitly.
        return pd.DataFrame(
            qt_transformer.transform(frame),
            columns=frame.columns.values,
            index=frame.index.values,
        )

    transformed = (_transform(X_train), _transform(X_test), _transform(X))
    if return_transformer:
        return transformed + (qt_transformer,)
    return transformed

In [11]:
X_train, X_test, X = transform_quantile(X_train, X_test, X)

## Feature Scaling

Karena menggunakan Quantile transformation dengan output Gaussian, masing-masing kolom secara otomatis ditransformasi ke distribusi normal baku, yaitu distribusi normal dengan rataan nol dan standar deviasi 1. Oleh karena itu, tidak diperlukan scaling tambahan.

In [12]:
# Summary statistics of the training features after oversampling and quantile transformation.
X_train.describe()

Unnamed: 0,DRHO,GR,MR,NPHI_corr,PEF,RHOB_CORR
count,10652.0,10652.0,10652.0,10652.0,10652.0,10652.0
mean,0.000753,-0.000829,0.002739,-0.001097,0.001873,0.003177
std,0.998325,1.000664,1.001228,1.000984,1.003524,0.998344
min,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
25%,-0.673498,-0.67125,-0.671135,-0.673741,-0.675934,-0.668763
50%,0.001318,-0.001634,0.004335,0.001822,-0.000237,0.005675
75%,0.676854,0.666379,0.678364,0.672159,0.680013,0.673702
max,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338


# Training setup

In [13]:
import time
import numpy as np
from sklearn.model_selection import GridSearchCV


# Result containers, one pair per quantity: `sk_*` for scikit-learn, `cu_*` for cuML.
# Each dictionary is keyed by model name.

# Accuracy on the training and test splits
sk_train_accuracy, sk_test_accuracy = {}, {}
cu_train_accuracy, cu_test_accuracy = {}, {}

# Cross-validation accuracy
sk_crossValidation_accuracy, cu_crossValidation_accuracy = {}, {}

# Fitted models
sk_models, cu_models = {}, {}

# Training durations
sk_times, cu_times = {}, {}

# Predictions and prediction durations
sk_pred, cu_pred = {}, {}
sk_pred_times, cu_pred_times = {}, {}

In [14]:
# Check that cuML can be imported and can actually fit a model.
has_cuml = False
try:
    import cuml
    # Minimal dummy data for a smoke-test fit
    dummy_data = np.array([[1, 2], [1.5, 1.8], [5, 8], [8, 8]])
    kmeans = cuml.KMeans(n_clusters=2)
    kmeans.fit(dummy_data)
except ImportError:
    print("cuML not found. Please ensure cuML is installed.")
except Exception as e:
    # cuML imported, but creating or fitting the estimator failed.
    print(f"cuML couldn't be initialized or used. Error: {e}")
else:
    has_cuml = True
    print("cuML is found and working")


cuML not found. Please ensure cuML is installed.


# Models

## SVM

In [15]:
# Key under which this section stores its fitted model and timings.
model_name = "SVM"

In [16]:
from sklearn.svm import SVC as SklearnSVC
# Attempt to import cuML's SVC
try:
    from cuml.svm import SVC as cuMLSVC
except ImportError:
    print("cuML SVC not available. Please ensure cuML is installed and compatible with your environment.")

cuML SVC not available. Please ensure cuML is installed and compatible with your environment.


In [17]:
# Parameter grid shared by the scikit-learn and cuML SVM searches
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto']
}

# 1) scikit-learn SVM with GridSearchCV
sk_models[model_name] = GridSearchCV(
    estimator=SklearnSVC(kernel='rbf'),
    param_grid=param_grid,
    cv=5,
    verbose=3,
    n_jobs=-1
)

time_start = time.time()
sk_models[model_name].fit(X_train, y_train)
time_end = time.time()
sk_times[model_name] = time_end - time_start

# 2) cuML SVM with the same GridSearchCV
if not has_cuml:
    print("cuML is not installed or GPU not available. Please install RAPIDS cuML to run this benchmark.")
elif 'cuMLSVC' not in globals():
    # The cuML smoke test passed but `from cuml.svm import SVC` failed in the import cell;
    # skip instead of crashing with a NameError (same policy as the KNN and RF sections).
    print(f"cuML SVC not available. Skipping cuML {model_name} training.")
else:
    try:
        cu_models[model_name] = GridSearchCV(
            estimator=cuMLSVC(kernel='rbf'),
            param_grid=param_grid,
            cv=5,
            verbose=3,
            # Note: cuML estimator runs on GPU; this grid search runs on CPU orchestrating GPU calls
            n_jobs=1  # avoid multiprocessing issues with GPU
        )

        time_start = time.time()
        cu_models[model_name].fit(X_train, y_train)
        time_end = time.time()
        cu_times[model_name] = time_end - time_start
        print(f"cuml GridSearchCV training time ({model_name}) : {cu_times[model_name]:.2f} seconds")
        print(f"cuml Best parameters ({model_name}): {cu_models[model_name].best_params_}")
    except Exception as e:
        print(f"An error occurred during cuML {model_name} training: {e}")
        # Drop partial results so later cells never evaluate an unfitted search.
        cu_models.pop(model_name, None)
        cu_times.pop(model_name, None)

print(f"scikit-learn GridSearchCV training time ({model_name}) : {sk_times[model_name]:.2f} seconds")
print(f"scikit-learn Best parameters ({model_name}): {sk_models[model_name].best_params_}")

Fitting 5 folds for each of 6 candidates, totalling 30 fits
cuML is not installed or GPU not available. Please install RAPIDS cuML to run this benchmark.
scikit-learn GridSearchCV training time (SVM) : 5.80 seconds
scikit-learn Best parameters (SVM): {'C': 10, 'gamma': 'auto'}
cuML is not installed or GPU not available. Please install RAPIDS cuML to run this benchmark.
scikit-learn GridSearchCV training time (SVM) : 5.80 seconds
scikit-learn Best parameters (SVM): {'C': 10, 'gamma': 'auto'}


## K-Nearest Neighbors

In [18]:
# Key under which this section stores its fitted model and timings.
model_name = "KNN"

In [19]:
from sklearn.neighbors import KNeighborsClassifier as SklearnKNeighborsClassifier
# Attempt to import cuML's KNeighborsClassifier
try:
    from cuml.neighbors import KNeighborsClassifier as cuMLKNeighborsClassifier
    # has_cuml is already defined from SVM section, assuming if SVM cuml is available, KNN cuml is too.
except ImportError:
    # If cuML was previously found but KNN specific part is missing, update has_cuml for KNN context if necessary
    # For simplicity, we rely on the initial has_cuml check. If specific components are missing,
    # the cuML KNN block will be skipped or error out, which is acceptable.
    # A more robust check could be:
    # try: from cuml.svm import SVC as cuMLSVC; has_cuml_svm = True except: has_cuml_svm = False
    # try: from cuml.neighbors import KNeighborsClassifier as cuMLKNeighborsClassifier; has_cuml_knn = True except: has_cuml_knn = False
    pass # Rely on global has_cuml

In [20]:
# Parameter grid for KNN (5 x 2 x 2 = 20 candidates), shared by both libraries
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# 1) scikit-learn KNN with GridSearchCV
# NOTE(review): X_train was oversampled with SMOTE before this search, so synthetic
# samples can end up in the validation folds and the CV score may be optimistic.
sk_models[model_name] = GridSearchCV(
    estimator=SklearnKNeighborsClassifier(),
    param_grid=param_grid_knn,
    cv=5,
    verbose=3,
    n_jobs=-1
)

# Wall-clock time of the whole grid search (all folds and candidates)
time_start = time.time()
sk_models[model_name].fit(X_train, y_train)
time_end = time.time()
sk_times[model_name] = time_end - time_start

# 2) cuML KNN with the same GridSearchCV (adapted for cuML)
if has_cuml:
    try:
        # Bare name lookup: raises NameError (handled below) when the cuML KNN import failed
        cuMLKNeighborsClassifier

        cu_models[model_name] = GridSearchCV(
            estimator=cuMLKNeighborsClassifier(), # cuML KNN
            param_grid=param_grid_knn, # Same grid as scikit-learn; NOTE(review): confirm cuML accepts every value
            cv=5,
            verbose=3,
            n_jobs=1 # Single job: safer for GPU resources with GridSearchCV
        )

        time_start = time.time()
        # X_train and y_train are passed as pandas objects.
        # NOTE(review): if cuML rejects them, convert first with .to_numpy().
        cu_models[model_name].fit(X_train, y_train)
        time_end = time.time()
        cu_times[model_name] = time_end - time_start
        print(f"cuml GridSearchCV training time ({model_name}) : {cu_times[model_name]:.2f} seconds")
        print(f"cuml Best parameters ({model_name}): {cu_models[model_name].best_params_}")
    except NameError: # cuMLKNeighborsClassifier was not imported (also catches any other NameError raised above)
        print(f"cuML KNeighborsClassifier not available. Skipping cuML {model_name} training.")
    except Exception as e:
        print(f"An error occurred during cuML {model_name} training: {e}")
        # Remove partial entries so later cells do not use a model whose fit failed
        if model_name in cu_models: del cu_models[model_name]
        if model_name in cu_times: del cu_times[model_name]

else:
    print(f"cuML is not installed or GPU not available. Skipping cuML {model_name} benchmark.")

print(f"scikit-learn GridSearchCV training time ({model_name}) : {sk_times[model_name]:.2f} seconds")
print(f"scikit-learn Best parameters ({model_name}): {sk_models[model_name].best_params_}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits
cuML is not installed or GPU not available. Skipping cuML KNN benchmark.
scikit-learn GridSearchCV training time (KNN) : 0.64 seconds
scikit-learn Best parameters (KNN): {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
cuML is not installed or GPU not available. Skipping cuML KNN benchmark.
scikit-learn GridSearchCV training time (KNN) : 0.64 seconds
scikit-learn Best parameters (KNN): {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}


## Random Forest

In [21]:
# Key under which this section stores its fitted model and timings.
model_name = "RF"

In [22]:
from sklearn.ensemble import RandomForestClassifier as SklearnRandomForestClassifier
# Attempt to import cuML's RandomForestClassifier
try:
    from cuml.ensemble import RandomForestClassifier as cuMLRandomForestClassifier
    # has_cuml is already defined, assuming if previous cuML models are available, RF is too.
except ImportError:
    # Pass, relying on global has_cuml. Specific check for cuMLRandomForestClassifier will happen in the training block.
    pass

In [23]:
# Parameter grid for Random Forest (2 x 3 x 2 x 2 = 24 candidates), shared by both libraries
param_grid_rf = {
    'n_estimators': [100, 200], # Number of trees in the forest
    'max_depth': [None, 10, 20],    # Maximum depth of the tree
    'min_samples_split': [2, 5], # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2],   # Minimum number of samples required to be at a leaf node
    # 'max_features' is deliberately left out of the grid so both libraries use their defaults.
    # NOTE(review): the accepted 'max_features' values may differ between scikit-learn
    # and cuML - verify before adding it to the grid.
}

# 1) scikit-learn RandomForest with GridSearchCV
sk_models[model_name] = GridSearchCV(
    estimator=SklearnRandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    cv=5, # 5-fold CV, same as the other models
    verbose=3,
    n_jobs=-1
)

# Wall-clock time of the whole grid search (all folds and candidates)
time_start = time.time()
sk_models[model_name].fit(X_train, y_train)
time_end = time.time()
sk_times[model_name] = time_end - time_start


# 2) cuML RandomForest with GridSearchCV (adapted for cuML)
if has_cuml:
    try:
        # Bare name lookup: raises NameError (handled below) when the cuML RF import failed
        cuMLRandomForestClassifier

        # The same grid is reused for cuML.
        # NOTE(review): confirm that cuML's RandomForestClassifier accepts every value in
        # param_grid_rf (for example max_depth=None) before relying on this benchmark.

        cu_models[model_name] = GridSearchCV(
            estimator=cuMLRandomForestClassifier(random_state=42), # cuML RandomForest
            param_grid=param_grid_rf, # Same grid as scikit-learn
            cv=5, # Using 5-fold CV
            verbose=3,
            n_jobs=1 # Safer for GPU resources with GridSearchCV
        )

        time_start = time.time()
        cu_models[model_name].fit(X_train, y_train) # X_train, y_train are passed as pandas objects
        time_end = time.time()
        cu_times[model_name] = time_end - time_start
        print(f"cuml GridSearchCV training time ({model_name}) : {cu_times[model_name]:.2f} seconds")
        print(f"cuml Best parameters ({model_name}): {cu_models[model_name].best_params_}")
    except NameError: # class was not imported (also catches any other NameError raised above)
        print(f"cuML RandomForestClassifier not available. Skipping cuML {model_name} training.")
    except Exception as e:
        print(f"An error occurred during cuML {model_name} training: {e}")
        # Remove partial entries so later cells do not use a model whose fit failed
        if model_name in cu_models: del cu_models[model_name]
        if model_name in cu_times: del cu_times[model_name]
else:
    print(f"cuML is not installed or GPU not available. Skipping cuML {model_name} benchmark.")

print(f"scikit-learn GridSearchCV training time ({model_name}) : {sk_times[model_name]:.2f} seconds")
print(f"scikit-learn Best parameters ({model_name}): {sk_models[model_name].best_params_}")

Fitting 5 folds for each of 24 candidates, totalling 120 fits
cuML is not installed or GPU not available. Skipping cuML RF benchmark.
scikit-learn GridSearchCV training time (RF) : 24.72 seconds
scikit-learn Best parameters (RF): {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
cuML is not installed or GPU not available. Skipping cuML RF benchmark.
scikit-learn GridSearchCV training time (RF) : 24.72 seconds
scikit-learn Best parameters (RF): {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


# Model Evaluation Framework

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
import matplotlib.pyplot as plt

def load_well_data(well_name, generate_mock_data=False, max_mock_depth_ft=2000, is_running_on_colab=False, random_state=None):
    """
    Load the interpreted data of one well from CSV, or generate mock data instead.

    Args:
        well_name: Name of the well (e.g., 'LLD-14', 'LLB-10'); used in the CSV file name
        generate_mock_data: When True, no file is read and random mock data is returned
        max_mock_depth_ft: Maximum depth for mock data (sampled every 0.5 ft from 0)
        is_running_on_colab: When True, the CSV is read from the mounted Google Drive
            folder instead of the local ``./data`` directory
        random_state: Optional integer seed for the mock data. With the default ``None``
            the values are drawn from NumPy's global random state, as before.

    Returns:
        DataFrame with well data
    """
    if not generate_mock_data:
        # Only the root folder differs between Colab and a local run.
        if is_running_on_colab:
            data_root = "/content/drive/MyDrive/riset-fttm-gdrive/cuml-tf-model-hydrocarbon-prediction"
        else:
            data_root = "."
        return pd.read_csv(f"{data_root}/data/interpreted/interpreted_{well_name}.csv", sep=',')

    print(f"Generating mock data up to {max_mock_depth_ft} ft for well {well_name}...")
    # A seeded RandomState makes the mock data reproducible; `np.random` itself exposes
    # the same rand/randint functions backed by the global state.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    mock_depth_step = 0.5
    mock_dept_values = np.arange(0, max_mock_depth_ft, mock_depth_step)
    num_mock_rows = len(mock_dept_values)

    mock_data_dict = {'DEPT': mock_dept_values}
    feature_cols_for_mock = ['CALI','DRHO','GR','MR','NPHI_corr','PEF','RHOB_CORR','ROP']

    # Every mock feature is uniform noise in [0, 100); the label is a random 0/1 value.
    for col in feature_cols_for_mock:
        mock_data_dict[col] = rng.rand(num_mock_rows) * 100

    mock_data_dict['hydrocarbon_formation_class'] = rng.randint(0, 2, num_mock_rows)
    return pd.DataFrame(mock_data_dict)

def prepare_well_features(data, feature_columns=None):
    """
    Extract features and labels from well data.

    Args:
        data: DataFrame with well data
        feature_columns: Sequence of feature column names (list, tuple, ...). Defaults to
            ['DRHO','GR','MR','NPHI_corr','PEF','RHOB_CORR'] when omitted.

    Returns:
        Tuple of (features_df, labels_series)

    Raises:
        KeyError: If a feature column or the 'hydrocarbon_formation_class' label
            column is missing from ``data``.
    """
    if feature_columns is None:
        # Built per call so no mutable default is shared between calls.
        feature_columns = ['DRHO','GR','MR','NPHI_corr','PEF','RHOB_CORR']
    # A tuple would be looked up as one single key by DataFrame.__getitem__, so force a list.
    feature_columns = list(feature_columns)

    required_columns = feature_columns + ['hydrocarbon_formation_class']
    missing_columns = [col for col in required_columns if col not in data.columns]
    if missing_columns:
        raise KeyError(f"Well data is missing required column(s): {missing_columns}")

    features = data[feature_columns]
    labels = data['hydrocarbon_formation_class']
    return features, labels

def apply_quantile_transformation_to_well(well_features, qt_transformer):
    """
    Transform a well's features with an already fitted quantile transformer.

    Args:
        well_features: DataFrame with well features
        qt_transformer: Pre-fitted QuantileTransformer (only ``transform`` is called)

    Returns:
        DataFrame with the transformed values, labelled with the column names and
        index of ``well_features``
    """
    transformed_values = qt_transformer.transform(well_features)
    # Re-attach the original labels to the transformer output.
    return (
        pd.DataFrame(transformed_values)
        .set_axis(well_features.columns.values, axis=1)
        .set_axis(well_features.index.values, axis=0)
    )

def evaluate_model_predictions(model, X_data, y_true, model_name, library_name, well_name, dataset_type):
    """
    Predict with a trained model, time the prediction and compute evaluation metrics.

    Args:
        model: Trained model exposing ``predict``
        X_data: Feature data
        y_true: True labels
        model_name: Name of the model (e.g., 'SVM', 'KNN')
        library_name: Library name ('scikit-learn' or 'cuML'); for 'cuML' the features
            are converted with ``to_numpy()`` before predicting when that method exists
        well_name: Name of the well
        dataset_type: Type of dataset ('train', 'test', 'external')

    Returns:
        Dictionary with the predictions, accuracy, prediction time in seconds,
        classification report (as a dict), confusion matrix and the descriptive
        arguments passed in
    """
    # perf_counter is the monotonic, high-resolution clock intended for measuring durations.
    start_time = time.perf_counter()
    if hasattr(X_data, 'to_numpy') and library_name == 'cuML':
        y_pred = model.predict(X_data.to_numpy())
    else:
        y_pred = model.predict(X_data)
    end_time = time.perf_counter()

    # Bring the predictions to a NumPy array for the scikit-learn metrics.
    # Series-like results are handled first: they also have a `get` method, but it is a
    # keyed lookup that cannot be called without arguments.
    if hasattr(y_pred, 'to_numpy'):
        y_pred = y_pred.to_numpy()
    elif hasattr(y_pred, 'get'):
        # Array types whose `get()` returns the host copy (e.g. CuPy arrays)
        y_pred = y_pred.get()

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    pred_time = end_time - start_time

    # Classification report as a nested dict so callers can read individual metrics
    class_report = classification_report(y_true, y_pred, output_dict=True)

    # Generate confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    return {
        'predictions': y_pred,
        'accuracy': accuracy,
        'prediction_time': pred_time,
        'classification_report': class_report,
        'confusion_matrix': cm,
        'model_name': model_name,
        'library_name': library_name,
        'well_name': well_name,
        'dataset_type': dataset_type
    }

def plot_confusion_matrix(evaluation_result):
    """
    Draw the confusion matrix stored in an evaluation result.

    Args:
        evaluation_result: Dictionary from evaluate_model_predictions
    """
    # Look up every field first so a missing key fails before anything is drawn.
    library_name = evaluation_result['library_name']
    model_name = evaluation_result['model_name']
    well_name = evaluation_result['well_name']
    dataset_type = evaluation_result['dataset_type']
    matrix = evaluation_result['confusion_matrix']

    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=[0, 1]).plot()
    plt.title(f"{library_name} {model_name} - {well_name} ({dataset_type}) - Confusion Matrix")
    plt.show()

def print_evaluation_summary(evaluation_result):
    """
    Print a text summary of one evaluation result.

    Args:
        evaluation_result: Dictionary from evaluate_model_predictions
    """
    res = evaluation_result
    header_lines = [
        f"Model: {res['library_name']} {res['model_name']}",
        f"Well: {res['well_name']} ({res['dataset_type']})",
        f"Accuracy: {res['accuracy']:.4f}",
        f"Prediction Duration: {res['prediction_time']:.4f} seconds",
        "Classification Report:",
    ]
    print("\n".join(header_lines))

    # Only dict entries carry precision/recall/F1; scalar entries such as the overall
    # accuracy are skipped.
    for label, stats in res['classification_report'].items():
        if not isinstance(stats, dict):
            continue
        print("  Class {}: Precision={:.3f}, Recall={:.3f}, F1={:.3f}".format(
            label, stats['precision'], stats['recall'], stats['f1-score']))
    print("-" * 40)

In [None]:
# Fit the quantile transformer that is applied to other wells during evaluation.
# It has to be fitted on the *untransformed* training features. At this point `X_train`
# already holds quantile-transformed values, so fitting on it would learn the quantiles of
# the transformed scores and map raw readings from another well incorrectly.
# `X_train_smote` is the oversampled training set from before the transformation,
# i.e. the same data `transform_quantile` fitted its transformer on.
# NOTE(review): `transform_quantile` neither returns its fitted transformer nor fixes a
# random_state, so this refit may differ marginally from the one used for training if
# QuantileTransformer subsamples the data; returning the fitted transformer from
# `transform_quantile` would remove the need to refit.
from sklearn.preprocessing import QuantileTransformer
qt_transformer = QuantileTransformer(output_distribution='normal')
qt_transformer.fit(X_train_smote)

print("Quantile transformer fitted and stored for evaluation.")
print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# Model Evaluation on Training and Test Sets

In [None]:
# Evaluate all models on training and test sets
evaluation_results = []

# Libraries to evaluate; cuML models are only considered when cuML is usable.
model_libraries = [('scikit-learn', sk_models)]
if has_cuml:
    model_libraries.append(('cuML', cu_models))

# Splits of the training well, evaluated in this order for every model.
evaluation_splits = [('train', X_train, y_train), ('test', X_test, y_test)]

for model_name in sk_models.keys():
    for library_name, fitted_models in model_libraries:
        # A cuML model can be absent when its training was skipped or failed.
        if model_name not in fitted_models:
            continue

        # `well_name` is the training well set in the data-loading cell, so the labels
        # stay correct when a different well is configured there.
        split_results = [
            evaluate_model_predictions(
                fitted_models[model_name], X_split, y_split,
                model_name, library_name, well_name, split_name
            )
            for split_name, X_split, y_split in evaluation_splits
        ]
        evaluation_results.extend(split_results)

        for split_result in split_results:
            print_evaluation_summary(split_result)

TypeError: 'NoneType' object is not subscriptable

# External Well Evaluation (LLD-14)

In [None]:
# Load a second well and evaluate every trained model on it.
# Results are appended to `evaluation_results` with dataset type 'external'.
external_well_name = "LLD-14"

try:
    # Load external well data with the same data-source settings as the training well
    external_data = load_well_data(
        external_well_name, 
        generate_mock_data=generate_mock_data,
        max_mock_depth_ft=max_mock_depth_ft,
        is_running_on_colab=is_running_on_colab
    )
    
    # Prepare features and labels (default feature columns)
    external_features, external_labels = prepare_well_features(external_data)
    
    # Transform with the already fitted `qt_transformer`; nothing is re-fitted on this well
    external_features_transformed = apply_quantile_transformation_to_well(
        external_features, qt_transformer
    )
    
    print(f"External well {external_well_name} data loaded successfully.")
    print(f"Shape: {external_features_transformed.shape}")
    print(f"Class distribution: {external_labels.value_counts()}")
    
    # Evaluate all models on external well
    for model_name in sk_models.keys():
        # Evaluate scikit-learn models
        if model_name in sk_models:
            external_result_sk = evaluate_model_predictions(
                sk_models[model_name], external_features_transformed, external_labels,
                model_name, 'scikit-learn', external_well_name, 'external'
            )
            evaluation_results.append(external_result_sk)
            print_evaluation_summary(external_result_sk)
        
        # Evaluate cuML models if available
        if has_cuml and model_name in cu_models:
            external_result_cu = evaluate_model_predictions(
                cu_models[model_name], external_features_transformed, external_labels,
                model_name, 'cuML', external_well_name, 'external'
            )
            evaluation_results.append(external_result_cu)
            print_evaluation_summary(external_result_cu)

# Best-effort: any failure above (missing file, missing column, prediction error) is
# reported and the notebook continues without the external results.
# NOTE(review): this also hides unexpected errors; results appended before the failure
# stay in `evaluation_results`.
except Exception as e:
    print(f"Error loading or evaluating external well {external_well_name}: {e}")
    print("Continuing with available evaluations...")

# Performance Comparison and Visualization

In [None]:
# Create comprehensive performance comparison
import matplotlib.pyplot as plt

# One summary row per evaluation result. The columns are passed explicitly so the frame
# keeps its schema (and the filters below keep working) when `evaluation_results` is empty.
performance_columns = ['Model', 'Library', 'Well', 'Dataset', 'Accuracy', 'Prediction_Time', 'F1_Score']
performance_data = [
    {
        'Model': result['model_name'],
        'Library': result['library_name'],
        'Well': result['well_name'],
        'Dataset': result['dataset_type'],
        'Accuracy': result['accuracy'],
        'Prediction_Time': result['prediction_time'],
        'F1_Score': result['classification_report']['weighted avg']['f1-score']
    }
    for result in evaluation_results
]

performance_df = pd.DataFrame(performance_data, columns=performance_columns)
print("Performance Summary:")
print(performance_df.round(4))


def _tick_labels(frame):
    """Return one two-line x-axis label (model name above library name) per row."""
    return [f"{model}\n{library}" for model, library in zip(frame['Model'], frame['Library'])]


def _scatter_metric_by_dataset(metric_column, title, ylabel):
    """Scatter one metric on the current axes, with one series per dataset type."""
    has_points = False
    for dataset in ['train', 'test', 'external']:
        subset = performance_df[performance_df['Dataset'] == dataset]
        if not subset.empty:
            plt.scatter(_tick_labels(subset), subset[metric_column], label=dataset, alpha=0.7, s=100)
            has_points = True

    plt.title(title)
    plt.ylabel(ylabel)
    plt.xticks(rotation=45)
    if has_points:  # a legend without any series only produces a warning
        plt.legend()
    plt.grid(True, alpha=0.3)


plt.figure(figsize=(15, 8))

# Subplot 1: Accuracy by Model and Library
plt.subplot(2, 2, 1)
_scatter_metric_by_dataset('Accuracy', 'Model Accuracy Comparison', 'Accuracy')

# Subplot 2: Training Time Comparison
plt.subplot(2, 2, 2)
# Colour is tied to the library of each bar, not to its position, so the colours stay
# meaningful when cuML timings are missing for some or all models.
library_colors = {'scikit-learn': 'skyblue', 'cuML': 'orange'}
training_times = []
model_labels = []
bar_colors = []

for model_name in sk_models.keys():
    timing_sources = [('scikit-learn', sk_times)]
    if has_cuml:
        timing_sources.append(('cuML', cu_times))

    for library_name, library_times in timing_sources:
        if model_name in library_times:
            training_times.append(library_times[model_name])
            model_labels.append(f"{model_name}\n{library_name}")
            bar_colors.append(library_colors[library_name])

if training_times:  # Only plot if we have data
    plt.bar(model_labels, training_times, color=bar_colors)
    plt.title('Training Time Comparison')
    plt.ylabel('Training Time (seconds)')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)

# Subplot 3: Prediction Time Comparison
plt.subplot(2, 2, 3)
test_data = performance_df[performance_df['Dataset'] == 'test']
if not test_data.empty:
    plt.bar(_tick_labels(test_data), test_data['Prediction_Time'], color='lightgreen')
    plt.title('Prediction Time on Test Set')
    plt.ylabel('Prediction Time (seconds)')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)

# Subplot 4: F1-Score Comparison
plt.subplot(2, 2, 4)
_scatter_metric_by_dataset('F1_Score', 'F1-Score Comparison', 'F1-Score')

plt.tight_layout()
plt.show()

# Model Performance Summary

In [None]:
# Generate final summary report
banner = "=" * 80
section_rule = "-" * 40

print(banner)
print("FINAL MODEL PERFORMANCE SUMMARY")
print(banner)

# Training times per model, plus the cuML speed-up where both timings exist
print("\n📊 TRAINING TIMES:")
print(section_rule)
for model_name in sk_models:
    sk_timed = model_name in sk_times
    if sk_timed:
        print(f"{model_name} (scikit-learn): {sk_times[model_name]:.2f} seconds")
    if has_cuml and model_name in cu_times:
        print(f"{model_name} (cuML): {cu_times[model_name]:.2f} seconds")
        if sk_timed:
            speedup = sk_times[model_name] / cu_times[model_name]
            print(f"  → cuML speedup: {speedup:.2f}x")

# Highest-accuracy model on each evaluation set
print("\n🏆 BEST PERFORMING MODELS:")
print(section_rule)

# Best on test set
is_test_row = performance_df['Dataset'] == 'test'
test_results = performance_df[is_test_row]
if not test_results.empty:
    best_test = test_results.loc[test_results['Accuracy'].idxmax()]
    print(f"Test Set: {best_test['Model']} ({best_test['Library']}) - Accuracy: {best_test['Accuracy']:.4f}")

# Best on external well
is_external_row = performance_df['Dataset'] == 'external'
external_results = performance_df[is_external_row]
if not external_results.empty:
    best_external = external_results.loc[external_results['Accuracy'].idxmax()]
    print(f"External Well: {best_external['Model']} ({best_external['Library']}) - Accuracy: {best_external['Accuracy']:.4f}")

# Best cross-validation score of every fitted grid search
print("\n📈 CROSS-VALIDATION SCORES:")
print(section_rule)
for model_name in sk_models:
    if hasattr(sk_models[model_name], 'best_score_'):
        print(f"{model_name} (scikit-learn): {sk_models[model_name].best_score_:.4f}")
    if has_cuml and model_name in cu_models and hasattr(cu_models[model_name], 'best_score_'):
        print(f"{model_name} (cuML): {cu_models[model_name].best_score_:.4f}")

# Total notebook execution time, measured from `start_notebook` set in the first import cell
end_notebook = time.time()
total_time = end_notebook - start_notebook
print(f"\n⏱️  TOTAL NOTEBOOK EXECUTION TIME: {total_time:.2f} seconds ({total_time/60:.2f} minutes)")

print("\n" + banner)
print("ANALYSIS COMPLETE!")
print(banner)