In [None]:
# Mount Google Drive at /content/drive; the input CSV is read from this mount point.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Record the wall-clock start time; the last cell subtracts it to report the total runtime.
import time
start_notebook = time.time()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import cuml
import cupy as cp
import cudf

from scipy.stats import norm
from scipy.stats import shapiro
from sklearn.model_selection import train_test_split

In [None]:
# Well identifier; it is interpolated into the input CSV filename when the data is loaded.
well_name = "LLB-10"

In [None]:
# Load the interpreted table for the selected well from the mounted Drive folder.
interpreted_csv_path = f"/content/drive/MyDrive/riset-fttm-gdrive/cuml-tf-model-hydrocarbon-prediction/data/interpreted/interpreted_{well_name}.csv"
data = pd.read_csv(interpreted_csv_path, sep=',')


In [None]:
# Columns kept as features for the exploration and modelling cells.
feature_columns = [
    'CALI',
    'DRHO',
    'GR',
    'MR',
    'NPHI_corr',
    'PEF',
    'RHOB_CORR',
    'ROP',
]
df = data[feature_columns]
df

In [None]:
# Data structure: shape, column names and dtypes of the full table
for label, value in (
    ("Dimensi DataFrame:", data.shape),
    ("Kolom DataFrame:", data.columns),
    ("Tipe Data:", data.dtypes),
):
    print(label, value)

In [None]:
# Check for missing data: number of null entries in each feature column
print("\nData yang Hilang:\n", df.isnull().sum())

In [None]:
# Descriptive statistics of the feature columns
print("\nStatistik Deskriptif:\n", df.describe(include='all'))

In [None]:
# Histogram of every numeric feature with a normal curve (same mean/std) overlaid.
N_BINS = 30  # used for both the histogram and the curve scaling so they cannot drift apart

for column in df.select_dtypes(include=np.number).columns:
    # histplot, mean() and std() all ignore NaN, so the sample count used to scale
    # the curve must ignore NaN as well.
    values = df[column].dropna()

    plt.figure(figsize=(8, 6))

    # Histogram
    sns.histplot(values, kde=False, bins=N_BINS, color='blue', alpha=0.6, label='Data Histogram')

    # Normal curve, rescaled from density to counts: pdf * n * bin_width.
    # Named curve_x / curve_y so this cell never overwrites the label series `y`.
    mean, std = values.mean(), values.std()
    bin_width = (values.max() - values.min()) / N_BINS
    curve_x = np.linspace(values.min(), values.max(), 1000)
    curve_y = norm.pdf(curve_x, loc=mean, scale=std)
    plt.plot(curve_x, curve_y * len(values) * bin_width, color='red', label='Normal Curve')

    # Labels and title
    plt.title(f'Distribusi {column}', fontsize=14)
    plt.xlabel(column, fontsize=12)
    plt.ylabel('Frekuensi', fontsize=12)
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
shapiro_results = df.apply(lambda col: shapiro(col)[1])  # shapiro() returns (statistic, p-value); index 1 is the p-value
shapiro_results

In [None]:
# Data visualisation: box plots of the feature columns

boxplot_palette = sns.color_palette("Set2", len(df.columns))

# All features on one shared axis
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, palette=boxplot_palette)
plt.title('Boxplot untuk Masing Masing Fitur')
plt.show()

# One panel per feature. The grid is sized from the number of columns so that
# adding or removing a feature does not raise IndexError or leave empty panels.
n_grid_cols = 2
n_grid_rows = max(1, int(np.ceil(len(df.columns) / n_grid_cols)))
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(12, 4.5 * n_grid_rows), squeeze=False)
axes = axes.flatten()

for ax, column, color in zip(axes, df.columns, boxplot_palette):
    # matplotlib's boxplot does not skip NaN, so drop missing values first
    ax.boxplot(df[column].dropna(), patch_artist=True, boxprops=dict(facecolor=color))
    ax.set_title(f'Boxplot {column}')

# Hide any unused panel in the last row
for ax in axes[len(df.columns):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between the features, drawn as an annotated heatmap
correlation_matrix = df.corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Matriks Korelasi')
plt.show()

# Data Preparation

## Splitting

In [None]:
# `df` holds the extracted feature columns; the class label is read from `data`.
y = data['hydrocarbon_formation_class']  # label
X = df  # features

# Stratified 80/20 split into training and test sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)



## Feature Transformation

In [None]:
def histogram_norm(dataframe, judul='', bins=30):
    """Show a histogram with an overlaid normal curve for every numeric column.

    One figure is drawn per numeric column of ``dataframe``. The red curve is the
    normal density with the column's sample mean and standard deviation, rescaled
    from density to counts (density * number of values * bin width) so it is
    comparable to the histogram bars.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose numeric columns are plotted.
    judul : str, optional
        Suffix appended to each figure title.
    bins : int, optional
        Number of equal-width histogram bins; the same value scales the curve.
        Defaults to 30.

    Returns
    -------
    None
        Figures are displayed with ``plt.show()``.
    """
    for column in dataframe.select_dtypes(include=np.number).columns:
        # histplot, mean() and std() ignore NaN; use the same NaN-free sample for
        # the count so the curve is not over-scaled when values are missing.
        values = dataframe[column].dropna()

        plt.figure(figsize=(8, 6))

        # Histogram
        sns.histplot(values, kde=False, bins=bins, color='blue', alpha=0.6, label='Data Histogram')

        # Normal curve with the sample mean / standard deviation
        mean, std = values.mean(), values.std()
        bin_width = (values.max() - values.min()) / bins
        x = np.linspace(values.min(), values.max(), 1000)
        y = norm.pdf(x, loc=mean, scale=std)
        plt.plot(x, y * len(values) * bin_width, color='red', label='Normal Curve')

        # Labels and title
        plt.title(f'Distribusi {column} {judul}', fontsize=14)
        plt.xlabel(column, fontsize=12)
        plt.ylabel('Frekuensi', fontsize=12)
        plt.legend()
        plt.grid(True)
        plt.show()

In [None]:
# Distributions of the training features as produced by the split
histogram_norm(X_train,judul='')

In [None]:
# Summary statistics of the training features
X_train.describe()

### Box-Cox Transformation

In [None]:
from scipy.special import boxcox1p

# Box-Cox transform of (1 + x) with a fixed lambda, applied to every training column.
# X_train itself is left untouched; the result is a new frame.
lam = 0.3
tmp_X_bc = X_train.apply(lambda col: boxcox1p(col, lam))

In [None]:
# Distributions of the training features after the Box-Cox transform
histogram_norm(tmp_X_bc,judul='dengan Box-Cox')

In [None]:
# Shapiro-Wilk p-value of each Box-Cox-transformed column
tmp_X_bc.apply(lambda col: shapiro(col)[1])

### Yeo-Johnson

In [None]:
from sklearn.preprocessing import PowerTransformer

# Yeo-Johnson power transform fitted on the training features (no standardisation).
yj_transformer = PowerTransformer(method='yeo-johnson', standardize=False)

# Build the transformed frame directly with the original labels instead of copying
# column by column through a scratch variable shared with other cells.
yeo_johnson_data = pd.DataFrame(
    yj_transformer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
tmp_X_yj = yeo_johnson_data.copy()

In [None]:
# Distributions of the training features after the Yeo-Johnson transform
histogram_norm(tmp_X_yj,judul='dengan Yeo-Johnson')

In [None]:
# Shapiro-Wilk p-value of each Yeo-Johnson-transformed column
tmp_X_yj.apply(lambda col: shapiro(col)[1])

### Quantile Transformation

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Quantile transform to a normal output distribution, fitted on the training features.
# - random_state pins the transformer's internal random sampling so reruns give the
#   same mapping.
# - n_quantiles is capped at the number of training rows, which is what scikit-learn
#   does anyway (with a warning) when the default of 1000 exceeds the sample count.
qt_transformer = QuantileTransformer(
    n_quantiles=min(1000, len(X_train)),
    output_distribution='normal',
    random_state=42,
)

# Build the transformed frame directly with the original labels instead of copying
# column by column through a scratch variable shared with other cells.
qt_data = pd.DataFrame(
    qt_transformer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
tmp_X_qt = qt_data.copy()

In [None]:
# Distributions of the training features after the quantile transform
histogram_norm(tmp_X_qt,judul='dengan Quantile Transformation')

In [None]:
# Shapiro-Wilk p-value of each quantile-transformed column
tmp_X_qt.apply(lambda col: shapiro(col)[1])

### Selection

In [None]:
from scipy.stats import skew

# Skewness of every feature under each of the three candidate transforms
tmp_bc = [skew(tmp_X_bc[col]) for col in df.columns]
tmp_yj = [skew(tmp_X_yj[col]) for col in df.columns]
tmp_qt = [skew(tmp_X_qt[col]) for col in df.columns]

trans_result = pd.DataFrame({
    'Kolom': df.columns,
    'Skewness setelah transformasi Box Cox': tmp_bc,
    'Skewness setelah transformasi Yeo Johnson': tmp_yj,
    'Skewness setelah transformasi Quantile': tmp_qt,
})
trans_result

In [None]:
from scipy.stats import kurtosis

# Kurtosis of every feature under each of the three candidate transforms
tmp_bc = [kurtosis(tmp_X_bc[col]) for col in df.columns]
tmp_yj = [kurtosis(tmp_X_yj[col]) for col in df.columns]
tmp_qt = [kurtosis(tmp_X_qt[col]) for col in df.columns]

trans_result_kurtosis = pd.DataFrame({
    'Kolom': df.columns,
    'Kurtosis setelah transformasi Box Cox': tmp_bc,
    'Kurtosis setelah transformasi Yeo Johnson': tmp_yj,
    'Kurtosis setelah transformasi Quantile': tmp_qt,
})
trans_result_kurtosis

In [None]:
# Adopt the quantile-transformed frame as the training set.
X_train = tmp_X_qt

# Map the test set through the transformer fitted on the training data, keeping the
# original column and index labels.
# NOTE: this cell overwrites X_test, so running it twice transforms the data twice.
X_test2 = pd.DataFrame(
    qt_transformer.transform(X_test),
    columns=X_test.columns.values,
    index=X_test.index.values,
)
X_test = X_test2

In [None]:
# Apply the fitted quantile transformer to the full feature table, keeping its labels.
X_scaled2 = pd.DataFrame(
    qt_transformer.transform(X),
    columns=X.columns.values,
    index=X.index.values,
)
X_scaled = X_scaled2

In [None]:
# Display the training features
X_train

## Feature Scaling

Karena menggunakan Quantile Transformation dengan keluaran Gaussian, masing-masing kolom secara otomatis ditransformasi ke distribusi normal baku, yaitu distribusi normal dengan rataan nol dan standar deviasi satu. Oleh karena itu, tidak diperlukan scaling tambahan.

In [None]:
# Summary statistics of the training features
X_train.describe()

# Classification

## Setup

In [None]:
# Per-model accuracy registries, keyed by model name; combined into a table at the end.
Train_accuracy, Test_accuracy, CrossValidation_accuracy = {}, {}, {}

In [None]:
# Import necessary CuML libraries for SVM and model evaluation

In [None]:
from cuml.svm import SVC
from cuml.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from cuml.dask.common.utils import persist_across_workers
import cudf
import time
from tqdm.notebook import tqdm

## SVM Model Training with Grid Search CV

We'll implement Support Vector Machine using CuML's GPU-accelerated implementation and optimize hyperparameters using Grid Search Cross-Validation.

In [None]:
# Convert pandas DataFrames to cuDF for GPU processing
X_train_gpu = cudf.DataFrame.from_pandas(X_train)
X_test_gpu = cudf.DataFrame.from_pandas(X_test)
y_train_gpu = cudf.Series(y_train.values)
y_test_gpu = cudf.Series(y_test.values)

In [None]:
# Search space for the SVM hyperparameters; passed to manual_grid_search_cv.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.1, 0.01],
    'degree': [2, 3, 4]  # Only used for poly kernel
}

In [None]:
# Manual grid search implementation for CuML SVM
def manual_grid_search_cv(X, y, param_grid, cv=5):
    """Grid-search a cuML ``SVC`` using stratified k-fold accuracy.

    Parameters
    ----------
    X : cudf.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    y : cudf.Series
        Class labels, positionally aligned with ``X``.
    param_grid : dict
        Candidate values under the keys ``'C'``, ``'kernel'`` and ``'gamma'``,
        plus ``'degree'`` when ``'poly'`` is among the kernels. ``'gamma'`` is
        only expanded for non-linear kernels and ``'degree'`` only for ``'poly'``.
    cv : int, optional
        Number of stratified folds (shuffled with a fixed seed of 42). Default 5.

    Returns
    -------
    dict
        ``'results'``: list of ``{'params', 'mean_cv_score'}`` per candidate;
        ``'best_params'``: parameters of the first candidate with the highest
        mean score (``None`` only if the grid is empty);
        ``'best_score'``: that mean accuracy.
    """
    kfold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    # The fold indices depend only on the data and the fixed seed, so compute them
    # (and the device-to-host copy they need) once instead of once per candidate.
    folds = list(kfold.split(X.to_pandas(), y.to_pandas()))

    # Generate all parameter combinations
    param_combinations = []
    for C in param_grid['C']:
        for kernel in param_grid['kernel']:
            if kernel == 'linear':
                # gamma does not enter the linear kernel; one candidate per C avoids
                # refitting the identical model for every gamma value.
                param_combinations.append({'C': C, 'kernel': kernel})
                continue
            for gamma in param_grid['gamma']:
                if kernel == 'poly':
                    for degree in param_grid['degree']:
                        param_combinations.append(
                            {'C': C, 'kernel': kernel, 'gamma': gamma, 'degree': degree}
                        )
                else:
                    param_combinations.append({'C': C, 'kernel': kernel, 'gamma': gamma})

    results = []
    best_score = 0
    best_params = None

    # Run grid search
    print(f"Evaluating {len(param_combinations)} parameter combinations with {cv}-fold cross-validation")
    for params in tqdm(param_combinations):
        cv_scores = []

        # Perform cross-validation
        for train_idx, val_idx in folds:
            model = SVC(**params)
            model.fit(X.iloc[train_idx], y.iloc[train_idx])

            y_pred = model.predict(X.iloc[val_idx])
            cv_scores.append(float(accuracy_score(y.iloc[val_idx], y_pred)))

        mean_cv_score = sum(cv_scores) / len(cv_scores)
        results.append({'params': params, 'mean_cv_score': mean_cv_score})

        # Always accept the first candidate so best_params is never None for a
        # non-empty grid (even if every score is 0); ties keep the earliest one.
        if best_params is None or mean_cv_score > best_score:
            best_score = mean_cv_score
            best_params = params

    return {'results': results, 'best_params': best_params, 'best_score': best_score}

In [None]:
# Run the grid search on the GPU training data, then report its duration and winner
start_time = time.time()
print("Starting grid search cross-validation...")
grid_search_results = manual_grid_search_cv(X_train_gpu, y_train_gpu, param_grid, cv=5)
elapsed_seconds = time.time() - start_time
print(f"Grid search completed in {elapsed_seconds:.2f} seconds")
print(f"Best parameters: {grid_search_results['best_params']}")
print(f"Best cross-validation score: {grid_search_results['best_score']:.4f}")

In [None]:
# Fit a fresh SVC on the whole training set using the parameters selected by the grid search
best_params = grid_search_results['best_params']
print("Training final model with best parameters...")
final_model = SVC(**best_params)
final_model.fit(X_train_gpu, y_train_gpu)

In [None]:
# Score the final model on the training split
y_train_pred = final_model.predict(X_train_gpu)
train_accuracy = accuracy_score(y_train_gpu, y_train_pred)
print(f"Training accuracy: {train_accuracy:.4f}")

# Score the final model on the held-out test split
y_test_pred = final_model.predict(X_test_gpu)
test_accuracy = accuracy_score(y_test_gpu, y_test_pred)
print(f"Test accuracy: {test_accuracy:.4f}")

# Register the SVM scores in the shared per-model dictionaries
Train_accuracy.update(SVM=train_accuracy)
Test_accuracy.update(SVM=test_accuracy)
CrossValidation_accuracy.update(SVM=grid_search_results['best_score'])

## Model Evaluation and Visualization

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# Copy the GPU predictions to the host once; both sklearn metrics reuse it.
y_test_pred_host = y_test_pred.to_pandas()

# Fix the class order explicitly so the matrix rows/columns always line up with the
# tick labels, even if a class is absent from the test split or never predicted.
class_labels = sorted(y.unique())

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_test_pred_host))

# Confusion Matrix
cm = confusion_matrix(y_test, y_test_pred_host, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix - SVM Model')
plt.tight_layout()
plt.show()

In [None]:
# Side-by-side accuracy table, one row per model, best test accuracy first
accuracy_columns = {
    'Train Accuracy': Train_accuracy,
    'Test Accuracy': Test_accuracy,
    'CV Accuracy': CrossValidation_accuracy,
}
models_comparison = pd.DataFrame(accuracy_columns)

models_comparison.sort_values(by='Test Accuracy', ascending=False)

In [None]:
# Wall-clock time elapsed since the start-time cell ran
end_notebook = time.time()
total_seconds = end_notebook - start_notebook
print(f"Total notebook execution time: {total_seconds:.2f} seconds")