In [3]:
import os

import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import ElasticNetCV, LogisticRegressionCV, LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import MinMaxScaler, RobustScaler

import well_log_analysis.dataset

In [4]:
def train_test(train_wells, test_well, regressor,
               feature_columns=('GR', 'DEN', 'RT'),
               target_columns=('DTCO', 'DTSM')):
    """Fit ``regressor`` on the training wells and predict the held-out well.

    All requested columns (features followed by targets) are scaled with a
    ``RobustScaler`` fitted on the training rows only. The regressor is fitted
    in the scaled space and its predictions are mapped back to the original
    scale before being returned.

    Parameters
    ----------
    train_wells : sequence of str
        Paths of the wells used for fitting. Must contain at least one entry;
        an empty sequence raises ``IndexError``.
    test_well : str
        Path of the well to predict.
    regressor : estimator
        Object exposing ``fit(x, y)`` and ``predict(x)``. It is fitted in
        place.
    feature_columns : sequence of str, optional
        Columns used as regressor inputs.
    target_columns : sequence of str, optional
        Columns to predict.

    Returns
    -------
    (predicted, actual) : tuple of numpy.ndarray
        Predicted and measured target values for ``test_well``, both in the
        original (unscaled) scale, one column per entry of ``target_columns``.

    Notes
    -----
    Assumes every ``WellDataset`` item is a 2-D array with one column per
    requested log, in the requested order -- TODO confirm against
    ``well_log_analysis.dataset.WellDataset``.
    """
    columns = list(feature_columns) + list(target_columns)
    n_features = len(feature_columns)

    def load_well(path):
        # Hand the dataset fresh list objects on every call so no list is
        # shared between dataset instances.
        return well_log_analysis.dataset.WellDataset(path,
                                                     list(columns),
                                                     list(columns),
                                                     sequence_length=1)

    def as_rows(dataset):
        # Stack every item of the dataset along the first axis.
        return np.concatenate([dataset[i] for i in range(len(dataset))], 0)

    # Combine the training wells with the dataset's own "+=" operator.
    train_dataset = load_well(train_wells[0])
    for well in train_wells[1:]:
        train_dataset += load_well(well)

    # Fit the scaler on training rows only, so the test well never influences
    # the scaling statistics.
    scaler = RobustScaler()
    train_scaled = scaler.fit_transform(as_rows(train_dataset))
    regressor.fit(train_scaled[:, :n_features], train_scaled[:, n_features:])

    test_raw = as_rows(load_well(test_well))
    test_scaled = scaler.transform(test_raw)
    x_test = test_scaled[:, :n_features]

    # Force a 2-D prediction so a single-target regressor that returns a flat
    # vector can still be concatenated with the features.
    prediction = np.reshape(regressor.predict(x_test), (len(x_test), -1))

    # The scaler was fitted on features + targets together, so the feature
    # columns are re-attached before inverting the scaling.
    restored = scaler.inverse_transform(np.concatenate([x_test, prediction], 1))

    # Return the measured targets straight from the raw data; scaling them and
    # then un-scaling them again would only add floating-point noise.
    return restored[:, n_features:], test_raw[:, n_features:]
    

In [5]:
dataset_paths = [
    'data/dataset1/J10025.las',
    'data/dataset1/J10035.las',
    'data/dataset1/J10038.las',
    'data/dataset1/J10039.las',
    'data/dataset1/J10051.las',
]

RESULTS_DIR = "results"
MODEL_NAME = "LinearRegression"
# Order matches the target columns returned by train_test.
TARGET_NAMES = ("DTCO", "DTSM")

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before the first write.
os.makedirs(RESULTS_DIR, exist_ok=True)

# Leave-one-well-out evaluation: each well is predicted by a model fitted on
# all the other wells.
for path in dataset_paths:

    reg = LinearRegression()
    multi_reg = MultiOutputRegressor(reg)
    train_datasets = [other for other in dataset_paths if other != path]

    y_pred, y_true = train_test(train_datasets, path, multi_reg)

    well_name = os.path.splitext(os.path.basename(path))[0]
    results_path = os.path.join(RESULTS_DIR, well_name + ".csv")

    # Reuse an existing, non-empty results file so the columns written here
    # are added next to whatever columns it already contains.
    if os.path.exists(results_path) and os.path.getsize(results_path) > 0:
        df = pd.read_csv(results_path, index_col=False)
    else:
        df = pd.DataFrame({})

    # A results file with a different number of rows cannot take the new
    # columns; fail with a message that names the offending file.
    if len(df) not in (0, len(y_true)):
        raise ValueError(
            "{} has {} rows but {} predictions were produced; "
            "delete or regenerate the file".format(results_path, len(df), len(y_true)))

    for column, target in enumerate(TARGET_NAMES):
        df['GroundTruth_' + target] = list(y_true[:, column])
    for column, target in enumerate(TARGET_NAMES):
        df[MODEL_NAME + '_' + target] = list(y_pred[:, column])

    df.to_csv(results_path, index=False)

    print("Performance on {}".format(path))
    for column, target in enumerate(TARGET_NAMES):
        print("MAPE on " + target,
              sklearn.metrics.mean_absolute_percentage_error(y_true[:, column], y_pred[:, column]))


Performance on data/dataset1/J10025.las
MAPE on DTCO 0.12988723257467633
MAPE on DTSM 0.19974300316572413
Performance on data/dataset1/J10035.las
MAPE on DTCO 0.09798206172229423
MAPE on DTSM 0.15894103581295244
Performance on data/dataset1/J10038.las
MAPE on DTCO 0.07669840157543513
MAPE on DTSM 0.13122195802711178
Performance on data/dataset1/J10039.las
MAPE on DTCO 0.0817874679666078
MAPE on DTSM 0.13879475686631465
Performance on data/dataset1/J10051.las
MAPE on DTCO 0.17103671871463622
MAPE on DTSM 0.33132270389593566
