In [1]:
import glob
import pandas as pd
import numpy as np


# Columns kept from every well-log CSV. DEPTH is listed first on purpose:
# cross_val() labels each well by the value at row 0 / column 0 of its frame.
feature_set = ['DEPTH','CALI', 'DENB', 'DRHO', 'GR','RDEEP','NEUT', 'RMICRO', 'DTCOMP']

# glob.glob() yields paths in an arbitrary, OS-dependent order. Sorting keeps
# the fold indices printed by cross_val() stable between runs and machines.
file_name_list = sorted(glob.glob("*.csv"))

# One DataFrame per well, restricted (and re-ordered) to `feature_set`.
file_list = []

for file in file_name_list:
    # skiprows=[1] drops the line directly below the header
    # (presumably a units row -- TODO confirm against the raw files).
    df = pd.read_csv(file, index_col=None, skiprows=[1])
    file_list.append(df[feature_set])

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
import matplotlib.pyplot as plt

def plt_this(y):
    """Draw ``y`` as a line on the current pyplot axes and display the figure."""
    current_axes = plt.gca()
    current_axes.plot(y)
    plt.show()


In [3]:
import os

def save_res(dept, pred_y, err, well, algo_name):
    """Write one fold's predictions and score as CSV files under ``Results/<well>/``.

    Two files are produced:

    * ``<algo_name>.csv``       -- ``dept`` next to the predicted values
    * ``<algo_name>_score.csv`` -- a single-row table holding ``err``

    Parameters
    ----------
    dept : array-like
        Values stored next to the predictions. The callers in this notebook
        pass the first feature column of the held-out well (DEPTH).
    pred_y : array-like
        Predicted NEUT values, same length as ``dept``.
    err : float
        Score of the fold (stored under the column name ``ERROR``).
    well : object
        Identifier of the held-out well; formatted with ``%s`` into the
        directory name.
    algo_name : str
        Base name of the two output files.
    """
    res_path = "Results/%s" % well
    # exist_ok=True replaces the separate isdir() check: no race between the
    # check and the creation, and re-running the notebook is a no-op here.
    os.makedirs(res_path, exist_ok=True)

    # The first column used to be labelled 'NEUT' although it is filled from
    # `dept`; it is named after what it actually contains.
    result = pd.DataFrame({'DEPTH': dept,
                           'Pred_NEUT': pred_y})
    result.to_csv('%s/%s.csv' % (res_path, algo_name))

    score = pd.DataFrame({'ERROR': [err]})
    score.to_csv('%s/%s_score.csv' % (res_path, algo_name))

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import explained_variance_score, r2_score

def cross_val(clf, algo_name, feature_set=['DEPTH','CALI', 'DENB', 'DRHO', 'GR', 'RDEEP', 'RMICRO', 'DTCOMP']):
    """Leave-one-well-out evaluation of a regressor that predicts ``NEUT``.

    Every DataFrame in the module-level ``file_list`` is held out once as the
    test set while a fresh model is trained on the concatenation of all other
    frames. Features are standardised with statistics computed on the training
    wells only. The R^2 score of each fold is printed and handed to
    ``save_res`` together with the predictions; the mean score over all folds
    is printed at the end.

    Parameters
    ----------
    clf : callable
        Zero-argument factory (e.g. an estimator class) returning an unfitted
        model that exposes ``fit(X, y)`` and ``predict(X)``.
    algo_name : str
        Label forwarded to ``save_res`` to name the result files.
    feature_set : list of str
        Columns used as model inputs. The list is copied, never mutated.

    Raises
    ------
    ValueError
        If fewer than two wells are loaded, because a fold needs at least one
        training well and one test well.
    """
    feature_cols = list(feature_set)

    if len(file_list) < 2:
        raise ValueError(
            'cross_val needs at least two wells in file_list, got %d' % len(file_list))

    wells = []
    err_list = []

    for i, test_df in enumerate(file_list):
        # A well is identified by the value at row 0 / column 0 of its frame.
        wells.append(test_df.iloc[0, 0])
        print('%s : %s' %(i, wells[i]))

        # All wells except the held-out one form the training set.
        train_df = pd.concat(file_list[:i] + file_list[i + 1:])

        test_x = test_df[feature_cols].values
        test_y = test_df['NEUT'].values

        train_X = train_df[feature_cols].values
        train_y = train_df['NEUT'].values

        # Feature scaling. StandardScaler returns new arrays and leaves its
        # inputs untouched, so the results must be captured; discarding them
        # (as was done before) silently trains on unscaled data.
        scaler = StandardScaler()
        train_X_scaled = scaler.fit_transform(train_X)
        test_x_scaled = scaler.transform(test_x)

        # training
        mdl = clf()
        mdl.fit(train_X_scaled, train_y)

        # testing
        pred_y = mdl.predict(test_x_scaled)

        # R^2 of the held-out well (higher is better, despite the name `err`)
        err = r2_score(test_y, pred_y)
        err_list.append(err)

        # Save results next to the *unscaled* first feature column.
        save_res(test_x[:, 0], pred_y, err, wells[i], algo_name)

    print()

    avg_err = np.mean(err_list)

    for well, err in zip(wells, err_list):
        print('Test score on %s : %s' %(well, err))

    print()
    print('Average algorithm score: %s' %avg_err)


# Linear regression

In [5]:
from sklearn.linear_model import LinearRegression

# Reduced feature subset used for the linear model.
# NOTE(review): the source carried an unexplained "0.372" annotation here --
# presumably a score from an earlier experiment; confirm or drop.
feature_set = [
    'DEPTH',
    'CALI',
    'DENB',
    'RDEEP',
    'RMICRO',
]

cross_val(clf=LinearRegression, algo_name='Linear regression', feature_set=feature_set)

0 : 316.3824
1 : 335.4324
2 : 318.2112
3 : 268.3764
4 : 305.562
5 : 328.8792

Test score on 316.3824 : 0.996110547376185
Test score on 335.4324 : 0.9774568062439419
Test score on 318.2112 : 0.9834480676131058
Test score on 268.3764 : 0.7514095143061121
Test score on 305.562 : 0.999584912983053
Test score on 328.8792 : 0.5978585225272584

Average algorithm score: 0.8843113951749427


# Random Forest

In [6]:
from sklearn.ensemble import RandomForestRegressor

# A random forest is stochastic; without a fixed seed the scores below change
# on every run. cross_val() only needs a zero-argument factory, so the seed is
# bound here instead of passing the bare class.
cross_val(lambda: RandomForestRegressor(random_state=42), 'Random forrest')

0 : 316.3824




1 : 335.4324




2 : 318.2112




3 : 268.3764




4 : 305.562




5 : 328.8792





Test score on 316.3824 : 0.9993193267726772
Test score on 335.4324 : 0.9774198155900885
Test score on 318.2112 : 0.9869160262317125
Test score on 268.3764 : 0.7497278443631334
Test score on 305.562 : 0.977025448102083
Test score on 328.8792 : 0.6596773325413249

Average algorithm score: 0.8916809656001697


# Gradient Boosting

In [7]:
from lightgbm import LGBMRegressor

# Leave-one-well-out evaluation of LightGBM with the default feature set.
cross_val(clf=LGBMRegressor, algo_name='Gradient boosting')

0 : 316.3824
1 : 335.4324
2 : 318.2112
3 : 268.3764
4 : 305.562
5 : 328.8792

Test score on 316.3824 : 0.9979816351949283
Test score on 335.4324 : 0.9773096636945793
Test score on 318.2112 : 0.9885475227345948
Test score on 268.3764 : 0.7283395564012842
Test score on 305.562 : 0.9855857509113958
Test score on 328.8792 : -4.811721893305022

Average algorithm score: -0.022326294061373247


# SVR

In [None]:
from sklearn.svm import SVR

# Leave-one-well-out evaluation of a support vector regressor with the
# default feature set.
cross_val(clf=SVR, algo_name='Support vector machine')

0 : 316.3824




1 : 335.4324




2 : 318.2112




3 : 268.3764




# XGBoost

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import explained_variance_score, r2_score
from xgboost import plot_tree
import matplotlib.pyplot as plt


def cross_val(clf, algo_name, feature_set=['DEPTH','CALI', 'DENB', 'DRHO', 'GR','RDEEP', 'RMICRO', 'DTCOMP'],
              tree_figsize=(150, 100)):
    """Leave-one-well-out evaluation for XGBoost models, with a tree plot per fold.

    This definition replaces the ``cross_val`` defined earlier in the
    notebook. It runs the same procedure (hold out each DataFrame of the
    module-level ``file_list`` once, train on the rest, score with R^2, store
    the predictions through ``save_res``) and additionally renders a tree of
    every fitted model with ``xgboost.plot_tree``. It therefore only works
    with estimators that ``plot_tree`` accepts.

    Parameters
    ----------
    clf : callable
        Zero-argument factory (e.g. ``XGBRegressor``) returning an unfitted
        model that exposes ``fit(X, y)`` and ``predict(X)``.
    algo_name : str
        Label forwarded to ``save_res`` to name the result files.
    feature_set : list of str
        Columns used as model inputs. The list is copied, never mutated.
    tree_figsize : tuple of (float, float)
        Size in inches of each tree figure. The default is very large and
        can be lowered to save memory and rendering time.

    Raises
    ------
    ValueError
        If fewer than two wells are loaded, because a fold needs at least one
        training well and one test well.
    """
    feature_cols = list(feature_set)

    if len(file_list) < 2:
        raise ValueError(
            'cross_val needs at least two wells in file_list, got %d' % len(file_list))

    wells = []
    err_list = []

    for i, test_df in enumerate(file_list):
        # A well is identified by the value at row 0 / column 0 of its frame.
        wells.append(test_df.iloc[0, 0])
        print('%s : %s' %(i, wells[i]))

        # All wells except the held-out one form the training set.
        train_df = pd.concat(file_list[:i] + file_list[i + 1:])

        test_x = test_df[feature_cols].values
        test_y = test_df['NEUT'].values

        train_X = train_df[feature_cols].values
        train_y = train_df['NEUT'].values

        # Feature scaling. StandardScaler returns new arrays and leaves its
        # inputs untouched, so the results must be captured; discarding them
        # (as was done before) silently trains on unscaled data.
        scaler = StandardScaler()
        train_X_scaled = scaler.fit_transform(train_X)
        test_x_scaled = scaler.transform(test_x)

        # training
        mdl = clf()
        mdl.fit(train_X_scaled, train_y)

        # Size the figure *before* drawing: resizing after plt.show() acts on
        # a different (empty) figure. Split thresholds shown in the tree are
        # in standardised feature units.
        fig, ax = plt.subplots(figsize=tree_figsize)
        plot_tree(mdl, rankdir='LR', ax=ax)
#         fig.savefig('tree.png')
        plt.show()
        # One figure per fold: release it so memory does not accumulate.
        plt.close(fig)

        # testing
        pred_y = mdl.predict(test_x_scaled)

        # R^2 of the held-out well (higher is better, despite the name `err`)
        err = r2_score(test_y, pred_y)
        err_list.append(err)

        # Save results next to the *unscaled* first feature column.
        save_res(test_x[:, 0], pred_y, err, wells[i], algo_name)

    print()

    avg_err = np.mean(err_list)

    for well, err in zip(wells, err_list):
        print('Test score on %s : %s' %(well, err))

    print()
    print('Average algorithm score: %s' %avg_err)


In [None]:
from xgboost import XGBRegressor

# Leave-one-well-out evaluation of XGBoost with the default feature set.
cross_val(clf=XGBRegressor, algo_name='xgbr')