In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
data = pd.read_csv('facies_vectors_23.csv')
feature_names = ['GR_diff_up', 'ILD_log10_diff_up', 'DeltaPHI_diff_up', 'PHIND_diff_up', 'PE_diff_up', 'NM_M_diff_up', 'RELPOS_diff_up','GR_diff_down', 'ILD_log10_diff_down', 'DeltaPHI_diff_down', 'PHIND_diff_down', 'PE_diff_down', 'NM_M_diff_down', 'RELPOS_diff_down','GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
facies_names = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS', 'WS', 'D', 'PS', 'BS']
facies_colors = ['#F4D03F', '#F5B041','#DC7633','#6E2C00', '#1B4F72','#2E86C1', '#AED6F1', '#A569BD', '#196F3D']
data = data.fillna(data['PE'].mean())

In [4]:
def find_diff(row, well):
    if len(prev_depth_features[well]) == 0:
        prev_depth_features[well] = row.values[4:]
        return
    diff = row.values[4:] - prev_depth_features[well]
    prev_depth_features[well] = row.values[4:]
    return diff
data_well = dict()
data_well_inverse = dict()
prev_depth_features = dict()
new_data = pd.DataFrame()
prev_class= dict()
data_save = pd.DataFrame()
for well in set(data['Well Name']):
    prev_depth_features[well] = []
    prev_class[well] = []
    data_well[well] = data[data['Well Name'] == well]
    data_well[well] = data_well[well].sort_values(by=['Depth'])
    data_save = data_well[well].iloc[::-1]
    data_well[well]['diff_up'] = data_well[well].apply(lambda row: find_diff(row, well), axis=1)
    prev_depth_features[well] = []
    prev_class[well] = []
    data_save = data_save.apply(lambda row: find_diff(row, well), axis=1)
    data_well[well]['diff_down'] = data_save.iloc[::-1]
    data_well[well] = data_well[well].dropna()
    data_well[well]['GR_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][0], axis=1)
    data_well[well]['ILD_log10_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][1], axis=1)
    data_well[well]['DeltaPHI_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][2], axis=1)
    data_well[well]['PHIND_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][3], axis=1)
    data_well[well]['PE_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][4], axis=1)
    data_well[well]['NM_M_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][5], axis=1)
    data_well[well]['RELPOS_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][6], axis=1)
    data_well[well]['GR_diff_down'] = data_well[well].apply(lambda row: row['diff_down'][0], axis=1)
    data_well[well]['ILD_log10_diff_down'] = data_well[well].apply(lambda row: row['diff_down'][1], axis=1)
    data_well[well]['DeltaPHI_diff_down'] = data_well[well].apply(lambda row: row['diff_down'][2], axis=1)
    data_well[well]['PHIND_diff_down'] = data_well[well].apply(lambda row: row['diff_down'][3], axis=1)
    data_well[well]['PE_diff_down'] = data_well[well].apply(lambda row: row['diff_down'][4], axis=1)
    data_well[well]['NM_M_diff_down'] = data_well[well].apply(lambda row: row['diff_down'][5], axis=1)
    data_well[well]['RELPOS_diff_down'] = data_well[well].apply(lambda row: row['diff_down'][6], axis=1)
    new_data = pd.concat([new_data, data_well[well]])
    new_data = new_data.drop(['diff_up'], axis=1)
    new_data = new_data.drop(['diff_down'], axis=1)

In [5]:
def augment_features_window(X, N_neig):
    N_row = X.shape[0]
    N_feat = X.shape[1]
    X = np.vstack((np.zeros((N_neig, N_feat)),np.zeros((N_neig, N_feat)), X, np.zeros((N_neig, N_feat)),np.zeros((N_neig, N_feat))))
    X_aug = np.zeros((N_row, N_feat*(4*N_neig+1)))
    for r in np.arange(N_row) + N_neig:
        this_row = []
        for c in np.arange(-N_neig,N_neig+1):
            this_row = np.hstack((this_row, X[r+c]))
            if c != 0:
                this_row = np.hstack((this_row, (X[r] + X[r+c])/2))
        #print(len(this_row))
        X_aug[r-N_neig] = this_row

    return X_aug

def augment_features_gradient(X, depth):
    d_diff = np.diff(depth).reshape((-1, 1))
    d_diff[d_diff==0] = 0.001
    X_diff = np.diff(X, axis=0)
    X_grad = X_diff / d_diff
    X_grad = np.concatenate((X_grad, np.zeros((1, X_grad.shape[1]))))
    
    return X_grad

def augment_features(X, well, depth, N_neig=1):
    X_aug = np.zeros((X.shape[0], X.shape[1]*(4*N_neig+1)))
    for w in np.unique(well):
        w_idx = np.where(well == w)[0]
        X_aug_win = augment_features_window(X[w_idx, :], N_neig)
        #print(X_aug_win)
        #X_aug_grad = augment_features_gradient(X[w_idx, :], depth[w_idx])
        #print(X_aug_grad)
        X_aug[w_idx, :] = X_aug_win
        #X_aug[w_idx, :] = np.concatenate((X_aug_win, X_aug_grad), axis=1)
        
    return X_aug


In [6]:
import numpy as np
import xgboost as xgb
import math
from sklearn.preprocessing import OneHotEncoder

print('start running example to used customized objective function')

params = {'max_depth': 2, 'eta': 0.1, 'silent': 1,
          'objective': 'multi:softprob', 'num_class': 2}

num_round = 2
def my_softmax(preds, dtrain):
    labels = dtrain.get_label()
    labels_hot = OneHotEncoder(sparse=False, n_values=2).fit_transform(labels.reshape(-1, 1))
    grad = preds - labels_hot
    hess = preds * (1.0-preds)

    return grad.flatten(), hess.flatten()

start running example to used customized objective function


In [7]:
def number_of_outlier(y_res):
    outliers = 0
    if y_res[0] != y_res[1]:
        outliers += 1
    if y_res[-1] != y_res[-2]:
        outliers += 1
    for index in range(1,len(y_res)-1):
        if ((y_res[index] != y_res[index-1]) and (y_res[index] != y_res[index+1])):
            outliers += 1
    return outliers/len(y_res)
            

In [10]:
import numpy.random as random
test = dict()
train = dict()
acc = 0
wells = set(data['Well Name'])
for well in wells:
# well = 'SHRIMPLIN'
    print(well)
    test[well] = new_data[new_data['Well Name'] == well]
    train[well] = new_data[new_data['Well Name'] != well]
    X_train = train[well][feature_names].values 
    y_train = train[well]['Facies'].values 
    X_test = test[well][feature_names].values 
    y_test = test[well]['Facies'].values 
    well_train = train[well]['Well Name'].values
    well_test = test[well]['Well Name'].values
    depth_train = train[well]['Depth'].values
    depth_test = test[well]['Depth'].values    

    X_aug_train = augment_features(X_train,well_train,depth_train)
    X_aug_test = augment_features(X_test,well_test,depth_test)

    robust = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X_aug_train)
    X_train_robust = robust.transform(X_aug_train)
    X_test_robust = robust.transform(X_aug_test)

    scaler = StandardScaler().fit(X_train_robust)
    X_train_robust_norm = scaler.transform(X_train_robust)
    X_test_robust_norm = scaler.transform(X_test_robust)
    
    dtrain = xgb.DMatrix(X_train_robust_norm, label=y_train)
    dtest = xgb.DMatrix(X_test_robust_norm, label=y_test)
    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
    model = xgb.Booster(params, [dtrain])
    for _ in range(150):
        pred = model.predict(dtrain)
        g, h = my_softmax(pred, dtrain)
        model.boost(dtrain, g, h)
    yhat = model.predict(dtest)
    yhat_labels = np.argmax(yhat, axis=1)

    yhat_train = model.predict(dtrain)
    yhat_labels_train = np.argmax(yhat_train, axis=1)
    print(len(y_test), " ", len(yhat_labels))
    acc += f1_score(y_test, yhat_labels, average='micro')
    print(f1_score(y_test, yhat_labels, average='micro'))
    print("Score on train ", f1_score(y_train, yhat_labels_train , average='micro'))
    print("Outliers test", number_of_outlier(y_test))
    print("Outliers res", number_of_outlier(yhat_labels))
print('well, boosting of trees, ', acc/9)  

SHRIMPLIN
239   239
0.7238493723849372
Score on train  0.9412166780587833
Outliers test 0.0
Outliers res 0.05439330543933055
LUKE G U
244   244
0.7868852459016392
Score on train  0.9430727023319616
Outliers test 0.004098360655737705
Outliers res 0.01639344262295082
ALEXANDER D
206   206
0.8155339805825242
Score on train  0.9391711229946524
Outliers test 0.009708737864077669
Outliers res 0.04854368932038835
CROSS H CATTLE
187   187
0.6256684491978609
Score on train  0.9445544554455445
Outliers test 0.03208556149732621
Outliers res 0.0374331550802139
KIMZEY A
157   157
0.7770700636942676
Score on train  0.945631067961165
Outliers test 0.012738853503184714
Outliers res 0.05732484076433121
NEWBY
176   176
0.6818181818181818
Score on train  0.9416775884665793
Outliers test 0.017045454545454544
Outliers res 0.0
SHANKLE
204   204
0.6617647058823529
Score on train  0.937917222963952
Outliers test 0.0
Outliers res 0.049019607843137254
CHURCHMAN BIBLE
105   105
0.7904761904761904
Score on train 