In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
data = pd.read_csv('facies_vectors_0.csv')
feature_names = ['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS','GR_diff_up', 'ILD_log10_diff_up', 'DeltaPHI_diff_up', 'PHIND_diff_up', 'PE_diff_up', 'NM_M_diff_up', 'RELPOS_diff_up']
feature_names_23 = ['GR_diff_up', 'ILD_log10_diff_up', 'DeltaPHI_diff_up', 'PHIND_diff_up', 'PE_diff_up', 'NM_M_diff_up', 'RELPOS_diff_up','GR_diff_down', 'ILD_log10_diff_down', 'DeltaPHI_diff_down', 'PHIND_diff_down', 'PE_diff_down', 'NM_M_diff_down', 'RELPOS_diff_down','GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
feature_names_original = ['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
data = data.fillna(data['PE'].mean())

In [4]:
def find_diff(row, well):
    if len(prev_depth_features[well]) == 0:
        prev_depth_features[well] = row.values[4:]
        return
    diff = row.values[4:] - prev_depth_features[well]
    prev_depth_features[well] = row.values[4:]
    return diff
data_well = dict()
data_well_inverse = dict()
prev_depth_features = dict()
new_data = pd.DataFrame()
prev_class= dict()
data_save = pd.DataFrame()
for well in set(data['Well Name']):
    prev_depth_features[well] = []
    prev_class[well] = []
    data_well[well] = data[data['Well Name'] == well]
    data_well[well] = data_well[well].sort_values(by=['Depth'])
    data_save = data_well[well].iloc[::-1]
    data_well[well]['diff_up'] = data_well[well].apply(lambda row: find_diff(row, well), axis=1)
    prev_depth_features[well] = []
    prev_class[well] = []

    data_well[well] = data_well[well].dropna()
    data_well[well]['GR_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][0], axis=1)
    data_well[well]['ILD_log10_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][1], axis=1)
    data_well[well]['DeltaPHI_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][2], axis=1)
    data_well[well]['PHIND_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][3], axis=1)
    data_well[well]['PE_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][4], axis=1)
    data_well[well]['NM_M_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][5], axis=1)
    data_well[well]['RELPOS_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][6], axis=1)

    new_data = pd.concat([new_data, data_well[well]])
    new_data = new_data.drop(['diff_up'], axis=1)

In [5]:
def augment_features_window(X, N_neig):
    N_row = X.shape[0]
    N_feat = X.shape[1]
    X = np.vstack((np.zeros((N_neig, N_feat)),np.zeros((N_neig, N_feat)), X, np.zeros((N_neig, N_feat)),np.zeros((N_neig, N_feat))))
    X_aug = np.zeros((N_row, N_feat*(4*N_neig+1)))
    for r in np.arange(N_row) + N_neig:
        this_row = []
        for c in np.arange(-N_neig,N_neig+1):
            this_row = np.hstack((this_row, X[r+c]))
            if c != 0:
                this_row = np.hstack((this_row, (X[r] + X[r+c])/2))
        #print(len(this_row))
        X_aug[r-N_neig] = this_row

    return X_aug

def augment_features_gradient(X, depth):
    d_diff = np.diff(depth).reshape((-1, 1))
    d_diff[d_diff==0] = 0.001
    X_diff = np.diff(X, axis=0)
    X_grad = X_diff / d_diff
    X_grad = np.concatenate((X_grad, np.zeros((1, X_grad.shape[1]))))
    
    return X_grad

def augment_features(X, well, depth, N_neig=1):
    X_aug = np.zeros((X.shape[0], X.shape[1]*(4*N_neig+1)))
    for w in np.unique(well):
        w_idx = np.where(well == w)[0]
        X_aug_win = augment_features_window(X[w_idx, :], N_neig)
        #print(X_aug_win)
        #X_aug_grad = augment_features_gradient(X[w_idx, :], depth[w_idx])
        #print(X_aug_grad)
        X_aug[w_idx, :] = X_aug_win
        #X_aug[w_idx, :] = np.concatenate((X_aug_win, X_aug_grad), axis=1)
        
    return X_aug


In [6]:
import numpy as np
import xgboost as xgb
import math
from sklearn.preprocessing import OneHotEncoder

print('start running example to used customized objective function')

params_pair = {'max_depth': 2, 'eta': 0.1, 'silent': 1,
          'objective': 'multi:softprob', 'num_class': 2}
params = {'max_depth': 2, 'eta': 0.1, 'silent': 1,
          'objective': 'multi:softprob', 'num_class': 9}

num_round = 2
def my_softmax(preds, dtrain):
    labels = dtrain.get_label()
    labels_hot = OneHotEncoder(sparse=False, n_values=9).fit_transform(labels.reshape(-1, 1))
    grad = preds - labels_hot
    hess = preds * (1.0-preds)

    return grad.flatten(), hess.flatten()
def my_softmax_pair(preds, dtrain):
    labels = dtrain.get_label()
    labels_hot = OneHotEncoder(sparse=False, n_values=2).fit_transform(labels.reshape(-1, 1))
    grad = preds - labels_hot
    hess = preds * (1.0-preds)

    return grad.flatten(), hess.flatten()

start running example to used customized objective function


In [7]:
def number_of_outlier(y_res):
    outliers = 0
    if y_res[0] != y_res[1]:
        outliers += 1
    if y_res[-1] != y_res[-2]:
        outliers += 1
    for index in range(1,len(y_res)-1):
        if ((y_res[index] != y_res[index-1]) and (y_res[index] != y_res[index+1])):
            outliers += 1
    return outliers/len(y_res)
            

In [8]:
def most_similar(y_res, y_test):
    for index in range(len(y_res)):
        if (y_res[index] != y_test[index]):
            classes[y_res[index], y_test[index]] += 1
            

In [9]:
def scorepair(y_pred, y_test, class1, class2):
    y_pred_pair = []
    y_test_pair = []
    for index in range(len(y_test)):
        if ((y_test[index] == class1) or (y_test[index] == class2)):
            y_test_pair.append(y_test[index])
            y_pred_pair.append(y_pred[index])
    return f1_score(y_test_pair, y_pred_pair , average='micro')

In [21]:
def foundclass(y_pred, y_test, class1):
    y_class = 0
    sum_class = 0
    for index in range(len(y_test)):
        if (y_test[index] == class1):
            sum_class += 1
            if (y_pred[index] == class1):
                y_class += 1
    if (sum_class == 0):
        return 0
    return y_class/sum_class

def falsefoundclass(y_pred, y_test, class1):
    y_class = 0
    sum_class = 0
    for index in range(len(y_test)):
        if (y_pred[index] == class1):
            sum_class += 1
        if (y_test[index] != class1 and y_pred[index] == class1):
            y_class += 1
    if (sum_class == 0):
        return 0
    return y_class/sum_class

def better(y_pred1, y_pred2, y_test):
    bet = 0
    for index in range(len(y_test)):
        if (y_pred1[index] != y_test[index] and y_pred2[index] == y_test[index]):
            bet +=1
    return bet

def worse(y_pred1, y_pred2, y_test):
    wor = 0
    for index in range(len(y_test)):
        if (y_pred1[index] == y_test[index] and y_pred2[index] != y_test[index]):
            wor +=1
    return wor

def change_sum(y_pred1, y_pred2):
    sum_ = 0
    for index in range(len(y_pred1)):
        if (y_pred1[index] != y_pred2[index]):
            sum_ += 1
    return sum_

In [22]:
import numpy.random as random
import copy
from sklearn.cluster import KMeans
test = dict()
train = dict()
test_all_pair= dict()
train_all_pair = dict()
acc_pair = 0
classes = dict()
for class1 in range(9):
    for class2 in range (9):
        classes[class1, class2] = 0
acc = 0
wells = set(data['Well Name'])
# wells_68 = set(new_data_68['Well Name'])
change = 0
for well in wells:
# well = 'SHRIMPLIN'
    print(well)
    test[well] = new_data[new_data['Well Name'] == well]
    train[well] = new_data[new_data['Well Name'] != well]
    X_train = train[well][feature_names].values 
    y_train = train[well]['Facies'].values 
    X_test = test[well][feature_names].values
#     X_test = X_test[0:-1]
    y_test = test[well]['Facies'].values 
#     y_test = y_test[0:-1]
    well_train = train[well]['Well Name'].values
    well_test = test[well]['Well Name'].values
#     well_test = well_test[0:-1]
    depth_train = train[well]['Depth'].values
    depth_test = test[well]['Depth'].values   
#     depth_test = depth_test[0:-1]
     
    X_aug_train = augment_features(X_train,well_train,depth_train)
    X_aug_test = augment_features(X_test,well_test,depth_test)

    robust = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X_aug_train)
    X_train_robust = robust.transform(X_aug_train)
    X_test_robust = robust.transform(X_aug_test)

    scaler = StandardScaler().fit(X_train_robust)
    X_train_robust_norm = scaler.transform(X_train_robust)
    X_test_robust_norm = scaler.transform(X_test_robust)
    
    dtrain = xgb.DMatrix(X_train_robust_norm, label=y_train)
    dtest = xgb.DMatrix(X_test_robust_norm, label=y_test)
    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
    model = xgb.Booster(params, [dtrain])
    for _ in range(150):
        pred = model.predict(dtrain)
        g, h = my_softmax(pred, dtrain)
        model.boost(dtrain, g, h)
    yhat = model.predict(dtest)
    yhat_labels = np.argmax(yhat, axis=1)
    print("BEFORE ", f1_score(y_test, yhat_labels, average='micro'))
    
    yhat_labels_prev = copy.deepcopy(yhat_labels)
    for class1 in range(8):
        for class2 in range(class1+1, 9):
            new_data_class1 = new_data[new_data['Facies'] == class1]
            new_data_class2 = new_data[new_data['Facies'] == class2]
            new_data_pair = pd.concat([new_data_class1, new_data_class2])
            wells_pair = set(new_data_pair['Well Name'])
            if (well in wells_pair):
                indeces_notpair = []
                for index in range(len(y_train)):
                    if ((y_train[index] != class1) and (y_train[index] != class2)):
                        indeces_notpair.append(index)
        
                X_train_robust_norm_pair = np.delete(X_train_robust_norm, indeces_notpair, 0)
                y_train_pair = np.delete(y_train, indeces_notpair)
        
                for index in range(len(y_train_pair)):
                    if (y_train_pair[index] == class1):
                        y_train_pair[index] = 0
                    if (y_train_pair[index] == class2):
                        y_train_pair[index] = 1
            
                indeces_notpair = []
                for index in range(len(yhat_labels)):
                    if ((yhat_labels[index] != class1) and (yhat_labels[index] != class2)):
                        indeces_notpair.append(index)
        
                X_test_robust_norm_pair = np.delete(X_test_robust_norm, indeces_notpair, 0)
                if (len(X_test_robust_norm_pair) == 0):
                    continue
                y_test_pair = np.delete(y_test, indeces_notpair)
        
                dtrain_pair = xgb.DMatrix(np.array(X_train_robust_norm_pair), label=np.array(y_train_pair))
                dtest_pair = xgb.DMatrix(np.array(X_test_robust_norm_pair), label=np.array(y_test_pair))
                watchlist = [(dtest_pair, 'eval'), (dtrain_pair, 'train')]
        
                model_pair = xgb.Booster(params_pair, [dtrain_pair])
                for _ in range(150):
                    pred = model_pair.predict(dtrain_pair)
                    g, h = my_softmax_pair(pred, dtrain_pair)
                    model_pair.boost(dtrain_pair, g, h)
        
                yhat_pair = model_pair.predict(dtest_pair)
                yhat_labels_pair = np.argmax(yhat_pair, axis=1)
        
                for index in range(len(yhat_labels_pair)):
                    if (yhat_labels_pair[index] == 0):
                        yhat_labels_pair[index] = class1
                    if (yhat_labels_pair[index] == 1):
                        yhat_labels_pair[index] = class2

                ind = 0
                for index in range(len(yhat_labels)):
                    if index not in indeces_notpair:
                        if ((yhat_labels[index] == class1) and (yhat_labels_pair[ind] == class2)):
                            change += 1
                        if ((yhat_labels[index] == class1) and (yhat_labels_pair[ind] == class2)):
                            change += 1
                        yhat_labels[index] = yhat_labels_pair[ind]
                        ind  += 1
                print("True detected ", class1, " ", foundclass(yhat_labels_prev, y_test, class1))
                print("False detected ", class1, " ",  falsefoundclass(yhat_labels_prev, y_test, class1))
                print("True detected ", class2, " ",  foundclass(yhat_labels_prev, y_test, class2))
                print("False detected ", class2, " ",  falsefoundclass(yhat_labels_prev, y_test, class2))
                print("Better results, id ", better(yhat_labels_prev, yhat_labels, y_test))
                print("Worse results, id ", worse(yhat_labels_prev, yhat_labels, y_test))
                print("Change ", change_sum(yhat_labels_prev, yhat_labels))
                print("AFTER ", class1, " ", class2, " ", f1_score(y_test, yhat_labels, average='micro'))
    acc += f1_score(y_test, yhat_labels, average='micro')
    print("AFTER ", f1_score(y_test, yhat_labels, average='micro'))
#     print("Score on train ", f1_score(y_train, yhat_labels_train , average='micro'))
#     print("Outliers test", number_of_outlier(y_test))
#     print("Outliers res", number_of_outlier(yhat_labels))
#     print("Change ", change)
    most_similar(yhat_labels, y_test)
print('well, boosting of trees, ', acc/10)  
print('well, boosting of trees 68, ', acc_68/9) 
print(change)

NEWBY
BEFORE  0.5865800865800865
True detected  0   0.0
False detected  0   0
True detected  2   0.7551020408163265
False detected  2   0.308411214953271
Better results, id  0
Worse results, id  0
Change  0
AFTER  0   2   0.5865800865800865
True detected  0   0.0
False detected  0   0
True detected  3   0.5822784810126582
False detected  3   0.3333333333333333
Better results, id  0
Worse results, id  0
Change  0
AFTER  0   3   0.5865800865800865
True detected  0   0.0
False detected  0   0
True detected  4   0.7068965517241379
False detected  4   0.18
Better results, id  0
Worse results, id  0
Change  0
AFTER  0   4   0.5865800865800865
True detected  0   0.0
False detected  0   0
True detected  5   0.03571428571428571
False detected  5   0.6666666666666666
Better results, id  0
Worse results, id  0
Change  0
AFTER  0   5   0.5865800865800865
True detected  0   0.0
False detected  0   0
True detected  6   0.5833333333333334
False detected  6   0.45098039215686275
Better results, id  0


True detected  0   1.0
False detected  0   0.5
True detected  4   0.6111111111111112
False detected  4   0.6071428571428571
Better results, id  0
Worse results, id  0
Change  0
AFTER  0   4   0.648936170212766
True detected  0   1.0
False detected  0   0.5
True detected  5   0.14285714285714285
False detected  5   0.18181818181818182
Better results, id  0
Worse results, id  0
Change  0
AFTER  0   5   0.648936170212766
True detected  0   1.0
False detected  0   0.5
True detected  6   0.7301587301587301
False detected  6   0.5
Better results, id  0
Worse results, id  1
Change  2
AFTER  0   6   0.6468085106382979
True detected  0   1.0
False detected  0   0.5
True detected  7   0.0
False detected  7   1.0
Better results, id  0
Worse results, id  1
Change  3
AFTER  0   7   0.6468085106382979
True detected  0   1.0
False detected  0   0.5
True detected  8   0.6811594202898551
False detected  8   0.3561643835616438
Better results, id  1
Worse results, id  1
Change  7
AFTER  0   8   0.6489361

True detected  0   0
False detected  0   0
True detected  7   0.35
False detected  7   0.3
Better results, id  0
Worse results, id  0
Change  0
AFTER  0   7   0.6413043478260869
True detected  0   0
False detected  0   0
True detected  8   0.7297297297297297
False detected  8   0.4
Better results, id  0
Worse results, id  0
Change  1
AFTER  0   8   0.6413043478260869
True detected  1   0
False detected  1   1.0
True detected  2   0.6837606837606838
False detected  2   0.30434782608695654
Better results, id  19
Worse results, id  0
Change  23
AFTER  1   2   0.6826086956521739
True detected  1   0
False detected  1   1.0
True detected  3   0.6953125
False detected  3   0.1834862385321101
Better results, id  19
Worse results, id  0
Change  23
AFTER  1   3   0.6826086956521739
True detected  1   0
False detected  1   1.0
True detected  4   0.8285714285714286
False detected  4   0.3695652173913043
Better results, id  19
Worse results, id  0
Change  23
AFTER  1   4   0.6826086956521739
True 

True detected  1   0.0
False detected  1   0
True detected  2   0.8235294117647058
False detected  2   0.32038834951456313
Better results, id  2
Worse results, id  0
Change  7
AFTER  1   2   0.5639269406392694
True detected  1   0.0
False detected  1   0
True detected  3   0.6216216216216216
False detected  3   0.28125
Better results, id  2
Worse results, id  0
Change  7
AFTER  1   3   0.5639269406392694
True detected  1   0.0
False detected  1   0
True detected  4   0.5813953488372093
False detected  4   0.2857142857142857
Better results, id  2
Worse results, id  0
Change  7
AFTER  1   4   0.5639269406392694
True detected  1   0.0
False detected  1   0
True detected  5   0.018867924528301886
False detected  5   0.6666666666666666
Better results, id  2
Worse results, id  0
Change  7
AFTER  1   5   0.5639269406392694
True detected  1   0.0
False detected  1   0
True detected  6   0.6274509803921569
False detected  6   0.7333333333333333
Better results, id  2
Worse results, id  0
Change 

True detected  0   0.9
False detected  0   0.021739130434782608
True detected  2   0.5892857142857143
False detected  2   0.2978723404255319
Better results, id  0
Worse results, id  0
Change  0
AFTER  0   2   0.5930521091811415
True detected  0   0.9
False detected  0   0.021739130434782608
True detected  3   0.76
False detected  3   0.4153846153846154
Better results, id  0
Worse results, id  0
Change  0
AFTER  0   3   0.5930521091811415
True detected  0   0.9
False detected  0   0.021739130434782608
True detected  4   0.9230769230769231
False detected  4   0.6470588235294118
Better results, id  0
Worse results, id  0
Change  0
AFTER  0   4   0.5930521091811415
True detected  0   0.9
False detected  0   0.021739130434782608
True detected  5   0.4666666666666667
False detected  5   0.7358490566037735
Better results, id  0
Worse results, id  0
Change  1
AFTER  0   5   0.5930521091811415
True detected  0   0.9
False detected  0   0.021739130434782608
True detected  6   0.6091954022988506


True detected  0   0
False detected  0   0
True detected  2   0.5319148936170213
False detected  2   0.5341614906832298
Better results, id  0
Worse results, id  0
Change  0
AFTER  0   2   0.452
True detected  0   0
False detected  0   0
True detected  3   0.6808510638297872
False detected  3   0.7593984962406015
Better results, id  0
Worse results, id  0
Change  0
AFTER  0   3   0.452
True detected  0   0
False detected  0   0
True detected  4   0.08
False detected  4   0.5
Better results, id  0
Worse results, id  0
Change  0
AFTER  0   4   0.452
True detected  0   0
False detected  0   0
True detected  5   0.0
False detected  5   1.0
Better results, id  0
Worse results, id  0
Change  0
AFTER  0   5   0.452
True detected  0   0
False detected  0   0
True detected  6   0.6451612903225806
False detected  6   0.7222222222222222
Better results, id  0
Worse results, id  0
Change  0
AFTER  0   6   0.452
True detected  0   0
False detected  0   0
True detected  7   0.5
False detected  7   0.6

True detected  0   0
False detected  0   0
True detected  5   0.0
False detected  5   1.0
Better results, id  0
Worse results, id  0
Change  0
AFTER  0   5   0.5803571428571429
True detected  0   0
False detected  0   0
True detected  6   0.704225352112676
False detected  6   0.21875
Better results, id  0
Worse results, id  1
Change  1
AFTER  0   6   0.578125
True detected  0   0
False detected  0   0
True detected  7   0.8823529411764706
False detected  7   0.0625
Better results, id  0
Worse results, id  1
Change  1
AFTER  0   7   0.578125
True detected  0   0
False detected  0   0
True detected  8   0.8
False detected  8   0.3469387755102041
Better results, id  0
Worse results, id  1
Change  1
AFTER  0   8   0.578125
True detected  1   0.33707865168539325
False detected  1   0.25
True detected  2   0.8295454545454546
False detected  2   0.6054054054054054
Better results, id  5
Worse results, id  31
Change  41
AFTER  1   2   0.5223214285714286
True detected  1   0.33707865168539325
Fa

True detected  0   0
False detected  0   1.0
True detected  8   0.49137931034482757
False detected  8   0.2875
Better results, id  1
Worse results, id  1
Change  11
AFTER  0   8   0.5410628019323671
True detected  1   0.0
False detected  1   1.0
True detected  2   0.8632478632478633
False detected  2   0.24060150375939848
Better results, id  4
Worse results, id  1
Change  14
AFTER  1   2   0.5483091787439613
True detected  1   0.0
False detected  1   1.0
True detected  3   0.6176470588235294
False detected  3   0.288135593220339
Better results, id  4
Worse results, id  1
Change  14
AFTER  1   3   0.5483091787439613
True detected  1   0.0
False detected  1   1.0
True detected  4   0.10714285714285714
False detected  4   0.25
Better results, id  4
Worse results, id  1
Change  14
AFTER  1   4   0.5483091787439613
True detected  1   0.0
False detected  1   1.0
True detected  5   0.0851063829787234
False detected  5   0.75
Better results, id  4
Worse results, id  1
Change  14
AFTER  1   5  

True detected  1   0
False detected  1   0
True detected  4   0.8863636363636364
False detected  4   0.4264705882352941
Better results, id  1
Worse results, id  1
Change  6
AFTER  1   4   0.6365591397849463
True detected  1   0
False detected  1   0
True detected  5   0.46153846153846156
False detected  5   0.7735849056603774
Better results, id  1
Worse results, id  1
Change  6
AFTER  1   5   0.6365591397849463
True detected  1   0
False detected  1   0
True detected  6   0.028985507246376812
False detected  6   0.8
Better results, id  1
Worse results, id  1
Change  6
AFTER  1   6   0.6365591397849463
True detected  1   0
False detected  1   0
True detected  7   0.5625
False detected  7   0.71875
Better results, id  1
Worse results, id  1
Change  6
AFTER  1   7   0.6365591397849463
True detected  1   0
False detected  1   0
True detected  8   0.673469387755102
False detected  8   0.31958762886597936
Better results, id  1
Worse results, id  1
Change  6
AFTER  1   8   0.6365591397849463


NameError: name 'acc_68' is not defined