In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
data = pd.read_csv('facies_vectors_0.csv')
feature_names = ['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS','GR_diff_up', 'ILD_log10_diff_up', 'DeltaPHI_diff_up', 'PHIND_diff_up', 'PE_diff_up', 'NM_M_diff_up', 'RELPOS_diff_up']
facies_names = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS', 'WS', 'D', 'PS', 'BS']
facies_colors = ['#F4D03F', '#F5B041','#DC7633','#6E2C00', '#1B4F72','#2E86C1', '#AED6F1', '#A569BD', '#196F3D']
data = data.fillna(data['PE'].mean())

In [4]:
def find_diff(row, well):
    if len(prev_depth_features[well]) == 0:
        prev_depth_features[well] = row.values[4:]
        return
    diff = row.values[4:] - prev_depth_features[well]
    prev_depth_features[well] = row.values[4:]
    return diff
data_well = dict()
data_well_inverse = dict()
prev_depth_features = dict()
new_data = pd.DataFrame()
prev_class= dict()
data_save = pd.DataFrame()
for well in set(data['Well Name']):
    prev_depth_features[well] = []
    prev_class[well] = []
    data_well[well] = data[data['Well Name'] == well]
    data_well[well] = data_well[well].sort_values(by=['Depth'])
    data_save = data_well[well].iloc[::-1]
    data_well[well]['diff_up'] = data_well[well].apply(lambda row: find_diff(row, well), axis=1)
    prev_depth_features[well] = []
    prev_class[well] = []

    data_well[well] = data_well[well].dropna()
    data_well[well]['GR_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][0], axis=1)
    data_well[well]['ILD_log10_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][1], axis=1)
    data_well[well]['DeltaPHI_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][2], axis=1)
    data_well[well]['PHIND_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][3], axis=1)
    data_well[well]['PE_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][4], axis=1)
    data_well[well]['NM_M_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][5], axis=1)
    data_well[well]['RELPOS_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][6], axis=1)

    new_data = pd.concat([new_data, data_well[well]])
    new_data = new_data.drop(['diff_up'], axis=1)


In [5]:
def augment_features_window(X, N_neig):
    N_row = X.shape[0]
    N_feat = X.shape[1]
    X = np.vstack((np.zeros((N_neig, N_feat)),np.zeros((N_neig, N_feat)), X, np.zeros((N_neig, N_feat)),np.zeros((N_neig, N_feat))))
    X_aug = np.zeros((N_row, N_feat*(4*N_neig+1)))
    for r in np.arange(N_row) + N_neig:
        this_row = []
        for c in np.arange(-N_neig,N_neig+1):
            this_row = np.hstack((this_row, X[r+c]))
            if c != 0:
                this_row = np.hstack((this_row, (X[r] + X[r+c])/2))
        #print(len(this_row))
        X_aug[r-N_neig] = this_row

    return X_aug

def augment_features_gradient(X, depth):
    d_diff = np.diff(depth).reshape((-1, 1))
    d_diff[d_diff==0] = 0.001
    X_diff = np.diff(X, axis=0)
    X_grad = X_diff / d_diff
    X_grad = np.concatenate((X_grad, np.zeros((1, X_grad.shape[1]))))
    
    return X_grad

def augment_features(X, well, depth, N_neig=1):
    X_aug = np.zeros((X.shape[0], X.shape[1]*(4*N_neig+1)))
    for w in np.unique(well):
        w_idx = np.where(well == w)[0]
        X_aug_win = augment_features_window(X[w_idx, :], N_neig)
        #print(X_aug_win)
        #X_aug_grad = augment_features_gradient(X[w_idx, :], depth[w_idx])
        #print(X_aug_grad)
        X_aug[w_idx, :] = X_aug_win
        #X_aug[w_idx, :] = np.concatenate((X_aug_win, X_aug_grad), axis=1)
        
    return X_aug


In [6]:
def knows(y_pred):
    max1 = max(y_pred)
    id1 = np.where(y_pred == max1)[0][0]
    max2 = 0
    for index in range(len(y_pred)):
        if ((y_pred[index]>max2) and (index != id1)):
            max2 = y_pred[index]
            id2 = index
    max3 = 0
    for index in range(len(y_pred)):
        if ((y_pred[index]>max3) and (index != id1) and (index != id2)):
            max3 = y_pred[index]
            id3 = index
#     print(max1, " ", max2, " ", max3)
#     print(id1, " ", id2, " ", id3)
    if ((abs(max1 - max2) < 1/6*max2) and  (abs(max2 - max3) > 2*max3)):
#         print(y_pred)
        return [False, id1, id2]
    return [True]

In [7]:
import numpy as np
import xgboost as xgb
import math
from sklearn.preprocessing import OneHotEncoder

print('start running example to used customized objective function')

params = {'max_depth': 2, 'eta': 0.1, 'silent': 1,
          'objective': 'multi:softprob', 'num_class': 9}

num_round = 2
def my_softmax(preds, dtrain):
    labels = dtrain.get_label()
    labels_hot = OneHotEncoder(sparse=False, n_values=9).fit_transform(labels.reshape(-1, 1))
    grad = preds - labels_hot
    hess = preds * (1.0-preds)
    dtrain_len = dtrain.num_row()
    
    L = [[0]*9]*dtrain.num_row()
    H = [[0]*9]*dtrain.num_row()
    for object_ in range(1, dtrain_len-1):
        in_ = 0
        ind1 = labels[object_-1]
        ind2 = labels[object_+1]
        if (knows(preds[object_])[0] == False):
            ind_max1 = knows(preds[object_])[1]
            ind_max2 = knows(preds[object_])[2]
            if (((ind_max1 == ind1) or (ind_max2 == ind1)) and (ind_max1 != ind2) and (ind_max2 != ind2)):
                in_ += 1
#                 print("Только с ind1 совпадают")
                for index in range(9):
                    for class_ in range(9):
                        if (class_ != labels[object_-1]):
                            L[object_][index] = L[object_][index] - preds[object_][class_] * preds[object_][index]
                            H[object_][index] = H[object_][index] + 2 * preds[object_][index] * preds[object_][index] * preds[object_][class_] - preds[object_][index] * preds[object_][class_] 
                    H[object_][index] = H[object_][index] - 2 * preds[object_][index] * preds[object_][index] + preds[object_][index]
                    L[object_][index] = L[object_][index] + preds[object_][index]
#             Только с ind2 совпадают
            if (((ind_max1 == ind2) or (ind_max2 == ind2)) and (ind_max1 != ind1) and (ind_max2 != ind1)):
#                 print("Только с ind2 совпадают")
                in_ += 1
                for index in range(9):
                    for class_ in range(9):
                        if (class_ != labels[object_+1]):
                            L[object_][index] = L[object_][index] - preds[object_][class_] * preds[object_][index]
                            H[object_][index] = H[object_][index] + 2 * preds[object_][index] * preds[object_][index] * preds[object_][class_] - preds[object_][index] * preds[object_][class_] 
                    H[object_][index] = H[object_][index] - 2 * preds[object_][index] * preds[object_][index] + preds[object_][index]
                    L[object_][index] = L[object_][index] + preds[object_][index]
#             Оба совпадают
            if (((ind_max1 == ind2) or (ind_max2 == ind2)) and ((ind_max1 == ind1) or (ind_max2 == ind1))):
                if (max(preds[object_-1]) > max(preds[object_])):
                    if (max(preds[object_-1]) >= max(preds[object_+1])):
                        in_ += 1
#                         print("Оба совпадают предыдущий увереннее")
                        for index in range(9):
                            for class_ in range(9):
                                if (class_ != labels[object_-1]):
                                    L[object_][index] = L[object_][index] - preds[object_][class_] * preds[object_][index]
                                    H[object_][index] = H[object_][index] + 2 * preds[object_][index] * preds[object_][index] * preds[object_][class_] - preds[object_][index] * preds[object_][class_] 
                            H[object_][index] = H[object_][index] - 2 * preds[object_][index] * preds[object_][index] + preds[object_][index]
                            L[object_][index] = L[object_][index] + preds[object_][index]
                    else:
#                         print("Оба совпадают следующий увереннее")
                        in_ += 1
                        for index in range(9):
                            for class_ in range(9):
                                if (class_ != labels[object_+1]):
                                    L[object_][index] = L[object_][index] - preds[object_][class_] * preds[object_][index]
                                    H[object_][index] = H[object_][index] + 2 * preds[object_][index] * preds[object_][index] * preds[object_][class_] - preds[object_][index] * preds[object_][class_] 
                            H[object_][index] = H[object_][index] - 2 * preds[object_][index] * preds[object_][index] + preds[object_][index]
                            L[object_][index] = L[object_][index] + preds[object_][index]
#                 else:
#                     print("Оба совпадают текущий увереннее")
                        
        if (in_ > 1):
            print(in_, " больше 1")
#     print("grad ", grad[0])
#     print("L ", L[0])
    grad = grad - 0.001 *np.array(L)
#     print(in_1, " ", dtrain_len-2)
    hess = hess - 0.001*np.array(H)
    return grad.flatten(), hess.flatten()

start running example to used customized objective function


In [8]:
def number_of_outlier(y_res):
    outliers = 0
    if y_res[0] != y_res[1]:
        outliers += 1
    if y_res[-1] != y_res[-2]:
        outliers += 1
    for index in range(1,len(y_res)-1):
        if ((y_res[index] != y_res[index-1]) and (y_res[index] != y_res[index+1])):
            outliers += 1
    return outliers/len(y_res)
            

In [None]:
import numpy.random as random
test = dict()
train = dict()
acc = 0
wells = set(data['Well Name'])
for well in wells:
# well = 'SHRIMPLIN'
    print(well)
    test[well] = new_data[new_data['Well Name'] == well]
    train[well] = new_data[new_data['Well Name'] != well]
    X_train = train[well][feature_names].values 
    y_train = train[well]['Facies'].values 
    X_test = test[well][feature_names].values 
    y_test = test[well]['Facies'].values 
    well_train = train[well]['Well Name'].values
    well_test = test[well]['Well Name'].values
    depth_train = train[well]['Depth'].values
    depth_test = test[well]['Depth'].values    

    X_aug_train = augment_features(X_train,well_train,depth_train)
    X_aug_test = augment_features(X_test,well_test,depth_test)

    robust = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X_aug_train)
    X_train_robust = robust.transform(X_aug_train)
    X_test_robust = robust.transform(X_aug_test)

    scaler = StandardScaler().fit(X_train_robust)
    X_train_robust_norm = scaler.transform(X_train_robust)
    X_test_robust_norm = scaler.transform(X_test_robust)
    
    dtrain = xgb.DMatrix(X_train_robust_norm, label=y_train)
    dtest = xgb.DMatrix(X_test_robust_norm, label=y_test)
    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
    #     model = xgb.train(params, dtrain, 100)
    #     bst = xgb.train(param, dtrain, num_round, watchlist, obj=my_logregobj)
    model = xgb.Booster(params, [dtrain])
    for _ in range(150):
        pred = model.predict(dtrain)
        g, h = my_softmax(pred, dtrain)
        model.boost(dtrain, g, h)
    # Evalute
    yhat = model.predict(dtest)
    yhat_labels = np.argmax(yhat, axis=1)
    #     ypred = bst.predict(dtest)
    print(len(y_test), " ", len(yhat_labels))
    acc += f1_score(y_test, yhat_labels, average='micro')
    print(f1_score(y_test, yhat_labels, average='micro'))
#     for index in range(len(y_test)):
#         print(y_test[index], yhat_labels[index])
    print("Outliers test", number_of_outlier(y_test))
    print("Outliers res", number_of_outlier(yhat_labels))
print('well, boosting of trees, ', acc/10)  

KIMZEY A
438   438
0.5616438356164384
Outliers test 0.0319634703196347
Outliers res 0.04794520547945205
NEWBY
462   462
0.577922077922078
Outliers test 0.032467532467532464
Outliers res 0.03463203463203463
LUKE G U
460   460
0.6478260869565218
Outliers test 0.006521739130434782
Outliers res 0.058695652173913045
SHANKLE
448   448
0.5513392857142857
Outliers test 0.006696428571428571
Outliers res 0.08035714285714286
SHRIMPLIN
470   470
0.6595744680851063
Outliers test 0.002127659574468085
Outliers res 0.05106382978723404
CROSS H CATTLE
500   500
0.452
Outliers test 0.044
Outliers res 0.054
Recruit F9
79   79
0.7468354430379747
Outliers test 0.0
Outliers res 0.08860759493670886
CHURCHMAN BIBLE
