In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
data = pd.read_csv('facies_vectors_0.csv')
feature_names = ['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS','GR_diff_up', 'ILD_log10_diff_up', 'DeltaPHI_diff_up', 'PHIND_diff_up', 'PE_diff_up', 'NM_M_diff_up', 'RELPOS_diff_up']
facies_names = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS', 'WS', 'D', 'PS', 'BS']
facies_colors = ['#F4D03F', '#F5B041','#DC7633','#6E2C00', '#1B4F72','#2E86C1', '#AED6F1', '#A569BD', '#196F3D']
data = data.fillna(data['PE'].mean())

In [4]:
def find_diff(row, well):
    if len(prev_depth_features[well]) == 0:
        prev_depth_features[well] = row.values[4:]
        return
    diff = row.values[4:] - prev_depth_features[well]
    prev_depth_features[well] = row.values[4:]
    return diff
data_well = dict()
data_well_inverse = dict()
prev_depth_features = dict()
new_data = pd.DataFrame()
prev_class= dict()
data_save = pd.DataFrame()
for well in set(data['Well Name']):
    prev_depth_features[well] = []
    prev_class[well] = []
    data_well[well] = data[data['Well Name'] == well]
    data_well[well] = data_well[well].sort_values(by=['Depth'])
    data_save = data_well[well].iloc[::-1]
    data_well[well]['diff_up'] = data_well[well].apply(lambda row: find_diff(row, well), axis=1)
    prev_depth_features[well] = []
    prev_class[well] = []

    data_well[well] = data_well[well].dropna()
    data_well[well]['GR_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][0], axis=1)
    data_well[well]['ILD_log10_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][1], axis=1)
    data_well[well]['DeltaPHI_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][2], axis=1)
    data_well[well]['PHIND_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][3], axis=1)
    data_well[well]['PE_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][4], axis=1)
    data_well[well]['NM_M_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][5], axis=1)
    data_well[well]['RELPOS_diff_up'] = data_well[well].apply(lambda row: row['diff_up'][6], axis=1)

    new_data = pd.concat([new_data, data_well[well]])
    new_data = new_data.drop(['diff_up'], axis=1)


In [5]:
def augment_features_window(X, N_neig):
    N_row = X.shape[0]
    N_feat = X.shape[1]
    X = np.vstack((np.zeros((N_neig, N_feat)),np.zeros((N_neig, N_feat)), X, np.zeros((N_neig, N_feat)),np.zeros((N_neig, N_feat))))
    X_aug = np.zeros((N_row, N_feat*(4*N_neig+1)))
    for r in np.arange(N_row) + N_neig:
        this_row = []
        for c in np.arange(-N_neig,N_neig+1):
            this_row = np.hstack((this_row, X[r+c]))
            if c != 0:
                this_row = np.hstack((this_row, (X[r] + X[r+c])/2))
        #print(len(this_row))
        X_aug[r-N_neig] = this_row

    return X_aug

def augment_features_gradient(X, depth):
    d_diff = np.diff(depth).reshape((-1, 1))
    d_diff[d_diff==0] = 0.001
    X_diff = np.diff(X, axis=0)
    X_grad = X_diff / d_diff
    X_grad = np.concatenate((X_grad, np.zeros((1, X_grad.shape[1]))))
    
    return X_grad

def augment_features(X, well, depth, N_neig=1):
    X_aug = np.zeros((X.shape[0], X.shape[1]*(4*N_neig+1)))
    for w in np.unique(well):
        w_idx = np.where(well == w)[0]
        X_aug_win = augment_features_window(X[w_idx, :], N_neig)
        #print(X_aug_win)
        #X_aug_grad = augment_features_gradient(X[w_idx, :], depth[w_idx])
        #print(X_aug_grad)
        X_aug[w_idx, :] = X_aug_win
        #X_aug[w_idx, :] = np.concatenate((X_aug_win, X_aug_grad), axis=1)
        
    return X_aug


In [6]:
def knows(y_pred):
    max1 = max(y_pred)
    id1 = np.where(y_pred == max1)[0][0]
    max2 = 0
    for index in range(len(y_pred)):
        if ((y_pred[index]>max2) and (index != id1)):
            max2 = y_pred[index]
            id2 = index
    max3 = 0
    for index in range(len(y_pred)):
        if ((y_pred[index]>max3) and (index != id1) and (index != id2)):
            max3 = y_pred[index]
            id3 = index
#     print(max1, " ", max2, " ", max3)
#     print(id1, " ", id2, " ", id3)
    if ((abs(max1 - max2) < 1/6*max2) and  (abs(max2 - max3) > 2*max3)):
#         print(y_pred)
        return [False, id1, id2]
    return [True]

In [7]:
import numpy as np
import xgboost as xgb
import math
from sklearn.preprocessing import OneHotEncoder

print('start running example to used customized objective function')

params = {'max_depth': 2, 'eta': 0.1, 'silent': 1,
          'objective': 'multi:softprob', 'num_class': 2}

num_round = 2
def my_softmax(preds, dtrain):
    labels = dtrain.get_label()
    labels_hot = OneHotEncoder(sparse=False, n_values=2).fit_transform(labels.reshape(-1, 1))
    grad = preds - labels_hot
    hess = preds * (1.0-preds)

    return grad.flatten(), hess.flatten()

start running example to used customized objective function


In [11]:
def most_similar(y_res, y_test):
    for index in range(len(y_res)):
        if (y_res[index] != y_test[index]):
            classes[y_res[index], y_test[index]] += 1
            

In [12]:
def find_sum_classes(y_test):
    for index in range(len(y_test)):
        sum_classes[y_test[index]] += 1

In [16]:
def similar_percent():
    for class1 in range(9):           
        for class2 in range(9):
            classes[class1, class2] = classes[class1, class2]/sum_classes[class1]

In [1]:
import numpy.random as random
test = dict()
train = dict()
acc = 0
wells = set(data['Well Name'])
classes = dict()
for class1 in range(9):
    for class2 in range (9):
        classes[class1, class2] = 0
sum_classes = dict()
for class1 in range(9):
    sum_classes[class1] = 0
group1 = [0, 8, 6, 5, 4]
for well in wells:
# well = 'SHRIMPLIN'
    print(well)
    test[well] = new_data[new_data['Well Name'] == well]
    train[well] = new_data[new_data['Well Name'] != well]
    X_train = train[well][feature_names].values 
    y_train = train[well]['Facies'].values 
    X_test = test[well][feature_names].values 
    y_test = test[well]['Facies'].values 
    well_train = train[well]['Well Name'].values
    well_test = test[well]['Well Name'].values
    depth_train = train[well]['Depth'].values
    depth_test = test[well]['Depth'].values    

    X_aug_train = augment_features(X_train,well_train,depth_train)
    X_aug_test = augment_features(X_test,well_test,depth_test)

    robust = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X_aug_train)
    X_train_robust = robust.transform(X_aug_train)
    X_test_robust = robust.transform(X_aug_test)

    scaler = StandardScaler().fit(X_train_robust)
    X_train_robust_norm = scaler.transform(X_train_robust)
    X_test_robust_norm = scaler.transform(X_test_robust)
    
    for index in range(len(y_train)):
        if 
    dtrain = xgb.DMatrix(X_train_robust_norm, label=y_train)
    dtest = xgb.DMatrix(X_test_robust_norm, label=y_test)
    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
    #     model = xgb.train(params, dtrain, 100)
    #     bst = xgb.train(param, dtrain, num_round, watchlist, obj=my_logregobj)
    model = xgb.Booster(params, [dtrain])
    for _ in range(150):
        pred = model.predict(dtrain)
        g, h = my_softmax(pred, dtrain)
        model.boost(dtrain, g, h)
    # Evalute
    yhat = model.predict(dtest)
    yhat_train = model.predict(dtrain)
    yhat_labels = np.argmax(yhat, axis=1)
    yhat_labels_train = np.argmax(yhat_train, axis=1)
    #     ypred = bst.predict(dtest)
    print(len(y_test), " ", len(yhat_labels))
    acc += f1_score(y_test, yhat_labels, average='micro')
    print(f1_score(y_test, yhat_labels, average='micro'))
    print("Score on train ", f1_score(y_train, yhat_labels_train , average='micro'))
    if (well != "Recruit F9"):
        print("Score 23 ", score23(yhat_labels, y_test) )
        acc_23 += score23(yhat_labels, y_test)
        print("Score 68 ", score68(yhat_labels, y_test) )
        acc_68 += score68(yhat_labels, y_test)
#     for index in range(len(y_test)):
#         print(y_test[index], yhat_labels[index])
    print("Outliers test", number_of_outlier(y_test))
    print("Outliers res", number_of_outlier(yhat_labels))
    most_similar(yhat_labels, y_test)
    find_sum_classes(y_test)
print('well, boosting of trees, ', acc/10) 
print('well, boosting of trees 23, ', acc_23/9) 
print('well, boosting of trees 23, ', acc_68/9) 
similar_percent()

IndentationError: expected an indented block (<ipython-input-1-368c1357b066>, line 39)

In [18]:
classes

{(0, 0): 0.0,
 (0, 1): 0.0,
 (0, 2): 0.005434782608695652,
 (0, 3): 0.0,
 (0, 4): 0.0,
 (0, 5): 0.03260869565217391,
 (0, 6): 0.03804347826086957,
 (0, 7): 0.005434782608695652,
 (0, 8): 0.2554347826086957,
 (1, 0): 0.003745318352059925,
 (1, 1): 0.0,
 (1, 2): 0.13857677902621723,
 (1, 3): 0.0299625468164794,
 (1, 4): 0.0,
 (1, 5): 0.0,
 (1, 6): 0.0,
 (1, 7): 0.0,
 (1, 8): 0.0,
 (2, 0): 0.0,
 (2, 1): 0.15474919957310565,
 (2, 2): 0.0,
 (2, 3): 0.2486659551760939,
 (2, 4): 0.0010672358591248667,
 (2, 5): 0.0064034151547491995,
 (2, 6): 0.0,
 (2, 7): 0.0,
 (2, 8): 0.004268943436499467,
 (3, 0): 0.0,
 (3, 1): 0.04774193548387097,
 (3, 2): 0.2645161290322581,
 (3, 3): 0.0,
 (3, 4): 0.005161290322580645,
 (3, 5): 0.0064516129032258064,
 (3, 6): 0.0025806451612903226,
 (3, 7): 0.0012903225806451613,
 (3, 8): 0.012903225806451613,
 (4, 0): 0.0,
 (4, 1): 0.0,
 (4, 2): 0.01107011070110701,
 (4, 3): 0.007380073800738007,
 (4, 4): 0.0,
 (4, 5): 0.16605166051660517,
 (4, 6): 0.2066420664206642,
 (