# Facies classification using Machine Learning #
## LA Team Submission 5 ## 
### _[Lukas Mosser](https://at.linkedin.com/in/lukas-mosser-9948b32b/en), [Alfredo De la Fuente](https://pe.linkedin.com/in/alfredodelafuenteb)_ ####

In this approach for solving the facies classfication problem ( https://github.com/seg/2016-ml-contest. ) we will explore the following statregies:
- Features Exploration: based on [Paolo Bestagini's work](https://github.com/seg/2016-ml-contest/blob/master/ispl/facies_classification_try02.ipynb), we will consider imputation, normalization and augmentation routines for the initial features.
- Model tuning: 

## Libraries

We will need to install the following libraries and packages.

In [1]:
# %%sh
# pip install pandas
# pip install scikit-learn
# pip install tpot

In [2]:
from __future__ import print_function
import numpy as np
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold , StratifiedKFold
# from classification_utilities import display_cm, display_adj_cm
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import LeavePGroupsOut
from sklearn.multiclass import OneVsOneClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy.signal import medfilt

from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.utils import to_categorical
import numpy as np

Using TensorFlow backend.
  return f(*args, **kwds)


## Data Preprocessing

In [3]:
#Load Data
data = pd.read_csv('../facies_vectors.csv')
# data = pd.read_csv('../ShiangYong/facies_vectors_imputedPE.csv')
# Parameters
feature_names = ['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
facies_names = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS', 'WS', 'D', 'PS', 'BS']
facies_colors = ['#F4D03F', '#F5B041','#DC7633','#6E2C00', '#1B4F72','#2E86C1', '#AED6F1', '#A569BD', '#196F3D']

# data.dropna(inplace=True)
# Store features and labels
X = data[feature_names].values 
y = data['Facies'].values 

# Store well labels and depths
well = data['Well Name'].values
depth = data['Depth'].values
# X = np.array(pd.DataFrame(X).dropna())
# Fill 'PE' missing values with mean
imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(X)
X = imp.transform(X)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4149 entries, 0 to 4148
Data columns (total 11 columns):
Facies       4149 non-null int64
Formation    4149 non-null object
Well Name    4149 non-null object
Depth        4149 non-null float64
GR           4149 non-null float64
ILD_log10    4149 non-null float64
DeltaPHI     4149 non-null float64
PHIND        4149 non-null float64
PE           3232 non-null float64
NM_M         4149 non-null int64
RELPOS       4149 non-null float64
dtypes: float64(7), int64(2), object(2)
memory usage: 356.6+ KB


We procceed to run [Paolo Bestagini's routine](https://github.com/seg/2016-ml-contest/blob/master/ispl/facies_classification_try02.ipynb) to include a small window of values to acount for the spatial component in the log analysis, as well as the gradient information with respect to depth. This will be our prepared training dataset.

In [5]:
# Feature windows concatenation function
def augment_features_window(X, N_neig):
    
    # Parameters
    N_row = X.shape[0]
    N_feat = X.shape[1]

    # Zero padding
    X = np.vstack((np.zeros((N_neig, N_feat)), X, (np.zeros((N_neig, N_feat)))))

    # Loop over windows
    X_aug = np.zeros((N_row, N_feat*(2*N_neig+1)))
    for r in np.arange(N_row)+N_neig:
        this_row = []
        for c in np.arange(-N_neig,N_neig+1):
            this_row = np.hstack((this_row, X[r+c]))
        X_aug[r-N_neig] = this_row

    return X_aug


# Feature gradient computation function
def augment_features_gradient(X, depth):
    
    # Compute features gradient
    d_diff = np.diff(depth).reshape((-1, 1))
    d_diff[d_diff==0] = 0.001
    X_diff = np.diff(X, axis=0)
    X_grad = X_diff / d_diff
        
    # Compensate for last missing value
    X_grad = np.concatenate((X_grad, np.zeros((1, X_grad.shape[1]))))
    
    return X_grad


# Feature augmentation function
def augment_features(X, well, depth, N_neig=1):
    
    # Augment features
    X_aug = np.zeros((X.shape[0], X.shape[1]*(N_neig*2+2)))
    for w in np.unique(well):
        w_idx = np.where(well == w)[0]
        X_aug_win = augment_features_window(X[w_idx, :], N_neig)
        X_aug_grad = augment_features_gradient(X[w_idx, :], depth[w_idx])
        X_aug[w_idx, :] = np.concatenate((X_aug_win, X_aug_grad), axis=1)
    
    # Find padded rows
    padded_rows = np.unique(np.where(X_aug[:, 0:7] == np.zeros((1, 7)))[0])
    
    return X_aug, padded_rows

In [6]:
X_aug, padded_rows = augment_features(X, well, depth)

In [7]:
# # Initialize model selection methods
# lpgo = LeavePGroupsOut(2)

# # Generate splits
# split_list = []
# for train, val in lpgo.split(X, y, groups=data['Well Name']):
#     hist_tr = np.histogram(y[train], bins=np.arange(len(facies_names)+1)+.5)
#     hist_val = np.histogram(y[val], bins=np.arange(len(facies_names)+1)+.5)
#     if np.all(hist_tr[0] != 0) & np.all(hist_val[0] != 0):
#         split_list.append({'train':train, 'val':val})

In [8]:
def preprocess():
    
    # Preprocess data to use in model
    X_train_aux = []
    X_test_aux = []
    y_train_aux = []
    y_test_aux = []
    
    # For each data split
    split = split_list[5]
        
    # Remove padded rows
    split_train_no_pad = np.setdiff1d(split['train'], padded_rows)

    # Select training and validation data from current split
    X_tr = X_aug[split_train_no_pad, :]
    X_v = X_aug[split['val'], :]
    y_tr = y[split_train_no_pad]
    y_v = y[split['val']]

    # Select well labels for validation data
    well_v = well[split['val']]

    # Feature normalization
    scaler = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X_tr)
    X_tr = scaler.transform(X_tr)
    X_v = scaler.transform(X_v)
        
    X_train_aux.append( X_tr )
    X_test_aux.append( X_v )
    y_train_aux.append( y_tr )
    y_test_aux.append (  y_v )
    
    X_train = np.concatenate( X_train_aux )
    X_test = np.concatenate ( X_test_aux )
    y_train = np.concatenate ( y_train_aux )
    y_test = np.concatenate ( y_test_aux )
    
    return X_train , X_test , y_train , y_test 

## Data Analysis

In this section we will run a Cross Validation routine 

In [9]:
# from tpot import TPOTClassifier
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = preprocess()

# tpot = TPOTClassifier(generations=5, population_size=20, 
#                       verbosity=2,max_eval_time_mins=20,
#                       max_time_mins=100,scoring='f1_micro',
#                       random_state = 17)
# tpot.fit(X_train, y_train)
# print(tpot.score(X_test, y_test))
# tpot.export('FinalPipeline.py')

In [10]:
from sklearn.ensemble import  RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
import xgboost as xgb
from xgboost.sklearn import  XGBClassifier



In [11]:
# Train and test a classifier

# Pass in the classifier so we can iterate over many seed later.
def train_and_test(X_tr, y_tr, X_v, well_v, clf):
    
    # Feature normalization
    scaler = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X_tr)
    X_tr = scaler.transform(X_tr)
    X_v = scaler.transform(X_v)
    
    clf.fit(X_tr, y_tr)
    
    # Test classifier
    y_v_hat = clf.predict(X_v)
    
    # Clean isolated facies for each well
    for w in np.unique(well_v):
        y_v_hat[well_v==w] = medfilt(y_v_hat[well_v==w], kernel_size=5)
    
    return y_v_hat

In [12]:
# Train and test a classifier

# Pass in the classifier so we can iterate over many seed later.
def train_and_test_non_validation(X_tr, y_tr, X_v, well_v, clf):
    
    # Feature normalization
    scaler = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X_tr)
    X_tr = scaler.transform(X_tr)
    X_v = scaler.transform(X_v)
    
    clf.fit(X_tr, y_tr)
    
    # Test classifier
    y_v_hat = clf.predict(X_v)
    
    # Clean isolated facies for each well
#     for w in np.unique(well_v):
#         y_v_hat[well_v==w] = medfilt(y_v_hat[well_v==w], kernel_size=5)
    
    return y_v_hat

## Prediction

In [13]:
#Load testing data
test_data = pd.read_csv('../validation_data_nofacies.csv')

    # Train classifier
    #clf = make_pipeline(make_union(VotingClassifier([("est", ExtraTreesClassifier(criterion="gini", max_features=1.0, n_estimators=500))]), FunctionTransformer(lambda X: X)), XGBClassifier(learning_rate=0.73, max_depth=10, min_child_weight=10, n_estimators=500, subsample=0.27))
    #clf =  make_pipeline( KNeighborsClassifier(n_neighbors=5, weights="distance") ) 
    #clf = make_pipeline(MaxAbsScaler(),make_union(VotingClassifier([("est", RandomForestClassifier(n_estimators=500))]), FunctionTransformer(lambda X: X)),ExtraTreesClassifier(criterion="entropy", max_features=0.0001, n_estimators=500))
    # * clf = make_pipeline( make_union(VotingClassifier([("est", BernoulliNB(alpha=60.0, binarize=0.26, fit_prior=True))]), FunctionTransformer(lambda X: X)),RandomForestClassifier(n_estimators=500))

# # Prepare training data
# X_tr = X
# y_tr = y

# # Augment features
# X_tr, padded_rows = augment_features(X_tr, well, depth)

# # Removed padded rows
# X_tr = np.delete(X_tr, padded_rows, axis=0)
# y_tr = np.delete(y_tr, padded_rows, axis=0) 

# Prepare test data
well_ts = test_data['Well Name'].values
depth_ts = test_data['Depth'].values
X_ts = test_data[feature_names].values


    
y_pred = []
print('.' * 100)
for seed in range(3):
    np.random.seed(seed)
    
    # Hold out two wells
    ind_shk = np.array(data[data['Well Name']=='SHANKLE'].index)
    ind_chr = np.array(data[data['Well Name']=='CHURCHMAN BIBLE'].index)
    ind_ho_set = np.append(ind_shk,ind_chr)

    # Make training data.
    X_train, padded_rows = augment_features(X, well, depth)
    y_train = y
    X_test_nv = np.take(X_train, ind_ho_set, axis=0) 
    y_test_nv = np.take(y_train, ind_ho_set, axis=0) 
    X_train_nv = np.delete(X_train, ind_ho_set, axis=0)
    y_train_nv = np.delete(y_train, ind_ho_set, axis=0)
    
    X_train_nv = np.delete(X_train_nv, padded_rows, axis=0)
    y_train_nv = np.delete(y_train_nv, padded_rows, axis=0) 
    
    X_test_nv = np.delete(X_test_nv, padded_rows, axis=0)
    y_test_nv = np.delete(y_test_nv, padded_rows, axis=0)

    # Train classifier  
    clf = make_pipeline(XGBClassifier(learning_rate=0.12,
                                      max_depth=3,
                                      min_child_weight=10,
                                      n_estimators=150,
                                      seed=seed,
                                      colsample_bytree=0.9))

    
    
    # Make blind data.
#     X_test, _ = augment_features(X_ts, well_ts, depth_ts)
    

    # Train and test.
#     y_ts_hat = train_and_test(X_train, y_train, X_test, well_ts, clf)
    
    # Collect result.
#     y_pred.append(y_ts_hat)
#     print('|', end='')
    
# np.save('LA_Team_100_realizations.npy', y_pred)

....................................................................................................


In [14]:
set(padded_rows) & set(ind_ho_set)

{937, 3745, 3784}

In [15]:
padded_rows.shape

(18,)

In [16]:
X_test_nv.shape

(850, 28)

In [17]:
ind_ho_set.shape

(853,)

In [18]:
y_train.shape

(4149,)

In [19]:
# X_train_nv, X_test_nv, y_train_nv, y_test_nv = train_test_split(X_train, y_train, test_size=0.3, random_state=42)
# np.delete(X_train, ind_ho_set, axis=0)



In [20]:
y_pred = train_and_test_non_validation(X_train_nv, y_train_nv, X_test_nv, well_ts, clf)

In [21]:
correct = 0
tot = 0
for i, entry in enumerate(confusion_matrix(y_pred,y_test_nv)):
    for j, e in enumerate(entry):
        if i == j:
            correct += e
        tot += e
print(correct/tot)

0.571764705882


In [22]:
f1_score(y_pred,y_test_nv, average='micro')

0.57176470588235295

In [23]:
accuracy_score(y_pred,y_test_nv)

0.57176470588235295

### attempt at using LSTM for including influence of previous features

In [216]:
# Feature windows concatenation function
def augment_features_window(X, N_neig):
    
    # Parameters
    N_row = X.shape[0]
    N_feat = X.shape[1]

    # Zero padding
    X = np.vstack((np.zeros((N_neig, N_feat)), X, (np.zeros((N_neig, N_feat)))))

    # Loop over windows
    X_aug = np.zeros((N_row, (2*N_neig+1), N_feat))
    for r in np.arange(N_row)+N_neig:
        this_row = []
        for c in np.arange(-N_neig,N_neig+1):
            this_row = np.hstack((this_row, X[r+c]))
#         print(this_row.shape)
        this_row.shape = ((2*N_neig+1), this_row.size // (2*N_neig+1))
#         print(this_row)
        X_aug[r-N_neig] = this_row

    return X_aug


# Feature augmentation function
def augment_features(X, well, depth, N_neig=6):
    
    # Augment features
    X_aug = np.zeros((X.shape[0], (N_neig*2+1), X.shape[1]))
    for w in np.unique(well):
        w_idx = np.where(well == w)[0]
        X_aug_win = augment_features_window(X[w_idx, :], N_neig)
#         X_aug_grad = augment_features_gradient(X[w_idx, :], depth[w_idx])
        X_aug[w_idx, :] = X_aug_win
    
    # Find padded rows
    padded_rows = np.unique(np.where(X_aug[:, 0:7] == np.zeros((1, 7)))[0])
    
    return X_aug, padded_rows

In [25]:
# model.save('LSTM_acc_74.h5', overwrite=True)
# y_pred = model.predict(X_test_nv_LSTM, batch_size=20, verbose=0)
# y_test_LSTM_ct.shape
# predicted_classes = np.argmax(y_pred, axis=1)
# class_labels = np.argmax(y_test_LSTM_ct, axis=1)
# f1_score(predicted_classes,class_labels, average='micro') # micro is the same as accuracy in this problem

### Selection to apply LSTM w held out well 

In [26]:
#Load Data
data = pd.read_csv('../facies_vectors.csv')
# data = pd.read_csv('../ShiangYong/facies_vectors_imputedPE.csv')
# Parameters
feature_names = ['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
facies_names = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS', 'WS', 'D', 'PS', 'BS']
facies_colors = ['#F4D03F', '#F5B041','#DC7633','#6E2C00', '#1B4F72','#2E86C1', '#AED6F1', '#A569BD', '#196F3D']

# data.dropna(inplace=True)
# Store features and labels
X = data[feature_names].values 
y = data['Facies'].values 

# Store well labels and depths
well = data['Well Name'].values
depth = data['Depth'].values
# X = np.array(pd.DataFrame(X).dropna())
# Fill 'PE' missing values with mean
imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(X)
X = imp.transform(X)

# NEEDS TO BE CHANGED, SCALING SHOULD NOT BE DETERMINED FROM TESTING AND TRAINING SET, ONLY TRAINING
# scaler = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X)
# X = scaler.transform(X)

scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)
# X_test_nv_LSTM = scaler.transform(X)

Get indicies of wells that will make up hold out set

In [27]:
X_train, padded_rows = augment_features(X, well, depth)
y_train = y
X_test_LSTM = np.take(X_train, ind_ho_set, axis=0) 
y_test_LSTM = np.take(y_train, ind_ho_set, axis=0) 
X_train_LSTM = np.delete(X_train, ind_ho_set, axis=0)
y_train_LSTM = np.delete(y_train, ind_ho_set, axis=0)

In [28]:
X_train.shape

(4149, 25, 7)

In [29]:
X_test_LSTM = X_test_LSTM[0:840]
y_test_LSTM = y_test_LSTM[0:840]
X_train_LSTM = X_train_LSTM[0:3280]
y_train_LSTM = y_train_LSTM[0:3280]

In [30]:
# X_train_nv_LSTM, X_test_nv_LSTM, y_train_nv_LSTM, y_test_nv_LSTM = train_test_split(X_train[0:4000], y_train[0:4000], test_size=0.3, random_state=42)
y_train_LSTM = y_train_LSTM - 1
y_test_LSTM = y_test_LSTM - 1
y_train_LSTM_ct = to_categorical(np.array(y_train_LSTM), num_classes=None)
y_test_LSTM_ct = to_categorical(np.array(y_test_LSTM), num_classes=None)

In [None]:
'''Trains an LSTM model on the IMDB sentiment classification task.
The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF + LogReg.
# Notes
- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.
- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''




data_dim = 7 # Features
timesteps = 25 # 25 is best so far observed
num_classes = 9
batch_size = 20 # 20 is best so far observed

# Expected input batch shape: (batch_size, timesteps, data_dim)
# Note that we have to provide the full batch_input_shape since the network is stateful.
# the sample of index i in batch k is the follow-up for the sample i in batch k-1.
model = Sequential()


model.add(LSTM(40, stateful=True, return_sequences=True,
               batch_input_shape=(batch_size, timesteps, data_dim)))

# model.add(LSTM(40, return_sequences=True, stateful=True))

model.add(LSTM(40, stateful=True))
model.add(Dense(9, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='Nadam',
              metrics=['accuracy'])

model.fit(X_train_LSTM, y_train_LSTM_ct,
          batch_size=batch_size, epochs=30, shuffle=False,
          validation_data=(X_test_LSTM, y_test_LSTM_ct))

### Selection to apply LSTM w held out wells from competition

In [31]:
#Load Data
data = pd.read_csv('../facies_vectors.csv')
data_valid_labels = pd.read_csv('../blind_stuart_crawford_core_facies.csv')
data_valid = pd.read_csv('../validation_data_nofacies.csv')

In [32]:
data.head()

Unnamed: 0,Facies,Formation,Well Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS
0,3,A1 SH,SHRIMPLIN,2793.0,77.45,0.664,9.9,11.915,4.6,1,1.0
1,3,A1 SH,SHRIMPLIN,2793.5,78.26,0.661,14.2,12.565,4.1,1,0.979
2,3,A1 SH,SHRIMPLIN,2794.0,79.05,0.658,14.8,13.05,3.6,1,0.957
3,3,A1 SH,SHRIMPLIN,2794.5,86.1,0.655,13.9,13.115,3.5,1,0.936
4,3,A1 SH,SHRIMPLIN,2795.0,74.58,0.647,13.5,13.3,3.4,1,0.915


In [33]:
data_valid.head()

Unnamed: 0,Formation,Well Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS
0,A1 SH,STUART,2808.0,66.276,0.63,3.3,10.65,3.591,1,1.0
1,A1 SH,STUART,2808.5,77.252,0.585,6.5,11.95,3.341,1,0.978
2,A1 SH,STUART,2809.0,82.899,0.566,9.4,13.6,3.064,1,0.956
3,A1 SH,STUART,2809.5,80.671,0.593,9.5,13.25,2.977,1,0.933
4,A1 SH,STUART,2810.0,75.971,0.638,8.7,12.35,3.02,1,0.911


In [34]:
data_valid_labels.head()

Unnamed: 0,WellName,Depth.ft,LithCode,LithLabel
0,STUART,2807.5,3,NM Shly Silt
1,STUART,2808.0,3,NM Shly Silt
2,STUART,2808.5,3,NM Shly Silt
3,STUART,2809.0,3,NM Shly Silt
4,STUART,2809.5,3,NM Shly Silt


In [35]:
data_validation = pd.merge(data_valid, data_valid_labels,  how='left', left_on=['Well Name','Depth'], right_on = ['WellName','Depth.ft'])

In [145]:
data_validation = data_validation[data_validation.Facies <= 9]

In [146]:
data_validation.head()

Unnamed: 0,Formation,Well Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS,Facies
0,A1 SH,STUART,2808.0,66.276,0.63,3.3,10.65,3.591,1,1.0,3
1,A1 SH,STUART,2808.5,77.252,0.585,6.5,11.95,3.341,1,0.978,3
2,A1 SH,STUART,2809.0,82.899,0.566,9.4,13.6,3.064,1,0.956,3
3,A1 SH,STUART,2809.5,80.671,0.593,9.5,13.25,2.977,1,0.933,3
4,A1 SH,STUART,2810.0,75.971,0.638,8.7,12.35,3.02,1,0.911,3


In [147]:
data_validation.drop(['WellName','Depth.ft','LithLabel'],axis=1, inplace=True)

ValueError: labels ['WellName' 'Depth.ft' 'LithLabel'] not contained in axis

In [148]:
data_validation.rename(columns={"LithCode": "Facies"}, inplace=True)
data_validation.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [178]:
data_validation.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800 entries, 0 to 829
Data columns (total 11 columns):
Formation    800 non-null object
Well Name    800 non-null object
Depth        800 non-null float64
GR           800 non-null float64
ILD_log10    800 non-null float64
DeltaPHI     800 non-null float64
PHIND        800 non-null float64
PE           800 non-null float64
NM_M         800 non-null int64
RELPOS       800 non-null float64
Facies       800 non-null int64
dtypes: float64(7), int64(2), object(2)
memory usage: 75.0+ KB


In [179]:
data_validation['Facies'] = data_validation['Facies'].astype("int")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [180]:
data_validation.head()

Unnamed: 0,Formation,Well Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS,Facies
0,A1 SH,STUART,2808.0,66.276,0.63,3.3,10.65,3.591,1,1.0,3
1,A1 SH,STUART,2808.5,77.252,0.585,6.5,11.95,3.341,1,0.978,3
2,A1 SH,STUART,2809.0,82.899,0.566,9.4,13.6,3.064,1,0.956,3
3,A1 SH,STUART,2809.5,80.671,0.593,9.5,13.25,2.977,1,0.933,3
4,A1 SH,STUART,2810.0,75.971,0.638,8.7,12.35,3.02,1,0.911,3


In [181]:
data.head()

Unnamed: 0,Facies,Formation,Well Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS
0,3,A1 SH,SHRIMPLIN,2793.0,77.45,0.664,9.9,11.915,4.6,1,1.0
1,3,A1 SH,SHRIMPLIN,2793.5,78.26,0.661,14.2,12.565,4.1,1,0.979
2,3,A1 SH,SHRIMPLIN,2794.0,79.05,0.658,14.8,13.05,3.6,1,0.957
3,3,A1 SH,SHRIMPLIN,2794.5,86.1,0.655,13.9,13.115,3.5,1,0.936
4,3,A1 SH,SHRIMPLIN,2795.0,74.58,0.647,13.5,13.3,3.4,1,0.915


In [182]:
data_valid.shape

(830, 10)

In [217]:
#Load Data
# data = pd.read_csv('../facies_vectors.csv')
# data_valid_labels = pd.read_csv('../blind_stuart_crawford_core_facies.csv')
# data_valid = pd.read_csv('../validation_data_nofacies.csv')
# Parameters
feature_names = ['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
facies_names = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS', 'WS', 'D', 'PS', 'BS']
facies_colors = ['#F4D03F', '#F5B041','#DC7633','#6E2C00', '#1B4F72','#2E86C1', '#AED6F1', '#A569BD', '#196F3D']

# data.dropna(inplace=True)
# Store features and labels
X = data[feature_names].values 
y = data['Facies'].values 
y_test = data_validation['Facies'].values

# Store well labels and depths
well = data['Well Name'].values
depth = data['Depth'].values
# X = np.array(pd.DataFrame(X).dropna())
# Fill 'PE' missing values with mean
imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(X)
X = imp.transform(X)

# NEEDS TO BE CHANGED, SCALING SHOULD NOT BE DETERMINED FROM TESTING AND TRAINING SET, ONLY TRAINING
# scaler = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X)
# X = scaler.transform(X)

scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)
# X_test_nv_LSTM = scaler.transform(X)


# Repeat process for held out data
# data.dropna(inplace=True)
# Store features and labels
X_ho = data_validation[feature_names].values 
y_ho = data_validation['Facies'].values 

# Store well labels and depths
well_ho = data_validation['Well Name'].values
depth_ho = data_validation['Depth'].values
# X = np.array(pd.DataFrame(X).dropna())
# Fill 'PE' missing values with mean
imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(X)
X_ho = imp.transform(X_ho)

# NEEDS TO BE CHANGED, SCALING SHOULD NOT BE DETERMINED FROM TESTING AND TRAINING SET, ONLY TRAINING
# scaler = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X)
# X = scaler.transform(X)

# scaler = preprocessing.StandardScaler().fit(X)
X_ho = scaler.transform(X_ho)
# X_test_nv_LSTM = scaler.transform(X)

In [218]:
X_train, padded_rows = augment_features(X, well, depth)
y_train = y

X_test, padded_rows = augment_features(X_ho, well_ho, depth_ho)
# y_test = y
# X_test_LSTM = np.take(X_train, ind_ho_set, axis=0) 
# y_test_LSTM = np.take(y_train, ind_ho_set, axis=0) 
# X_train_LSTM = np.delete(X_train, ind_ho_set, axis=0)
# y_train_LSTM = np.delete(y_train, ind_ho_set, axis=0)

In [219]:
X_ho.shape

(800, 7)

In [220]:
well_ho.shape

(800,)

In [221]:
depth_ho.shape

(800,)

In [222]:
X_train.shape

(4149, 13, 7)

In [223]:
X_test.shape

(800, 13, 7)

In [224]:
X_test = X_test[0:800]
y_test = y_test[0:800]
X_train = X_train[0:4120]
y_train = y_train[0:4120]

In [228]:
y_train = y_train - 1
y_test = y_test - 1
y_train_ct = to_categorical(np.array(y_train), num_classes=None)
y_test_ct = to_categorical(np.array(y_test), num_classes=None)

In [229]:
'''Trains an LSTM model on the IMDB sentiment classification task.
The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF + LogReg.
# Notes
- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.
- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''




data_dim = 7 # Features
timesteps = 13 # 25 is best so far observed
num_classes = 9
batch_size = 20 # 20 is best so far observed

# Expected input batch shape: (batch_size, timesteps, data_dim)
# Note that we have to provide the full batch_input_shape since the network is stateful.
# the sample of index i in batch k is the follow-up for the sample i in batch k-1.
model = Sequential()


model.add(LSTM(40, stateful=True, return_sequences=True,
               batch_input_shape=(batch_size, timesteps, data_dim)))

# model.add(LSTM(40, return_sequences=True, stateful=True))

# model.add(LSTM(40, stateful=True))
model.add(Dense(9, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='Nadam',
              metrics=['accuracy'])

model.fit(X_train, y_train_ct,
          batch_size=batch_size, epochs=30, shuffle=False,
          validation_data=(X_test, y_test_ct))

ValueError: Error when checking target: expected dense_15 to have 3 dimensions, but got array with shape (4120, 8)

In [227]:
np.array(data_validation.Facies).shape

(800,)

In [82]:
y_test.shape

(800,)

In [115]:
y_test.min()

0

In [129]:
data_validation[data_validation.Facies == 11]

Unnamed: 0,Formation,Well Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS,Facies
546,A1 LM,CRAWFORD,3008.5,58.369,0.504,0.715,14.235,2.83,2,0.311,11
547,A1 LM,CRAWFORD,3009.0,56.125,0.435,-0.775,13.415,2.786,2,0.302,11
548,A1 LM,CRAWFORD,3009.5,56.769,0.378,-1.155,12.805,2.726,2,0.292,11
549,A1 LM,CRAWFORD,3010.0,62.587,0.298,-0.18,12.87,2.588,2,0.283,11
550,A1 LM,CRAWFORD,3010.5,64.674,0.252,0.24,14.42,2.465,2,0.274,11
584,B1 LM,CRAWFORD,3036.5,62.331,0.834,-0.245,9.835,2.813,2,0.818,11
585,B1 LM,CRAWFORD,3037.0,60.946,0.884,-1.69,16.08,2.745,2,0.795,11
587,B1 LM,CRAWFORD,3038.0,45.287,0.922,-0.86,18.44,3.962,2,0.75,11
588,B1 LM,CRAWFORD,3038.5,30.049,0.925,0.445,13.565,4.571,2,0.727,11


In [142]:
data_valid_labels[data_valid_labels.LithCode == 1]

Unnamed: 0,WellName,Depth.ft,LithCode,LithLabel
22,STUART,2819.0,1,NM Sand
23,STUART,2819.5,1,NM Sand
24,STUART,2820.0,1,NM Sand
25,STUART,2820.5,1,NM Sand
26,STUART,2821.0,1,NM Sand
27,STUART,2821.5,1,NM Sand
28,STUART,2822.0,1,NM Sand
29,STUART,2822.5,1,NM Sand
700,CRAWFORD,3099.5,1,NM Sand
701,CRAWFORD,3100.0,1,NM Sand
