In [1]:
# return a dataframe
from train_test_val import main
# return train, train_labels, test, test_labels, val, val_labels
from train_test_val import train_test_val_split

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from classification_utilities import display_cm, display_adj_cm
from sklearn.model_selection import train_test_split
import numpy as np
import os

# Folder holding the input CSV. The location can be overridden through the
# optional FORCE_DATA_DIR environment variable so the notebook is not tied to
# one machine; when it is unset the original local folder is used.
path = os.environ.get("FORCE_DATA_DIR", r"C:\Users\Dunbar\Dropbox\Work\FORCE")
infile = "train.csv"
infile_path = os.path.join(path, infile)

# Load the data through the project helper (imported above as returning a
# dataframe).
# NOTE(review): the meaning of the 51 and 0.33 arguments is defined inside
# train_test_val.main and is not visible here — confirm and name them.
df = main(infile_path, 51, 0.33)

Filling missing values from curves:
['RSHA', 'RMED', 'RDEP', 'RHOB', 'GR', 'NPHI', 'PEF', 'DTC']
----------------------------------------
Group by: ['WELL', 'FORMATION']
Total number of missing values: 1118313
Group by: ['WELL', 'GROUP']
Total number of missing values: 981495
Group by: ['WELL']
Total number of missing values: 415425
Group by: ['FORMATION']
Total number of missing values: 6057
Group by: ['GROUP']
Total number of missing values: 0


In [2]:
# Derived feature: element-wise product of the DTC and RHOB columns.
# NOTE(review): if "AI" is meant to be acoustic impedance and DTC is a
# slowness log, impedance would normally be RHOB / DTC rather than the
# product — confirm the intended formula.
df["AI"] = df["DTC"] * df["RHOB"]

In [3]:
def numericalise(df, string):
    """Return a copy of ``df`` with column ``string`` encoded as integers.

    Every distinct value found in ``df[string]`` is given an integer code in
    order of first appearance (0, 1, 2, ...). Only the named column is
    rewritten; all other columns are carried over untouched, even when they
    happen to contain the same values. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to encode.
    string : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        New frame with ``string`` replaced by its integer codes.
    """
    categories = df[string].unique()
    codes = dict(zip(categories, range(len(categories))))
    encoded = df.copy()
    # Map only the requested column: a frame-wide replace() would also
    # rewrite matching values in every other column.
    encoded[string] = df[string].map(codes)
    return encoded

# Encode the two categorical columns as integer codes, one after the other.
for _column in ("FORMATION", "GROUP"):
    df = numericalise(df, _column)

In [4]:
# Columns pulled from the dataframe for modelling, in the order they are fed
# to the model.
curves = [
    # input features
    'DTC',
    'FORMATION',
    'GR',
    'GROUP',
    'NPHI',
    'PEF',
    'RDEP',
    'RHOB',
    'RMED',
    'RSHA',
    'AI',
    # target column (separated from the features further down)
    'FORCE_2020_LITHOFACIES_LITHOLOGY',
]
# Alternative feature subsets kept for reference (currently disabled):
# curves = ['RSHA', 'RMED', 'RDEP', 'RHOB', 'GR', 'NPHI', 'PEF', 'DTC', 'AI']
# curves = ['RSHA', 'RMED', 'RDEP', 'GR', 'NPHI', 'PEF', 'AI']
# curves = ['RDEP', 'GR', 'NPHI', 'PEF', 'AI']

In [5]:
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler

# Take the modelling columns and shuffle the rows. The shuffle is seeded so
# that a fresh "Restart & Run All" reproduces the same row order (and hence
# the same seeded train/test split afterwards).
X = shuffle(df[curves], random_state=42)

# Split the target column off; X keeps only the feature columns.
y = X.pop('FORCE_2020_LITHOFACIES_LITHOLOGY')

# Standardise every feature column. fit_transform returns an array, so X is
# no longer a DataFrame after this line.
# NOTE(review): the scaler is fitted on all rows, i.e. before any train/test
# split, so held-out rows contribute to the scaling statistics.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [6]:
# Hold out 20% of the rows as a test set, using a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Penalty matrix looked up by score(); read from a .npy file one directory up.
penalty_matrix_file = '../penalty_matrix.npy'
A = np.load(penalty_matrix_file)

def score(y_true, y_pred, penalty=None):
    """Return the negated mean penalty of a set of predictions.

    For each sample the entry ``penalty[y_true[i], y_pred[i]]`` is looked up;
    the result is minus the average of those entries.

    Parameters
    ----------
    y_true, y_pred : array-like
        One-dimensional sequences of equal length. Values are cast to ``int``
        and used directly as row / column indices into the penalty matrix.
    penalty : array-like, optional
        Two-dimensional penalty matrix. Defaults to the notebook-level
        matrix ``A`` when omitted.

    Returns
    -------
    float
        ``-sum(penalty[y_true[i], y_pred[i]]) / len(y_true)``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    ZeroDivisionError
        If the inputs are empty.
    """
    if penalty is None:
        penalty = A  # notebook-level penalty matrix, resolved at call time
    penalty = np.asarray(penalty)

    # np.asarray lets lists and pandas Series be indexed positionally.
    true_idx = np.asarray(y_true).astype(int)
    pred_idx = np.asarray(y_pred).astype(int)
    if true_idx.shape != pred_idx.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got {} and {}".format(
                true_idx.shape, pred_idx.shape
            )
        )

    n_samples = true_idx.shape[0]
    # One vectorised lookup instead of a Python-level loop over the samples.
    total = float(penalty[true_idx, pred_idx].sum())
    return -total / n_samples


def make_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` and print a short evaluation report.

    The report contains, in order: accuracy on the training set, accuracy on
    the test set, the model's feature importances (only when the estimator
    exposes ``feature_importances_``), the confusion matrix rendered through
    ``display_cm`` with the notebook-level ``facies_labels``, and the penalty
    score returned by ``score``.

    Parameters
    ----------
    model : estimator
        Object providing ``fit``, ``score`` and ``predict``. It is fitted in
        place.
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Held-out features and labels.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    model.fit(X_train, y_train)
    print("Accuracy on training set: {:.3f}".format(model.score(X_train, y_train)))
    print("Accuracy on test set: {:.3f}".format(model.score(X_test, y_test)))

    # Not every estimator has feature importances; skip that line of the
    # report instead of failing with AttributeError.
    importances = getattr(model, "feature_importances_", None)
    if importances is not None:
        print("Feature importances:\n{}".format(importances))

    predictions = model.predict(X_test)
    conf = confusion_matrix(y_test, predictions)
    # NOTE(review): assumes the classes present in y_test/predictions line up
    # one-to-one, in order, with facies_labels — confirm.
    print(display_cm(conf, facies_labels, hide_zeros=True))
    # np.asarray accepts both pandas Series and plain arrays for y_test.
    print(score(np.asarray(y_test), predictions))

# Short class names passed to display_cm for the confusion-matrix report.
facies_labels = [
    'Sst', 'S-Sh', 'Sh', 'Mrl',
    'Dol', 'Lst', 'Chk', 'Hal',
    'Anh', 'Tuf', 'Coa', 'Bmt',
]

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
# Random forest with the library's default hyperparameters and a fixed seed.
model = RandomForestClassifier(random_state=0)
make_model(
    model=model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)



Accuracy on training set: 0.994
Accuracy on test set: 0.915
Feature importances:
[0.08336037 0.07398318 0.15550593 0.06494635 0.11120712 0.07293979
 0.0873788  0.08410301 0.08094426 0.07567974 0.10995145]
     Pred   Sst  S-Sh    Sh   Mrl   Dol   Lst   Chk   Hal   Anh   Tuf   Coa   Bmt Total
     True
      Sst  8964   451   581     3          41                       3    17       10060
     S-Sh   588  7457   762     6     2    56                       2     4        8877
       Sh   656   698 38955    84     9   177     1     3     2    50    16       40651
      Mrl     7    11   171  1846          98     7                                2140
      Dol     1     4    48     2    18     5                 1     1                80
      Lst    87   122   633   110        2796    26                20              3794
      Chk     1                 6          35   653                                 695
      Hal                 6                           567     3                  

In [11]:
# Random forest with 100 trees, entropy splitting and depth capped at 30.
# Only the settings that matter are spelled out. The remaining arguments of
# the previous call were the library defaults, and two of them
# (min_impurity_split, max_features='auto') are rejected by newer
# scikit-learn releases, so they are no longer passed.
# random_state is fixed (matching the seeded default-forest cell) so the
# reported numbers can be reproduced.
model = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_depth=30,
    random_state=0,
)
make_model(model, X_train, y_train, X_test, y_test)

Accuracy on training set: 0.999
Accuracy on test set: 0.923
Feature importances:
[0.08024232 0.0921554  0.16246855 0.09220315 0.09372069 0.0682236
 0.07164119 0.08248918 0.07479347 0.07579769 0.10626476]
     Pred   Sst  S-Sh    Sh   Mrl   Dol   Lst   Chk   Hal   Anh   Tuf   Coa   Bmt Total
     True
      Sst  8982   436   584     2          37                       5    14       10060
     S-Sh   473  7546   787     6     2    53                       2     8        8877
       Sh   508   541 39272    81     7   168     1     2     1    49    21       40651
      Mrl     5     8   138  1878         105     6                                2140
      Dol           4    46     2    24     4                                        80
      Lst    69   119   591    87        2872    34                22              3794
      Chk                 1     4          38   652                                 695
      Hal                 6                           566     4                   

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
# Random forest with 200 trees, entropy splitting and depth capped at 30.
# Only the settings that matter are spelled out. The remaining arguments of
# the previous call were the library defaults, and two of them
# (min_impurity_split, max_features='auto') are rejected by newer
# scikit-learn releases, so they are no longer passed.
# random_state is fixed so the reported numbers can be reproduced.
model = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    max_depth=30,
    random_state=0,
)
make_model(model, X_train, y_train, X_test, y_test)