In [1]:
# return a dataframe
from train_test_val import main
# return train, train_labels, test, test_labels, val, val_labels
from train_test_val import train_test_val_split

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from classification_utilities import display_cm, display_adj_cm
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os

# Directory that holds the input data. The original author's local folder is
# kept as the default; set the FORCE_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
path = os.environ.get("FORCE_DATA_DIR", r"C:\Users\Dunbar\Dropbox\Work\FORCE")
# File name of the labelled training table.
infile = "train.csv"
# Full location of the training table, read with pd.read_csv below.
infile_path = os.path.join(path, infile)

In [13]:
# Maps each raw lithology code found in the label column to a consecutive
# class index 0..11; the codes are listed in the order of the index they get.
lithology_numbers = dict(zip(
    [30000, 65030, 65000, 80000, 74000, 70000,
     70032, 88000, 86000, 99000, 90000, 93000],
    range(12),
))

In [2]:
# Load the semicolon-delimited training table directly from disk.
# (Alternative loader kept for reference: df = main(infile_path, 51, 1.0))
df = pd.read_csv(filepath_or_buffer=infile_path, sep=";")

In [3]:
# Derived feature: element-wise product of the DTC and RHOB columns.
# NOTE(review): if "AI" stands for acoustic impedance, confirm that a product
# (rather than a ratio) of these two logs is what is intended.
df["AI"] = df["DTC"] * df["RHOB"]

In [4]:
def numericalise(df, string):
    """Replace the values of column ``string`` with integer codes.

    Each distinct value of ``df[string]`` is numbered 0, 1, 2, ... in order of
    first appearance, and that value-to-code table is passed to
    ``DataFrame.replace`` on the whole frame.

    The replacement is therefore not limited to ``string``: a cell in any
    other column that equals one of the distinct values (NaN included, when
    the column contains it) is rewritten to the same code.

    Args:
        df: pandas DataFrame containing column ``string``; it is not modified.
        string: name of the column whose distinct values define the codes.

    Returns:
        A new DataFrame with the replacements applied.
    """
    codes = {label: number for number, label in enumerate(df[string].unique())}
    return df.replace(codes)

In [5]:
# Encode FORMATION first and GROUP second; each call returns a new frame, so
# the inner result feeds the outer call and the final frame is rebound to df.
df = numericalise(numericalise(df, "FORMATION"), "GROUP")

In [6]:
# Columns selected from df for modelling. The last entry is the label column,
# which is separated from the features before fitting.
curves = [
    'DTC', 'FORMATION', 'GR', 'GROUP', 'NPHI', 'PEF',
    'RDEP', 'RHOB', 'RMED', 'RSHA', 'AI',
    'FORCE_2020_LITHOFACIES_LITHOLOGY',
]
# Smaller feature sets tried earlier, kept for reference:
# curves = ['RSHA', 'RMED', 'RDEP', 'RHOB', 'GR', 'NPHI', 'PEF', 'DTC', 'AI']
# curves = ['RSHA', 'RMED', 'RDEP', 'GR', 'NPHI', 'PEF', 'AI']
# curves = ['RDEP', 'GR', 'NPHI', 'PEF', 'AI']

In [14]:
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler

# Select the modelling columns and randomise the row order. The seed is fixed
# so the shuffled order -- and with it the later train/test split and the
# contiguous KFold folds -- is identical after a kernel restart.
X = df[curves]
X = shuffle(X, random_state=42)

# Separate the label column from the features and translate the raw lithology
# codes to class indices; codes missing from lithology_numbers become NaN.
y = X.pop('FORCE_2020_LITHOFACIES_LITHOLOGY')
y = y.map(lithology_numbers)

# Standardise every feature column. The fitted scaler is kept in `scaler`
# because the open-test cells reuse it via scaler.transform.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split -- confirm that this is acceptable for the reported test accuracy.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [15]:
# Hold out 10 % of the rows for evaluation; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [9]:
# Penalty matrix consumed by score(), indexed as A[true_class, predicted_class].
A = np.load("../penalty_matrix.npy")

def score(y_true, y_pred):
    """Return the mean penalty-matrix score of a set of predictions.

    Every sample contributes ``-A[true, predicted]``, where ``A`` is the
    module-level penalty matrix, and the contributions are averaged.

    Args:
        y_true: 1-D array-like (ndarray, list or pandas Series) of true class
            indices.
        y_pred: 1-D array-like of predicted class indices, same length.

    Returns:
        ``-sum(A[y_true[i], y_pred[i]]) / n`` for the ``n`` samples.

    Raises:
        ValueError: if the two inputs differ in shape or are empty.
    """
    # np.asarray drops any pandas index, so samples are always paired by
    # position (a Series with a shuffled index is not looked up by label),
    # and plain lists are accepted as well.
    true_idx = np.asarray(y_true).astype(int)
    pred_idx = np.asarray(y_pred).astype(int)
    if true_idx.shape != pred_idx.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got {} and {}".format(
                true_idx.shape, pred_idx.shape))
    if true_idx.size == 0:
        raise ValueError("score() needs at least one sample")
    # A single fancy-indexing lookup replaces the per-sample Python loop.
    # Subtracting from 0.0 (instead of negating) keeps a perfect result at
    # 0.0 rather than -0.0.
    return (0.0 - A[true_idx, pred_idx].sum()) / true_idx.shape[0]


def make_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` and print its accuracies, confusion matrix and penalty score.

    Args:
        model: unfitted classifier exposing ``fit``, ``predict`` and ``score``.
        X_train: training features.
        y_train: training class indices.
        X_test: held-out features.
        y_test: held-out class indices as a pandas Series (``.values`` is read).

    Returns:
        None. The model is fitted in place and all results are printed.
    """
    model.fit(X_train, y_train)
    print("Accuracy on training set: {:.3f}".format(model.score(X_train, y_train)))

    # Predict the held-out rows once and reuse the result for the accuracy,
    # the confusion matrix and the penalty score (model.score would run a
    # second, identical prediction pass).
    predictions = model.predict(X_test)
    test_accuracy = np.mean(predictions == np.asarray(y_test))
    print("Accuracy on test set: {:.3f}".format(test_accuracy))
    #print("Feature importances:\n{}".format(model.feature_importances_))

    # Pin the label set so the matrix always has one row/column per entry of
    # facies_labels, even when a rare class is absent from this split.
    conf = confusion_matrix(y_test, predictions,
                            labels=np.arange(len(facies_labels)))
    print(display_cm(conf, facies_labels, hide_zeros=True))
    print(score(y_test.values, predictions))

# Short lithology names used as row/column captions of the confusion matrix.
facies_labels = [
    "Sst", "S-Sh", "Sh", "Mrl", "Dol", "Lst",
    "Chk", "Hal", "Anh", "Tuf", "Coa", "Bmt",
]

### Cons
> Sensitive to noisy data. It can overfit noisy data.
> A small variation (or variance) in the data can result in a different decision tree. This can be reduced by bagging and boosting algorithms.
> Decision trees are biased by imbalanced datasets, so it is recommended to balance the dataset before creating the decision tree.

In [11]:
from sklearn.tree import DecisionTreeClassifier

In [16]:
# Baseline: a decision tree with default hyper-parameters and a fixed seed.
model = DecisionTreeClassifier(random_state=0)
make_model(
    model=model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Accuracy on training set: 1.000
Accuracy on test set: 0.895
     Pred   Sst  S-Sh    Sh   Mrl   Dol   Lst   Chk   Hal   Anh   Tuf   Coa   Bmt Total
     True
      Sst 14765  1194   840    26    10   210     2                10    24       17081
     S-Sh  1071 11583  1951    55     8   168                      10    21       14867
       Sh   865  2025 67941   315    65   724     1     2     4   147    61       72150
      Mrl    34    58   283  2772     2   188     7                                3344
      Dol    20     7    61     3    63    12                 4                     170
      Lst   165   186   679   218    10  4187    62                26     1        5534
      Chk     1     3     2     5          65   976                                1052
      Hal                 2                           770     1                     773
      Anh           1     1           1                 6    98                     107
      Tuf    23     4   149           2    26     

In [17]:
# Entropy-based tree capped at depth 30, trained on 100 % of the available
# data, of which 10 % is held out as the test split.
# min_impurity_split and presort were dropped from the call: both were left at
# their defaults here and no longer exist in current scikit-learn releases,
# where passing them raises a TypeError. random_state is fixed (same seed as
# the baseline tree) so repeated runs build the same tree.
model = DecisionTreeClassifier(
    criterion='entropy',  # alternative: "gini"
    splitter='best',
    max_depth=30,  # None grows the tree until all leaves are pure
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    random_state=0,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    class_weight=None,
)
make_model(model, X_train, y_train, X_test, y_test)

Accuracy on training set: 0.999
Accuracy on test set: 0.903
     Pred   Sst  S-Sh    Sh   Mrl   Dol   Lst   Chk   Hal   Anh   Tuf   Coa   Bmt Total
     True
      Sst 14949  1058   840    20     7   165     3                17    22       17081
     S-Sh   956 11768  1911    46     7   164                       5    10       14867
       Sh   803  1857 68340   286    68   608           4     1   127    56       72150
      Mrl    26    40   267  2803     5   198     5                                3344
      Dol    12    17    53     6    71    11                                       170
      Lst   161   173   655   205    10  4233    78                18     1        5534
      Chk                 3     3          63   983                                1052
      Hal     1           3                           769                           773
      Anh                 1           3                 6    97                     107
      Tuf    11     8   125     1          24     

In [None]:
# Load the unlabelled open test set and show which columns it provides.
open_test_features = pd.read_csv('../test.csv', sep=';')
print(open_test_features.columns)

# Rebuild the engineered and encoded columns the same way as for the training
# frame.
# NOTE(review): numericalise derives its codes from the frame it is given, so
# the FORMATION/GROUP codes here come from test.csv alone -- confirm they line
# up with the codes the model was trained on.
open_test_features["AI"] = open_test_features["DTC"] * open_test_features["RHOB"]
open_test_features = numericalise(
    numericalise(open_test_features, "FORMATION"), "GROUP")

# Same feature columns, in the same order, as the training matrix.
curves_test = [
    'DTC', 'FORMATION', 'GR', 'GROUP', 'NPHI', 'PEF',
    'RDEP', 'RHOB', 'RMED', 'RSHA', 'AI',
]
open_test_features = open_test_features[curves_test]

# Apply the scaler fitted on the training features, then predict with the
# model that is currently bound to `model`.
open_test_features = scaler.transform(open_test_features)
test_prediction = model.predict(open_test_features)

# Translate class indices back to the original lithology codes.
category_to_lithology = dict(
    zip(lithology_numbers.values(), lithology_numbers.keys()))
test_prediction_for_submission = np.vectorize(
    category_to_lithology.get)(test_prediction)

# One integer code per line under a single 'lithology' header.
np.savetxt(
    'test_predictions_rf_entropy.csv',
    test_prediction_for_submission,
    header='lithology',
    comments='',
    fmt='%i',
)

In [18]:
from sklearn.ensemble import BaggingClassifier

# Bagged ensemble of 1000 default decision trees, each fitted on a bootstrap
# sample of 20000 training rows, with n_jobs=-1 for parallel fitting.
# random_state pins the bootstrap draws (and the seeds handed to the trees) so
# the printed accuracies can be reproduced after a kernel restart.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=1000,
    max_samples=20000, bootstrap=True, n_jobs=-1, random_state=0)

make_model(bag_clf, X_train, y_train, X_test, y_test)

Accuracy on training set: 0.879
Accuracy on test set: 0.876
     Pred   Sst  S-Sh    Sh   Mrl   Dol   Lst   Chk   Hal   Anh   Tuf   Coa   Bmt Total
     True
      Sst 14414  1023  1536    17          46     2           3    35     5       17081
     S-Sh  1449  9393  3886    69          43     1                23     3       14867
       Sh   639  1131 69930   172     1    90     7     4     1   164    11       72150
      Mrl    74    53   742  2161         292    22                                3344
      Dol    35    12    96     1    14    10                 2                     170
      Lst   337   220  1103   215        3562    57           4    36              5534
      Chk     9           4     4         148   887                                1052
      Hal     1           3                           767     2                     773
      Anh     3                                         6    98                     107
      Tuf    64     2   317                 2     

### Cross-validation

In [19]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [20]:
# 5-fold cross-validation of the depth-30 entropy tree on 100 % of the data.
# KFold is used without shuffling, so the folds are contiguous slices of X.
# The tree is seeded so the five scores are the same on every run.
kfold = KFold(n_splits=5)
scores = cross_val_score(DecisionTreeClassifier(criterion='entropy',
                                                splitter='best',
                                                max_depth=30,
                                                random_state=0),
                         X, y, cv=kfold)
print(f"Cross-validation scores: {scores}")

Cross-validation scores: [0.89839515 0.8995566  0.89877489 0.89745923 0.89886033]


In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# AdaBoost over depth-30 decision trees: up to 1000 boosting rounds with a
# learning rate of 0.1. random_state is fixed so the fitted ensemble can be
# reproduced after a kernel restart.
# NOTE(review): algorithm="SAMME.R" has been deprecated in newer scikit-learn
# releases -- confirm it is still accepted by the installed version.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=30), n_estimators=1000,
    algorithm="SAMME.R", learning_rate=0.1, random_state=0)

ada_clf.fit(X_train, y_train)

In [None]:
# Report the fitted AdaBoost model the same way make_model does, without
# fitting it again.
print("Accuracy on training set: {:.3f}".format(ada_clf.score(X_train, y_train)))

# Predict the held-out rows once and reuse the result for the accuracy, the
# confusion matrix and the penalty score.
predictions = ada_clf.predict(X_test)
print("Accuracy on test set: {:.3f}".format(np.mean(predictions == np.asarray(y_test))))

# Pin the label set so the matrix always has one row/column per entry of
# facies_labels, even when a rare class is absent from this split.
conf = confusion_matrix(y_test, predictions,
                        labels=np.arange(len(facies_labels)))
print(display_cm(conf, facies_labels, hide_zeros=True))

print(score(y_test.values, predictions))

In [None]:
# Load the unlabelled open test set.
open_test_features = pd.read_csv('../test.csv', sep=';')

# Rebuild the engineered and encoded columns the same way as for the training
# frame.
# NOTE(review): numericalise derives its codes from the frame it is given, so
# the FORMATION/GROUP codes here come from test.csv alone -- confirm they line
# up with the codes the model was trained on.
open_test_features["AI"] = open_test_features["DTC"] * open_test_features["RHOB"]
open_test_features = numericalise(
    numericalise(open_test_features, "FORMATION"), "GROUP")

# Same feature columns, in the same order, as the training matrix.
curves_test = [
    'DTC', 'FORMATION', 'GR', 'GROUP', 'NPHI', 'PEF',
    'RDEP', 'RHOB', 'RMED', 'RSHA', 'AI',
]
open_test_features = open_test_features[curves_test]

# Apply the scaler fitted on the training features, then predict with the
# AdaBoost ensemble.
open_test_features = scaler.transform(open_test_features)
test_prediction = ada_clf.predict(open_test_features)

# Translate class indices back to the original lithology codes.
category_to_lithology = dict(
    zip(lithology_numbers.values(), lithology_numbers.keys()))
test_prediction_for_submission = np.vectorize(
    category_to_lithology.get)(test_prediction)

# One integer code per line under a single 'lithology' header.
np.savetxt(
    'test_predictions_ada-rf.csv',
    test_prediction_for_submission,
    header='lithology',
    comments='',
    fmt='%i',
)