In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
import pickle
from tqdm import tqdm
from scipy.stats import mode

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

In [3]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [4]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, mean_squared_error, mean_absolute_error, f1_score

### Preparing data for training
[back to table of contents](#contents)

Select the features for the model, scale them, and split the data into train and test sets.

In [5]:
# Integer code assigned to each 'DEPOSITIONAL_ENVIRONMENT' label (stored in 'D_Env').
MAPPING = {
    'Continental': 1,
    'Transitional': 2,
    'Marine': 3,
}


def add_features(df, half_window=5):
    """Add per-well neighbourhood features, an environment code and mean ratios.

    For each of the columns 'GR', 'RT', 'CN' and 'DEN', and separately for
    every distinct value of 'WELL' (rows are taken in their existing order):

    * ``'<col>_-i'`` (i = 1..half_window): the value found ``i`` rows later in
      the same well; the last ``i`` rows of the well are padded with the
      well's last value.
    * ``'<col>_i'``: the value found ``i`` rows earlier in the same well; the
      first ``i`` rows are padded with the well's first value.
    * ``'<col>_mean'``: centred rolling mean over ``2 * half_window + 1`` rows
      (``min_periods=1``, so the window simply shrinks at the well edges).

    Then 'D_Env' (``MAPPING`` applied to 'DEPOSITIONAL_ENVIRONMENT') and the
    ratios 'GRm/RTm', 'RTm/CNm' and 'RTm/DENm' of the rolling means are added.
    The ratios are plain divisions, so a zero denominator yields inf/NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'WELL', 'GR', 'RT', 'CN', 'DEN' and
        'DEPOSITIONAL_ENVIRONMENT'. It is modified in place.
    half_window : int, default 5
        Number of neighbouring rows used on each side.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The same ``df`` object, and the sorted names of all of its columns
        except 'WELL', 'X', 'Y', 'DEPOSITIONAL_ENVIRONMENT', 'LITH_NAME' and
        'LITH_CODE'.

    Raises
    ------
    KeyError
        If 'DEPOSITIONAL_ENVIRONMENT' holds a value that is not a key of
        ``MAPPING`` (including a missing value).
    """
    window = 2 * half_window + 1

    for feature in ('GR', 'RT', 'CN', 'DEN'):
        # Group once per feature. Whole columns are assigned below instead of
        # writing through chained indexing (df.loc[:, col][mask] = ...), which
        # silently does nothing when pandas Copy-on-Write is active.
        per_well = df[feature].groupby(df['WELL'])

        for i in range(1, half_window + 1):
            # k=i binds the current offset into the lambda.
            df[f'{feature}_{-i}'] = per_well.transform(
                lambda s, k=i: s.shift(-k, fill_value=s.iloc[-1])
            ).astype(float)
            df[f'{feature}_{i}'] = per_well.transform(
                lambda s, k=i: s.shift(k, fill_value=s.iloc[0])
            ).astype(float)

        df[f'{feature}_mean'] = per_well.transform(
            lambda s: s.rolling(window, min_periods=1, center=True).mean()
        )

    env_codes = df['DEPOSITIONAL_ENVIRONMENT'].map(MAPPING)
    unmapped = env_codes.isna()
    if unmapped.any():
        unknown = sorted(set(df.loc[unmapped, 'DEPOSITIONAL_ENVIRONMENT'].astype(str)))
        raise KeyError(f'Unknown DEPOSITIONAL_ENVIRONMENT value(s): {unknown}')
    df['D_Env'] = env_codes

    df['GRm/RTm'] = df['GR_mean'] / df['RT_mean']
    df['RTm/CNm'] = df['RT_mean'] / df['CN_mean']
    df['RTm/DENm'] = df['RT_mean'] / df['DEN_mean']

    # Identifier, coordinate and label columns are never model inputs.
    excluded = {
        'WELL',
        'X',
        'Y',
        'DEPOSITIONAL_ENVIRONMENT',
        'LITH_NAME',
        'LITH_CODE',
    }
    return df, sorted(set(df.columns) - excluded)



In [6]:
# Read the training table and extend it with the engineered feature columns;
# features_names is the sorted list of columns used as model inputs.
df, features_names = add_features(pd.read_csv('Train-dataset.csv'))

# Standardise the feature columns. The fitted scaler is kept in `scaler`
# so the same transformation can be applied to other data later.
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(df[features_names])

# Target vector: the 'LITH_CODE' column as a NumPy array.
y = df['LITH_CODE'].values

In [7]:
# df = pd.read_csv('Train-dataset.csv')
# df, features_names = add_features(df)

# # best_features = sorted(tmp_mean.values.tolist(), key=lambda x: -x[1])[:15]
# # best_features = [f[0] for f in best_features]
# best_features = ['GR',
#  'MD',
#  'DEN',
#  'RT',
#  'DEN_1',
#  'GR_mean',
#  'GR_-1',
#  'RT_-1',
#  'DEN_mean',
#  'GR_5',
#  'D_Env',
#  'DEN_-2',
#  'RTm/DENm',
#  'GRm/RTm',
#  'DEN_-1'
# ]

# features_names = best_features

# X = df[best_features]
# scaler = preprocessing.StandardScaler().fit(X)
# X = scaler.transform(X)
# y = df['LITH_CODE'].values

In [8]:
# models = []

# for turn_i in range(10):
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=turn_i*10)

#     ros = RandomOverSampler()
#     _, _ = ros.fit_resample(X_train, y_train)
#     train_idx = ros.sample_indices_
#     _, _ = ros.fit_resample(X_test, y_test)
#     val_idx = ros.sample_indices_
    
#     x_train, x_val = X_train[train_idx], X_test[val_idx]
#     y_train, y_val = y_train[train_idx], y_test[val_idx]
    
# #     x_train, x_val = X_train, X_test
# #     y_train, y_val = y_train, y_test
    
    
#     model = CatBoostClassifier(iterations=50, verbose=False, random_state=turn_i*10)
#     model.fit(x_train, y_train)
#     val_pred = model.predict(x_val)
    
#     tr_mmap = f1_score(y_train, model.predict(x_train), average='weighted')
#     tr_wmap = f1_score(y_train, model.predict(x_train), average='macro')
#     tst_mmap = f1_score(y_val, model.predict(x_val), average='weighted')
#     tst_wmap = f1_score(y_val, model.predict(x_val), average='macro')

#     models.append(model)
# #     model.save_model(f"model_{turn_i}.cbm")
#     print(f'Turn: {turn_i} tr_mmap: {tr_mmap:.3f} tst_mmap: {tst_mmap:.3f} tr_wmap: {tr_wmap:.3f} tst_wmap: {tst_wmap:.3f}')

In [9]:
# imp_mean = np.stack([model.feature_importances_ for model in models]).mean(axis=0)
# imp_0 = models[0].feature_importances_

# tmp_mean = pd.DataFrame(data=[
#     [features_names[i], imp_mean[i]] for i in range(len(features_names))], columns=['name', 'importance'])

# sorted(tmp_mean.values.tolist(), key=lambda x: -x[1])[:15]

In [10]:
# predictions = []
# for model in models:
#     predictions.append(model.predict(x_val))
    
# y_pred = mode(np.stack(predictions), 0).mode.reshape(-1)
# f1_score(y_val, y_pred, average='weighted'), f1_score(y_val, y_pred, average='macro')

In [11]:
# Train an ensemble of 10 CatBoost classifiers on the full training set.
# Each member is fitted on its own random over-sample of (X, y).
models = []
for turn_i in tqdm(range(10)):
    # One seed per ensemble member, shared by the sampler and the classifier.
    # Without random_state the over-sampler draws from the global RNG, so the
    # ensemble would differ on every run even though the model itself is seeded.
    seed = turn_i * 10

    ros = RandomOverSampler(random_state=seed)
    # Only the chosen row indices are needed; the resampled arrays are discarded.
    _, _ = ros.fit_resample(X, y)
    train_idx = ros.sample_indices_

    x_train = X[train_idx]
    y_train = y[train_idx]

    model = CatBoostClassifier(iterations=100, verbose=False, random_state=seed)
    model.fit(x_train, y_train)

    models.append(model)

In [12]:
# Build the engineered features for the validation table. The feature list
# returned here is discarded: the columns are selected with features_names,
# the list the scaler was fitted with.
df_test, _ = add_features(pd.read_csv('Validation-dataset.csv'))
X_pred = scaler.transform(df_test[features_names])

# One prediction array per ensemble member, then the most frequent label
# across members for every row.
predictions = [model.predict(X_pred) for model in models]
pred = mode(np.stack(predictions), axis=0).mode.reshape(-1)

# example of how to save a prediction
np.savetxt('prediction2.csv', pred, encoding='utf-8', delimiter=',')