Library imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split

import utilities as utils

Loads the datacube

In [None]:
# Load the datacube through the project helper; the result is used as a
# pandas-style table below (column indexing, .apply) — presumably a DataFrame.
data = utils.load_dataset()
# Preview the first rows as the cell output.
data.head()

In [None]:
# Converts the presence / absence columns to boolean: a value becomes True only
# when it is exactly the string "Present"; anything else (including NaN) becomes
# False. One vectorized comparison per column replaces the per-row `.apply`
# lambdas that were copy-pasted for every column.

# geology properties
geology_flag_cols = [
    "Geology_Dictionary_Alkalic",
    "Geology_Dictionary_Anatectic",
    "Geology_Dictionary_Calcareous",
    "Geology_Dictionary_Carbonaceous",
    "Geology_Dictionary_Cherty",
    "Geology_Dictionary_CoarseClastic",
    "Geology_Dictionary_Evaporitic",
    "Geology_Dictionary_Felsic",
    "Geology_Dictionary_FineClastic",
    "Geology_Dictionary_Gneissose",
    "Geology_Dictionary_Igneous",
    "Geology_Dictionary_Intermediate",
    "Geology_Dictionary_Pegmatitic",
    "Geology_Dictionary_RedBed",
    "Geology_Dictionary_Schistose",
    "Geology_Dictionary_Sedimentary",
    "Geology_Dictionary_UltramaficMafic",
]

# labels
label_flag_cols = [
    "Training_MVT_Deposit",
    "Training_MVT_Occurrence",
    "Training_CD_Deposit",
    "Training_CD_Occurrence",
]

for flag_col in geology_flag_cols + label_flag_cols:
    # Element-wise equality yields a boolean Series aligned with `data`.
    # NOTE(review): assumes these columns hold plain strings (object dtype) — confirm in utils.load_dataset.
    data[flag_col] = data[flag_col] == "Present"

Selects the data / labels used for the CD WOE baseline

In [None]:
# Look up the column specification for the preferred CD baseline and pull
# those columns out of the datacube.
cols_dict = utils.load_features_dict(type='CD', baseline='preferred')
data_filtered, cols = utils.extract_cols(data, cols_dict)

# Categorical variables are expanded into one-hot encoded vectors for ML
# compatibility; each new column is prefixed with its source column name.
categorical_cols = [
    'Geology_Lithology_Majority',
    'Geology_Lithology_Minority',
    'Geology_Period_Maximum_Majority',
    'Geology_Period_Minimum_Majority',
]
data_filtered = pd.get_dummies(
    data_filtered,
    columns=categorical_cols,
    prefix=categorical_cols,
)

# Summarize the resulting columns, dtypes and non-null counts.
data_filtered.info()

The following function finds all the neighbors and creates a new column "CD_Deposit" (the cells below also use "CD_Deposit_wNeighbors", which is taken as the training label).
The original paper treats polygons with "Training_CD_Deposit=Present" or "Training_CD_Occurrence=Present", together with their neighbors, as mineral present, i.e. "CD_Deposit=Present" (note: "Deposit" now means a deposit, an occurrence, or a neighbor of either).

In [None]:
# Run the project's neighbor-labelling helper for the CD deposit type.
# The cells below read 'CD_Deposit' and 'CD_Deposit_wNeighbors' from the result;
# presumably both are added here — confirm in utilities.neighbor_deposits.
data_filtered = utils.neighbor_deposits(data_filtered, type='CD')

In [None]:
# Show the class balance of the label column without and with neighbors.
for label_col in ('CD_Deposit', 'CD_Deposit_wNeighbors'):
    print(data_filtered[label_col].value_counts())

In [None]:
# Keep the neighbor-inclusive label aside before it is removed from the features.
labels_filtered = data_filtered['CD_Deposit_wNeighbors']

# Geometry and every label-related column must not reach the model as inputs.
non_feature_columns = [
    'H3_Geometry',
    'Training_CD_Deposit',
    'Training_CD_Occurrence',
    'CD_Deposit',
    'CD_Deposit_wNeighbors',
]
data_filtered = data_filtered.drop(columns=non_feature_columns)

# Drops the first entry and the last two entries of the column list.
# NOTE(review): presumably those are the geometry/label names — confirm against utils.extract_cols.
cols = cols[1:-2]

Clearly the dataset has MANY outliers, as reported in the paper

In [None]:
# Horizontal box plot with one box per column, to inspect spread and outliers.
ax = sns.boxplot(data=data_filtered, orient="h", palette="Set2")

We can remove these outliers

In [None]:
# Apply the project's Tukey outlier filter to the feature table.
# NOTE(review): whether outliers are dropped as rows or replaced (e.g. by NaN)
# is decided inside utils.tukey_remove_outliers — confirm there.
data_filtered = utils.tukey_remove_outliers(data_filtered)
# Re-draw the box plot to compare against the unfiltered data.
ax = sns.boxplot(data=data_filtered, orient="h", palette="Set2")

There are also many NaNs in the data; these can be "imputed" with the mean value.

In [None]:
# Count the missing values in each column.
print(data_filtered.isna().sum())

In [None]:
# Fill the missing values through the project helper
# (mean imputation according to the notebook text — confirm in utils.impute_nans).
data_filtered = utils.impute_nans(data_filtered)
# Re-count missing values per column to check the result of the imputation.
print(data_filtered.isna().sum())

Finally, it can be observed that the above data is not "normalized"; we should convert the features to standard scores (z-scores).

In [None]:
# Rescale the features through the project helper
# (z-scores according to the notebook text — confirm in utils.normalize_df).
data_filtered = utils.normalize_df(data_filtered)
# Re-draw the box plot to inspect the rescaled distributions.
ax = sns.boxplot(data=data_filtered, orient="h", palette="Set2")
# Reminder printed next to the plot about the points that still appear as outliers.
print("(note remaining outliers above were within the Tukey fences calculated over ALL the data)")

Forms the train / test splits

In [None]:
# Attach the label and the two columns handed to the spatial split helper
# (latitude and the raw deposit flag are taken from the original datacube).
data_filtered["target"] = labels_filtered
data_filtered["Latitude_EPSG4326"] = data["Latitude_EPSG4326"]
data_filtered["Training_CD_Deposit"] = data["Training_CD_Deposit"]

# The helper's first two return values are used as the test and train frames, in that order.
# NOTE(review): the comment further down mentions test_set = 1 while test_set=0 is used here — confirm which is intended.
te_df, tr_df, _ = utils.get_spatial_cross_val_idx(data_filtered, test_set=0)

# The raw deposit flag is not a model input, so strip it from both splits.
tr_df, te_df = [
    split.drop(columns=["Training_CD_Deposit"]) for split in (tr_df, te_df)
]

# test_set = 1 closest split counts in paper
print(f"Train counts: {tr_df['target'].value_counts()}")
print(f"Test counts: {te_df['target'].value_counts()}")

In [None]:
# Histogram-based gradient boosting classifier used as the model for the CD baseline.
# No random_state is passed, so any randomized step inside the estimator is unseeded.
hist_gbm_classifier = HistGradientBoostingClassifier(
    learning_rate=0.3,
    max_iter=70,              # Number of boosting iterations (equivalent to n_estimators)
    max_depth=6,              # Maximum tree depth
    min_samples_leaf=48,      # Minimum samples required for a leaf node
    max_leaf_nodes=64,        # Maximum number of leaf nodes
    verbose=1                 # Non-zero: print information about the fitting process
)

In [None]:
# Extra weight given to positive samples: a positive row weighs gain + 1,
# a negative row weighs 1.
gain = 500
train_weights = gain * tr_df['target'].astype('int') + 1

# Label and split bookkeeping columns are excluded from the model inputs.
train_features = tr_df.drop(columns=['target', 'Latitude_EPSG4326', 'group'])

hist_gbm_classifier.fit(train_features, tr_df['target'], sample_weight=train_weights)

In [None]:
# Reports ROC AUC on the train split, the test split and their union.
#
# ROC AUC must be computed from continuous scores: feeding it the hard labels
# from `predict` collapses the ROC curve to a single operating point. The
# positive-class probability from `predict_proba` is used instead.

# Label and split bookkeeping columns that are not model inputs.
non_feature_cols = ['target', 'Latitude_EPSG4326', 'group']

all_df = pd.concat([tr_df, te_df])

for split_name, split_df in (("Train", tr_df), ("Test", te_df), ("All", all_df)):
    split_features = split_df.drop(columns=non_feature_cols)
    # `classes_` is sorted, so for a boolean / 0-1 target column 1 is the positive class.
    # NOTE(review): assumes `target` is boolean-like (it is cast with astype('int') when fitting) — confirm.
    y_score = hist_gbm_classifier.predict_proba(split_features)[:, 1]
    auc_score = roc_auc_score(split_df["target"], y_score)
    print(f"{split_name} AUC score:{auc_score}")

# Kept so that anything reading `y_pred` afterwards still sees the hard class
# predictions for `all_df`; `split_features` holds the `all_df` inputs here.
y_pred = hist_gbm_classifier.predict(split_features)