Library imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

import utilities as utils



Loads the datacube

In [None]:
# Load the datacube through the project helper and preview its first rows.
data = utils.load_dataset()
data.head()

Unnamed: 0,"ï»¿""H3_Address""",H3_Resolution,H3_Geometry,Longitude_EPSG4326,Latitude_EPSG4326,Continent_Majority,Continent_Minority,Country_Majority,Country_Minority,Province_Majority,...,Litmod_Density_Asthenosphere,Litmod_Density_Crust,Litmod_Density_Lithosphere,Crust1_Type,Crust1_CrustalThickness,Crust1_SedimentThickness,Training_MVT_Deposit,Training_MVT_Occurrence,Training_CD_Deposit,Training_CD_Occurrence
0,8712e579bffffff,7,"POLYGON ((-115.0314 54.5077, -115.0393 54.4961...",-115.018142,54.497221,North America,North America,Canada,Canada,Alberta,...,3480.580078,2891.260254,3337.300049,island arc,-38.450497,2991.459961,Absent,Absent,Absent,Absent
1,8712e579affffff,7,"POLYGON ((-115.0658 54.51706, -115.0737 54.505...",-115.052542,54.50659,North America,North America,Canada,Canada,Alberta,...,3480.580078,2891.26001,3337.300293,island arc,-38.43,3000.000244,Absent,Absent,Absent,Absent
2,8712e56b4ffffff,7,"POLYGON ((-115.0604 54.49501, -115.0682 54.483...",-115.047107,54.484541,North America,North America,Canada,Canada,Alberta,...,3480.580078,2891.259766,3337.300049,island arc,-38.43,3000.0,Absent,Absent,Absent,Absent
3,8712e56b5ffffff,7,"POLYGON ((-115.026 54.48564, -115.0338 54.4740...",-115.012729,54.475169,North America,North America,Canada,Canada,Alberta,...,3480.580078,2891.26001,3337.300049,island arc,-38.591599,2932.666504,Absent,Absent,Absent,Absent
4,8712e56a6ffffff,7,"POLYGON ((-114.997 54.49832, -115.0049 54.4867...",-114.983753,54.48784,North America,North America,Canada,Canada,Alberta,...,3480.580078,2891.26001,3337.300049,island arc,-39.815273,2422.801758,Absent,Absent,Absent,Absent


In [None]:
# Convert the "Present"/"Absent" string flags to booleans.
# Any value other than "Present" (including missing values) becomes False,
# exactly as the previous per-column lambda did.

# modifies presence / absence columns to boolean - geology properties
geology_flag_cols = [
    "Geology_Dictionary_Alkalic",
    "Geology_Dictionary_Anatectic",
    "Geology_Dictionary_Calcareous",
    "Geology_Dictionary_Carbonaceous",
    "Geology_Dictionary_Cherty",
    "Geology_Dictionary_CoarseClastic",
    "Geology_Dictionary_Evaporitic",
    "Geology_Dictionary_Felsic",
    "Geology_Dictionary_FineClastic",
    "Geology_Dictionary_Gneissose",
    "Geology_Dictionary_Igneous",
    "Geology_Dictionary_Intermediate",
    "Geology_Dictionary_Pegmatitic",
    "Geology_Dictionary_RedBed",
    "Geology_Dictionary_Schistose",
    "Geology_Dictionary_Sedimentary",
    "Geology_Dictionary_UltramaficMafic",
]
# modifies presence / absence columns to boolean - labels
label_flag_cols = [
    "Training_MVT_Deposit",
    "Training_MVT_Occurrence",
    "Training_CD_Deposit",
    "Training_CD_Occurrence",
]

for flag_col in geology_flag_cols + label_flag_cols:
    # Skip columns that are already boolean: comparing True/False with
    # "Present" would turn every flag into False if this cell is re-run.
    if data[flag_col].dtype == bool:
        continue
    # Vectorised comparison instead of a Python-level lambda per row.
    data[flag_col] = data[flag_col] == "Present"

Selects the data/labels used for the MVT WOE baseline

In [None]:
# Look up the column dictionary for the MVT model ('preferred' baseline) and
# extract those columns from the full datacube.
# NOTE(review): the exact contents of `cols_dict` and `cols` are decided inside
# `utilities`; they are not visible from this notebook.
cols_dict = utils.load_features_dict(type='MVT', baseline='preferred')
data_filtered, cols = utils.extract_cols(data, cols_dict)

# Summary of the selected columns (dtypes and non-null counts).
data_filtered.info()

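The following function finds all the neighbors of the labelled polygons and creates a new column "MVT_Deposit".
The original paper treats the neighbors of polygons with "Training_MVT_Deposit=Present" or "Training_MVT_Occurrence=Present" as mineral present as well, i.e. "MVT_Deposit=Present" (note: "Deposit" now means a deposit, an occurrence, or a neighbor of either). The neighbor-inclusive label used as the target further below is the "MVT_Deposit_wNeighbors" column.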

In [None]:
# Build the neighbor-aware MVT labels with the project helper.
# NOTE(review): presumably this is the call that adds 'MVT_Deposit' and
# 'MVT_Deposit_wNeighbors' (both are read in the following cells) — confirm in
# `utilities.neighbor_deposits`.
data_filtered = utils.neighbor_deposits(data_filtered, type='MVT')

In [None]:
# Class balance of the label without and with neighbors included.
for label_col in ('MVT_Deposit', 'MVT_Deposit_wNeighbors'):
    print(data_filtered[label_col].value_counts())

In [None]:
# Keep the neighbor-inclusive label aside; it becomes the 'target' column later.
labels_filtered = data_filtered['MVT_Deposit_wNeighbors']
# Remove the geometry column and all label columns from the frame.
# Re-running this cell on its own raises a KeyError because the columns are already gone.
data_filtered = data_filtered.drop(columns=['H3_Geometry', 'Training_MVT_Deposit', 'Training_MVT_Occurrence', 'MVT_Deposit', 'MVT_Deposit_wNeighbors'])
# NOTE(review): presumably trims the first entry and the last two entries of
# `cols` to mirror the columns dropped above — confirm against `utils.extract_cols`.
cols = cols[1:-2]

Clearly the dataset has MANY outliers, as reported in the paper

In [None]:
# ax = sns.boxplot(data=data_filtered, orient="h", palette="Set2")

In [None]:
# One-hot encode the categorical geology columns; each dummy column is
# prefixed with the name of the column it came from.
categorical_cols = [
    'Geology_Lithology_Majority',
    'Geology_Lithology_Minority',
    'Geology_Period_Maximum_Majority',
    'Geology_Period_Minimum_Majority',
]
data_filtered = pd.get_dummies(
    data_filtered,
    columns=categorical_cols,
    prefix=categorical_cols,
)
data_filtered.info()

We can remove these outliers

In [None]:
# Apply the project's Tukey-fence outlier handling to the frame.
# NOTE(review): whether outliers are dropped, clipped or set to NaN is decided
# inside `utilities.tukey_remove_outliers` — not visible here.
data_filtered = utils.tukey_remove_outliers(data_filtered)
# ax = sns.boxplot(data=data_filtered, orient="h", palette="Set2")

There are also many NaNs in the data; these can be "imputed" with the mean value.

In [None]:
# Count the missing values in each column before imputation.
print(data_filtered.isna().sum())

In [None]:
# Fill the missing values with the project helper, then recount the NaNs per
# column to check the result.
# NOTE(review): the imputation strategy lives in `utilities.impute_nans` — confirm it is the mean.
data_filtered = utils.impute_nans(data_filtered)
print(data_filtered.isna().sum())

Finally, the data above is not yet "normalized", so we convert the features to standard scores (z-scores).

In [None]:
# Normalise the features with the project helper, then draw one horizontal
# box per column to inspect the resulting distributions.
data_filtered = utils.normalize_df(data_filtered)
ax = sns.boxplot(data=data_filtered, orient="h", palette="Set2")
print("(note remaining outliers above were within the Tukey fences calculated over ALL the data)")

Appending 'target' column to data_filtered

In [None]:
# Attach the label as the 'target' column. pandas aligns the Series on the
# index, so any row of data_filtered missing from labels_filtered would get NaN.
data_filtered["target"] = labels_filtered

# df_result is a second name for the SAME DataFrame object (no copy is made):
# columns added to df_result later also appear in data_filtered.
df_result = data_filtered

Index(['Sedimentary_Dictionary', 'Igneous_Dictionary',
       'Metamorphic_Dictionary', 'Seismic_LAB_Priestley', 'Seismic_Moho',
       'Gravity_GOCE_ShapeIndex', 'Geology_Paleolatitude_Period_Minimum',
       'Terrane_Proximity', 'Geology_PassiveMargin_Proximity',
       'Geology_BlackShale_Proximity', 'Geology_Fault_Proximity',
       'Gravity_Bouguer', 'Gravity_Bouguer_HGM',
       'Gravity_Bouguer_UpCont30km_HGM', 'Gravity_Bouguer_HGM_Worms_Proximity',
       'Gravity_Bouguer_UpCont30km_HGM_Worms_Proximity', 'Magnetic_HGM',
       'Magnetic_LongWavelength_HGM', 'Magnetic_HGM_Worms_Proximity',
       'Magnetic_LongWavelength_HGM_Worms_Proximity',
       'Geology_Lithology_Majority_Igneous_Extrusive',
       'Geology_Lithology_Majority_Igneous_Intrusive_Felsic',
       'Geology_Lithology_Majority_Igneous_Intrusive_Mafic',
       'Geology_Lithology_Majority_Metamorphic_Gneiss',
       'Geology_Lithology_Majority_Metamorphic_Gneiss_Paragneiss',
       'Geology_Lithology_Majority_Metamo

Adds the latitudes to the datacube to make train, validation, and test splits

In [None]:
# Copy the latitude back from the full datacube (aligned on the index); it was
# not part of the feature frame.
df_result["Latitude_EPSG4326"] = data["Latitude_EPSG4326"]
# The helper returns the TEST frame first, then the TRAIN frame, then the
# cross-validation splits (pairs of train/validation indices into tr_df, as used below).
# NOTE(review): presumably the split is made by latitude bands stored in a
# 'group' column — confirm in `utilities.get_spatial_cross_val_idx`.
te_df, tr_df, splits = utils.get_spatial_cross_val_idx(df_result)

In [None]:
# Show which spatial groups end up in the training and validation part of each fold.
for i, (train_index, val_index) in enumerate(splits):
    train_rows = tr_df.iloc[train_index.tolist()]
    val_rows = tr_df.iloc[val_index.tolist()]
    train_groups = np.unique(train_rows['group'].tolist())
    val_groups = np.unique(val_rows['group'].tolist())
    print(f"Fold {i}:")
    print(f"  Train: groups={train_groups}")
    print(f"  Val: groups={val_groups}")

In [None]:
# Fixed seed so that Restart & Run All reproduces the same model. Per the
# scikit-learn documentation, `random_state` controls the subsampling used for
# binning and the train/validation split used when early stopping is active.
RANDOM_STATE = 42

hist_gbm_classifier = HistGradientBoostingClassifier(
    learning_rate=0.3,
    max_iter=90,              # Number of boosting iterations (equivalent to n_estimators)
    max_depth=6,              # Maximum tree depth
    min_samples_leaf=48,      # Minimum samples required for a leaf node
    max_leaf_nodes=64,        # Maximum number of leaf nodes
    verbose=1,                # Print information about the fitting process
    random_state=RANDOM_STATE,
)

In [None]:
# Extra weight given to positive samples to counter the class imbalance.
gain = 650
# Train on every column except the label and the two split-bookkeeping columns.
# Sample weights: rows whose target casts to 1 get gain + 1 = 651, rows whose
# target casts to 0 get weight 1.
hist_gbm_classifier.fit(tr_df.drop(columns=['target','Latitude_EPSG4326','group']), tr_df['target'], sample_weight=gain*tr_df['target'].astype('int')+1)

In [None]:
# Evaluation: ROC AUC on the training rows, the held-out test rows and the
# full dataset.
#
# roc_auc_score ranks samples by a continuous score. The hard class labels
# returned by `predict()` reduce the ROC curve to a single threshold, so the
# predicted probability of the positive class is used instead.
NON_FEATURE_COLS = ['target', 'Latitude_EPSG4326', 'group']


def positive_class_scores(frame):
    """Return the model's predicted probability of the positive class for each row.

    Columns listed in NON_FEATURE_COLS are dropped when present
    (`errors='ignore'`), so the helper accepts frames with or without the
    'group' column. All remaining columns are passed to the model as features.
    """
    features = frame.drop(columns=NON_FEATURE_COLS, errors='ignore')
    # Column 1 of predict_proba belongs to classes_[1], the larger (positive) label.
    return hist_gbm_classifier.predict_proba(features)[:, 1]


for split_name, frame in (("Train", tr_df), ("Test", te_df), ("All", df_result)):
    auc_score = roc_auc_score(frame["target"], positive_class_scores(frame))
    print(f"{split_name} AUC score:{auc_score}")

# Hard class labels for the full dataset, kept under the original name in case
# later cells rely on `y_pred`.
y_pred = hist_gbm_classifier.predict(df_result.drop(columns=NON_FEATURE_COLS, errors='ignore'))