Library imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import roc_auc_score

import utilities as utils

Loads the datacube

In [None]:
# Load the datacube through the project helper and preview its first rows.
data = utils.load_dataset()
data.head()

In [None]:
# Converts the presence / absence string columns to booleans.
# A single vectorised comparison per column replaces the copy-pasted
# element-wise lambdas: a cell becomes True only when it holds exactly the
# string "Present"; every other value (including NaN) becomes False.

# geology properties
geology_flag_cols = [
    "Geology_Dictionary_Alkalic",
    "Geology_Dictionary_Anatectic",
    "Geology_Dictionary_Calcareous",
    "Geology_Dictionary_Carbonaceous",
    "Geology_Dictionary_Cherty",
    "Geology_Dictionary_CoarseClastic",
    "Geology_Dictionary_Evaporitic",
    "Geology_Dictionary_Felsic",
    "Geology_Dictionary_FineClastic",
    "Geology_Dictionary_Gneissose",
    "Geology_Dictionary_Igneous",
    "Geology_Dictionary_Intermediate",
    "Geology_Dictionary_Pegmatitic",
    "Geology_Dictionary_RedBed",
    "Geology_Dictionary_Schistose",
    "Geology_Dictionary_Sedimentary",
    "Geology_Dictionary_UltramaficMafic",
]
# labels
label_flag_cols = [
    "Training_MVT_Deposit",
    "Training_MVT_Occurrence",
    "Training_CD_Deposit",
    "Training_CD_Occurrence",
]

for flag_col in geology_flag_cols + label_flag_cols:
    data[flag_col] = data[flag_col] == "Present"

Selects the data / labels used for the MVT WOE baseline

In [None]:
# Look up the feature dictionary for the MVT deposit type ("updated" baseline)
# and use it to extract the matching columns from the datacube.
cols_dict = utils.load_features_dict(type='MVT', baseline='updated')
data_filtered, cols = utils.extract_cols(data, cols_dict)

# Print a summary of the selected frame.
data_filtered.info()

The following function finds all the neighbors and creates the new label columns "MVT_Deposit" and "MVT_Deposit_wNeighbors".
The original paper treats polygons with "Training_MVT_Deposit=Present" or "Training_MVT_Occurrence=Present", as well as their neighbors, as mineral present (note: "Deposit" now means Deposit, Occurrence, or one of their neighbors). The column "MVT_Deposit_wNeighbors" is the one used as the target below.

In [None]:
# Project helper for the MVT labels; presumably it adds the "MVT_Deposit" and
# "MVT_Deposit_wNeighbors" columns used in the following cells — TODO confirm
# against utilities.neighbor_deposits.
data_filtered = utils.neighbor_deposits(data_filtered, type='MVT')

In [None]:
# Print the class counts of both label columns.
for label_col in ('MVT_Deposit', 'MVT_Deposit_wNeighbors'):
    print(data_filtered[label_col].value_counts())

In [None]:
# Columns that are geometry or labels rather than predictor features.
non_feature_cols = [
    'H3_Geometry',
    'Training_MVT_Deposit',
    'Training_MVT_Occurrence',
    'MVT_Deposit',
    'MVT_Deposit_wNeighbors',
]

# Keep the target aside before the label columns are removed from the frame.
labels_filtered = data_filtered['MVT_Deposit_wNeighbors']
data_filtered = data_filtered.drop(columns=non_feature_cols)

# Positional trim: drops the first and the last two entries of the column list.
# NOTE(review): presumably these are 'H3_Geometry' and the two Training_MVT_*
# columns — confirm against the order returned by utils.extract_cols.
cols = cols[1:-2]

Clearly the dataset has MANY outliers, as reported in the paper

In [None]:
# Horizontal box plot of the feature frame, to inspect spread and outliers.
ax = sns.boxplot(data=data_filtered, orient="h", palette="Set2")

We can remove these outliers

In [None]:
# Apply the project's Tukey outlier helper, then redraw the box plot to compare.
# NOTE(review): whether outliers are dropped, clipped or set to NaN is decided
# inside utils.tukey_remove_outliers — confirm there.
data_filtered = utils.tukey_remove_outliers(data_filtered)
ax = sns.boxplot(data=data_filtered, orient="h", palette="Set2")

There are also many NaNs in the data; these can be "imputed" with the mean value.

In [None]:
# Number of missing values in each column.
print(data_filtered.isna().sum())

In [None]:
# Fill the missing values with the project helper, then re-count the NaNs per
# column to check the result.
data_filtered = utils.impute_nans(data_filtered)
print(data_filtered.isna().sum())

Finally, it can be observed that the above data is not "normalized"; we should convert the features to standard scores / z-scores

In [None]:
# Normalise the features with the project helper and redraw the box plot on the
# rescaled data.
data_filtered = utils.normalize_df(data_filtered)
ax = sns.boxplot(data=data_filtered, orient="h", palette="Set2")
print("(note remaining outliers above were within the Tukey fences calculated over ALL the data)")

Discretizes the continuous variables into 5 bins

In [None]:
# Number of quantile bins used for every continuous feature.
nbins = 5

# Replace each float64 feature by its quantile bin; columns of any other dtype
# are left untouched.
for col in cols:
    if data_filtered[col].dtype == "float64":
        data_filtered[col] = pd.qcut(data_filtered[col], q=nbins)

# Attach the labels set aside earlier as the WoE target column.
data_filtered["target"] = labels_filtered

Forms the train / test splits

In [None]:
# Copy back two columns from the full datacube before splitting:
# "Training_MVT_Deposit" is passed below as the split column; "Latitude_EPSG4326"
# is presumably read by the spatial split helper — TODO confirm in utilities.
for helper_col in ("Latitude_EPSG4326", "Training_MVT_Deposit"):
    data_filtered[helper_col] = data[helper_col]

# Note the return order: the test frame comes first, then the train frame.
te_df, tr_df, _ = utils.get_spatial_cross_val_idx(
    data_filtered,
    test_set=1,
    split_col="Training_MVT_Deposit",
    nbins=36,
)

# The split column has done its job; remove it from both partitions.
tr_df, te_df = (
    part.drop(columns=["Training_MVT_Deposit"]) for part in (tr_df, te_df)
)

# test_set = 1 gives the split counts closest to those in the paper
print(f"Train counts: {tr_df['target'].value_counts()}")
print(f"Test counts: {te_df['target'].value_counts()}")

Computes WOE / IV for each variable

In [None]:
# Compute the WoE table and the Information Value (IV) of every feature against
# 'target', using the training split only.
#   lst   - one WoE table per feature, in the same order as `cols`
#   IV_df - one row per feature with its IV
lst = []
iv_rows = []
for col in cols:
    df, iv = utils.calculate_woe_iv(tr_df, col, 'target')
    lst.append(df)
    iv_rows.append({"Variable": col, "IV": iv})

# Build the summary frame once instead of concatenating onto an empty frame in
# every iteration: that pattern is quadratic, leaves object-dtype columns and
# triggers pandas' deprecation warning for concatenating empty frames.
IV_df = pd.DataFrame(iv_rows, columns=['Variable', 'IV'])

Merge the WOE data into the existing input datacube

In [None]:
# Shows the overall IV for all variables on MVT deposits / occurrences, sorted
# from highest to lowest IV.
print(f"Information Value Overview:\n{IV_df.sort_values('IV', ascending=False)}\n\n\n")

In [None]:
# Gets the WoE for all variables on MVT deposits / occurrences: for every
# feature, its WoE table (computed on the training split) is joined onto the
# train, test and combined frames, adding "<feature>_WoE" and "<feature>_IV".
tr_result = tr_df
te_result = te_df
all_result = pd.concat([tr_result, te_result])
for col, col_data in zip(cols, lst):
    col_data = col_data.rename(columns={"Value": col, "WoE": f"{col}_WoE", "IV": f"{col}_IV"})
    woe_lookup = col_data[[col, f"{col}_WoE", f"{col}_IV"]]
    # how="left" keeps every row. The default inner join silently discards any
    # row whose bin / category has no entry in the training-derived WoE table,
    # which would shrink the evaluation sets without notice. Unmatched rows get
    # NaN here, which a NaN-skipping row sum treats as a neutral contribution.
    tr_result = pd.merge(tr_result, woe_lookup, on=col, how="left")
    te_result = pd.merge(te_result, woe_lookup, on=col, how="left")
    all_result = pd.merge(all_result, woe_lookup, on=col, how="left")

Combines the WOE for each column

In [None]:
# Per-row score: the sum of the WoE contributions of all features.
woe_cols = [f"{col}_WoE" for col in cols]
for scored in (tr_result, te_result, all_result):
    scored["WOE Total"] = scored[woe_cols].sum(axis=1)

In [None]:
# ROC AUC of the summed-WoE score against the target, reported for the train
# split, the test split and both together.
train_truth, train_score = tr_result["target"], tr_result["WOE Total"]
auc_score = roc_auc_score(y_true=train_truth, y_score=train_score)
print(f"Train AUC score:{auc_score}")

test_truth, test_score = te_result["target"], te_result["WOE Total"]
auc_score = roc_auc_score(y_true=test_truth, y_score=test_score)
print(f"Test AUC score:{auc_score}")

all_truth, all_score = all_result["target"], all_result["WOE Total"]
auc_score = roc_auc_score(y_true=all_truth, y_score=all_score)
print(f"All AUC score:{auc_score}")

Note - the results above differ (higher or lower) depending on the test split chosen. Lawley'22 reported using fold 5 of 6 as the test split; above we're using fold 2 of 6 because it had the closest number of train / test examples to those reported in the paper. Our IV values for all variables are largely similar to Lawley'22 Figure 11a, though again not identical due to differences in the particular examples chosen for the test set.

Long-term plan should be to report averaged results across several splits instead - should give more consistent result that is recreatable.