In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from pycaret.classification import *

In [2]:
# Raw, uncleaned reads of the five per-deposit CSV files. The file names are
# reproduced verbatim (spelling and embedded space included) because they are
# literal paths. The frames are read in the same order as they are unpacked.
(
    hydrothermal_deposits_df,
    porphyry_deposits_df,
    sedex_deposits_df,
    vms_deposits_df,
    epithermal_deposits_df,
) = [
    pd.read_csv(csv_name)
    for csv_name in (
        "Hydrothermal_deposits.csv",
        "Phorphy_deposits.csv",
        "Sedex _Deposits.csv",
        "VMS_Dataset.csv",
        "epithermal_Deposists.csv",
    )
]

In [3]:
# Tag each raw frame with the deposit class of the file it was read from, so
# the origin of a row is still known if the frames are ever stacked together.
#
# The SEDEX label is written in upper case to match the label given to
# `sedex_df` in the SEDEX cleaning cell. With two spellings ('Sedex' and
# 'SEDEX') the same class would appear as two separate categories as soon as
# frames carrying both labels were combined.
hydrothermal_deposits_df['Deposit_Type'] = 'Hydrothermal'
porphyry_deposits_df['Deposit_Type'] = 'Porphyry'
sedex_deposits_df['Deposit_Type'] = 'SEDEX'
vms_deposits_df['Deposit_Type'] = 'VMS'
epithermal_deposits_df['Deposit_Type'] = 'Epithermal'

In [4]:
# --- 1. Porphyry ---
# Read the porphyry export, discard rows in which every cell is missing
# (the file ends with blank rows), then append the class label column.
porphyry_df = (
    pd.read_csv('Phorphy_deposits.csv')
    .dropna(how='all')
    .assign(Deposit_Type='Porphyry')
)
print("Porphyry data loaded and cleaned.")


Porphyry data loaded and cleaned.


In [5]:
# --- 2. SEDEX ---
# The header cells of this file carry stray leading/trailing spaces, so each
# column name is stripped before the class label column is appended.
sedex_df = (
    pd.read_csv('Sedex _Deposits.csv')
    .rename(columns=str.strip)
    .assign(Deposit_Type='SEDEX')
)
print("SEDEX data loaded and cleaned.")


SEDEX data loaded and cleaned.


In [6]:
# --- 3. Load and Clean VMS Data ---
# This file has junk rows at the beginning and empty columns/rows at the end.
# `header=1` makes the second line of the file the column header; the line
# above it is discarded by the parser.
#
# The cleaning is written as a single chain in which every step returns a new
# frame, so nothing is mutated in place through an `.iloc` slice (the pattern
# pandas reports with SettingWithCopyWarning). Positional slicing also avoids
# an IndexError from `index[0]` if the file contains no data rows.
vms_df = (
    pd.read_csv('VMS_Dataset.csv', header=1)
    # The first parsed row is a junk row of numbers ('0.005', '0.010', ...)
    # and the last two columns are completely empty: drop both by position.
    .iloc[1:, :-2]
    # Remove rows that contain no values at all (blank rows at the end).
    .dropna(how='all')
    # Class label for every remaining row.
    .assign(Deposit_Type='VMS')
)
print("VMS data loaded and cleaned.")



VMS data loaded and cleaned.


In [7]:
# --- 4. Epithermal ---
# The first column of this file has a messy header; whatever that header is,
# it is replaced with 'Fe' for consistency, and the class label is appended.
epithermal_raw = pd.read_csv('epithermal_Deposists.csv')
epithermal_df = epithermal_raw.rename(
    columns={epithermal_raw.columns[0]: 'Fe'}
).assign(Deposit_Type='Epithermal')
print("Epithermal data loaded and cleaned.")


Epithermal data loaded and cleaned.


In [8]:
# Hydrothermal table, cleaned in one pass. Steps, in order:
#   1. read with the first line as header;
#   2. discard the first two parsed rows and renumber the index from 0;
#   3. strip whitespace from the column names;
#   4. turn the literal marker 'b.d' into NaN
#      (presumably "below detection" - confirm with the data source);
#   5. remove the 'Analysis spots' column;
#   6. keep only rows that have at least 5 non-missing values;
#   7. append the class label column.
hydrothermal_df = (
    pd.read_csv('Hydrothermal_deposits.csv', header=0)
    .iloc[2:]
    .reset_index(drop=True)
    .rename(columns=str.strip)
    .replace('b.d', np.nan)
    .drop(columns=['Analysis spots'])
    .dropna(thresh=5)
    .assign(Deposit_Type='Hydrothermal')
)
print("Hydrothermal data loaded and cleaned.")

Hydrothermal data loaded and cleaned.


In [None]:
# --- 6. Combine all DataFrames ---
# Stack the five cleaned tables row-wise with a fresh 0..n-1 index. The tables
# do not share the same set of columns, so a column absent from a given table
# is filled with NaN for that table's rows; this is handled in a later phase.
cleaned_frames = [porphyry_df, sedex_df, vms_df, epithermal_df, hydrothermal_df]

# A 'Total' column is discarded when present; its absence is not an error.
master_df = pd.concat(cleaned_frames, ignore_index=True).drop(
    columns=['Total'], errors='ignore'
)


In [10]:
# Coerce every feature column to a numeric dtype. Values that cannot be parsed
# as numbers become NaN. The label column is skipped and stays as text.
feature_names = [name for name in master_df.columns if name != 'Deposit_Type']
for name in feature_names:
    master_df[name] = pd.to_numeric(master_df[name], errors='coerce')

print("\nAll data combined into 'master_df'.")

# --- 7. Final Inspection ---
# Dtypes and non-null counts per column.
print("\nMaster DataFrame Info (shows data types and non-null counts):")
master_df.info()

# Overall size of the combined table.
combined_shape = master_df.shape
print("\nShape of the final combined DataFrame:", combined_shape)

# Row count for each class label.
class_counts = master_df['Deposit_Type'].value_counts()
print("\nNumber of samples per deposit type:")
print(class_counts)

# Write the combined table (without the index) so later phases can reload it.
master_df.to_csv('master_mineral_data.csv', index=False)
print("\nCleaned master DataFrame saved to 'master_mineral_data.csv'")


All data combined into 'master_df'.

Master DataFrame Info (shows data types and non-null counts):


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 615 entries, 0 to 614
Data columns (total 52 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   P             130 non-null    float64
 1   Ti            375 non-null    float64
 2   Cr            459 non-null    float64
 3   Co            614 non-null    float64
 4   Ni            615 non-null    float64
 5   Cu            613 non-null    float64
 6   Zn            615 non-null    float64
 7   As            610 non-null    float64
 8   Se            594 non-null    float64
 9   Rb            202 non-null    float64
 10  Sr            221 non-null    float64
 11  Mo            356 non-null    float64
 12  Ag            605 non-null    float64
 13  Sb            568 non-null    float64
 14  La            74 non-null     float64
 15  Ce            78 non-null     float64
 16  Pr            73 non-null     float64
 17  Nd            83 non-null     float64
 18  Sm            88 non-null     

In [11]:
# Re-read the combined table written at the end of the cleaning phase.
# This replaces the in-memory `master_df` with the version stored on disk.
master_df = pd.read_csv('master_mineral_data.csv')

loaded_shape = master_df.shape
print("Data loaded successfully.")
print("Shape of the dataset:", loaded_shape)

# Preview of the first five rows (rendered as the cell output).
master_df.head(n=5)

Data loaded successfully.
Shape of the dataset: (615, 52)


Unnamed: 0,P,Ti,Cr,Co,Ni,Cu,Zn,As,Se,Rb,...,Ba,Tl,Na,Mg,Al,Si,K,Ca,Ga,In
0,90.517,6.2105,1.1737,4116.7326,81.4529,0.2898,1.4136,257.9315,,0.5063,...,,,,,,,,,,
1,109.9606,4.9979,0.8826,2384.6766,224.7375,3.0433,2.4632,15.0667,19.4082,3.4249,...,,,,,,,,,,
2,118.3088,6.5557,0.2061,1275.486,52.7874,0.1746,1.6272,24.1259,25.6262,0.4048,...,,,,,,,,,,
3,113.1241,5.1639,1.6516,1245.6729,194.2875,0.2385,1.4484,8.9044,28.5042,0.5627,...,,,,,,,,,,
4,77.8263,5.1099,4.4611,1814.9775,175.4654,0.8483,2.0768,3.9935,11.5739,0.0165,...,,,,,,,,,,


In [12]:
# Hold back 15% of the rows as a final test set; the remaining 85% is what
# PyCaret gets for training and validation. Stratifying on the label keeps the
# share of each deposit type the same in both parts, and the fixed
# random_state makes the split repeatable.
deposit_labels = master_df['Deposit_Type']
data_for_pycaret, holdout_test_set = train_test_split(
    master_df,
    stratify=deposit_labels,
    random_state=42,
    test_size=0.15,
)

print("Data for PyCaret (Training):", data_for_pycaret.shape)
print("Hold-out Test Set:", holdout_test_set.shape)

Data for PyCaret (Training): (522, 52)
Hold-out Test Set: (93, 52)


In [13]:
# Configure the PyCaret classification experiment.
#   data               - the 85% portion kept after the hold-out split
#   target             - column holding the deposit class
#   numeric_imputation - missing numeric values are filled with the median
#   session_id         - seed, so the run is reproducible
# NOTE(review): the setup summary reports that every row has at least one
# missing value, so imputation affects all rows - confirm this is intended.
setup_options = dict(
    data=data_for_pycaret,
    target='Deposit_Type',
    numeric_imputation='median',
    session_id=123,
)
clf_setup = setup(**setup_options)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Deposit_Type
2,Target type,Multiclass
3,Target mapping,"Epithermal: 0, Hydrothermal: 1, Porphyry: 2, SEDEX: 3, VMS: 4"
4,Original data shape,"(522, 52)"
5,Transformed data shape,"(522, 52)"
6,Transformed train set shape,"(365, 52)"
7,Transformed test set shape,"(157, 52)"
8,Numeric features,51
9,Rows with missing values,100.0%


In [14]:
# Train and evaluate PyCaret's candidate classifiers (called with no
# arguments, i.e. library defaults) and keep the top-ranked one in
# `best_model`. The comparison table is rendered as the cell output.
# NOTE(review): several models report a perfect score in that table. Each
# deposit type comes from a different file with a different set of measured
# columns, so the pattern of missing/imputed values may by itself identify the
# class - confirm the scores reflect chemistry rather than file provenance.
# NOTE(review): `best_model` is not referenced again in the visible notebook;
# the following cells rebuild a Random Forest explicitly.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.131
et,Extra Trees Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.113
lightgbm,Light Gradient Boosting Machine,1.0,1.0,1.0,1.0,1.0,1.0,1.0,248.663
catboost,CatBoost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,32.041
dt,Decision Tree Classifier,0.9972,0.9983,0.9972,0.9976,0.9972,0.9965,0.9966,0.025
lr,Logistic Regression,0.9945,0.0,0.9945,0.9952,0.9945,0.9931,0.9933,1.235
xgboost,Extreme Gradient Boosting,0.9917,0.9999,0.9917,0.9923,0.9916,0.9895,0.9897,0.079
gbc,Gradient Boosting Classifier,0.9892,0.0,0.9892,0.9915,0.9884,0.9863,0.9872,0.635
nb,Naive Bayes,0.9781,0.9971,0.9781,0.981,0.9779,0.9725,0.9733,0.026
knn,K Neighbors Classifier,0.9586,0.9927,0.9586,0.9654,0.9585,0.9482,0.9502,0.026


In [15]:
# Create a Random Forest model instance through PyCaret ('rf' is the model id
# shown in the comparison table); the per-fold scores are rendered as output.
rf_model = create_model('rf')

# Automatically tune the hyperparameters of the Random Forest model.
# NOTE: per the log of this run ("Original model was better than the tuned
# model, hence it will be returned"), `tuned_rf_model` ends up holding the
# original, untuned estimator.
tuned_rf_model = tune_model(rf_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [16]:
# Finalize the model (retrains it on the full training dataset).
# NOTE(review): presumably "full" means every row passed to setup(), including
# PyCaret's internal test split - confirm against the PyCaret documentation.
final_model = finalize_model(tuned_rf_model)

# Use the final model to make predictions on the hold-out test data, i.e. the
# 15% of rows split off before setup() and never handed to PyCaret.
predictions = predict_model(final_model, data=holdout_test_set)

# The 'predictions' DataFrame will show the original test data with new columns
# for the predicted label and confidence score (`prediction_label`,
# `prediction_score`). The accuracy score printed here is the most important
# one, because it is the only evaluation on rows outside the PyCaret workflow.
predictions.head()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,P,Ti,Cr,Co,Ni,Cu,Zn,As,Se,Rb,...,Mg,Al,Si,K,Ca,Ga,In,Deposit_Type,prediction_label,prediction_score
119,61.3186,5.4394,0.8626,1821.818604,89.3479,41.268799,63.079899,1.1782,18.194901,0.0641,...,,,,,,,,Porphyry,Porphyry,0.99
261,,7.11,3.18,19.610001,2.87,51.360001,8.02,8.03,222.910004,,...,,,,,,,,VMS,VMS,1.0
489,,,,0.01,0.18,0.1,0.16,0.26,0.47,,...,,,,,,,,Epithermal,Epithermal,1.0
266,,6.41,5.42,1.54,0.83,227.899994,2.82,14.44,310.119995,,...,,,,,,,,VMS,VMS,0.99
400,,,,0.01,0.21,0.11,0.18,0.28,0.33,,...,,,,,,,,Epithermal,Epithermal,1.0


In [17]:
# Save the final model pipeline to a file. Only the base name is passed to
# save_model; the message below reports the resulting file with a '.pkl'
# extension.
save_model(final_model, 'mineral_deposit_classifier')

# Confirmation message for the reader; this does not itself check that the
# file exists.
print("Model saved as 'mineral_deposit_classifier.pkl'")

Transformation Pipeline and Model Successfully Saved
Model saved as 'mineral_deposit_classifier.pkl'
