In [1]:
from pycaret.classification import *

import pandas as pd
import numpy as np
import re

### Load the data

In [2]:
#df = pd.read_csv("./intermediate_data/.csv", encoding = 'latin')
#df = pd.read_excel("./intermediate_data/.xlsx, engine='openpyxl')
df = pd.read_csv("./intermediate_data/proof_of_concept.csv", encoding = 'latin')


In [3]:
df.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 50 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id_inv  41 non-null     object 
 1   site    41 non-null     object 
 2   xrd1    41 non-null     object 
 3   xrd2    9 non-null      object 
 4   xrd3    2 non-null      object 
 5   Mg      41 non-null     float64
 6   Al      41 non-null     float64
 7   Si      41 non-null     float64
 8   P       41 non-null     float64
 9   S       41 non-null     float64
 10  Cl      41 non-null     float64
 11  K       41 non-null     float64
 12  Ca      41 non-null     float64
 13  Sc      41 non-null     float64
 14  Ti      41 non-null     float64
 15  V       41 non-null     float64
 16  Cr      41 non-null     float64
 17  Mn      41 non-null     float64
 18  Fe      41 non-null     float64
 19  Co      41 non-null     float64
 20  Ni      41 non-null     float64
 21  Cu      41 non-null     float64
 22  Zn  

### This function load the pre-trained models, perform a prediction on the data and format the output dataframe

In [4]:
MODEL_NAMES = {
    'model1': 'strunz_classifier',
    'model2': 'classifier'
}

def prediction_function(df, model_key):
    """
    This function performs the following operations:
    * Loads the specified pre-trained model
    * Performs the probability prediction on the dataframe (df).
    * Filters the columns containing the word 'Score' for output formatting purposes
    * Selects the three highest probabilities, sorts them in descending order and presents them in a new column.
    * Finally it returns the dataframe with two new columns of results 'Label' and 'top_3 Probs'.
    """
    
    model = load_model(MODEL_NAMES[model_key])
    pred_prob = predict_model(estimator=model, data=df, raw_score=True)
    
    selected = [] 
    patron = re.compile('.*Score.*', re.IGNORECASE)
    
    for columna in pred_prob.columns:
        if patron.match(columna):
            selected.append(columna)
    
    def select_top_three_cols(row):
        top_three_cols = row.nlargest(3)
        top_three_cols_dict = dict(zip(top_three_cols.index, top_three_cols))
        return top_three_cols_dict
    
    top_three_cols = pred_prob[selected].apply(select_top_three_cols, axis=1)
    pred_prob['top_3_probs'] = top_three_cols
    
    pred_prob = pred_prob.drop(pred_prob[selected].columns, axis=1)
     
    return pred_prob

### Using the prediction function with the two models

In [5]:
# Model 1
preds = prediction_function(df, 'model1')
preds.rename(columns={'prediction_label': 'Predicted Major Class', 'top_3_probs': 'top_3_Major Class'}, inplace=True)
preds = preds.drop(['top_3_Major Class'], axis=1)


Transformation Pipeline and Model Successfully Loaded


In [11]:
# Model 2
preds2 = prediction_function(preds, 'model2')
preds2.rename(columns={'prediction_label': 'Predicted Mineral Species', 'top_3_probs': 'top_3_Major Class'}
preds2


Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,id_inv,site,xrd1,xrd2,xrd3,Mg,Al,Si,P,S,...,Ta,W,Au,Hg,Tl,Pb,Th,Predicted_strunz,prediction_label,top_3_probs
0,MP-799,Can Figueres,Pectolite,Lopezite,,0.534567,39.314144,11.827674,26.687469,1.101149,...,0.000535,0.00023,0.000535,0.000535,0.000535,1e-05,0.000535,Carbonates(Nitrates),Planerite,"{'prediction_score_Planerite': 0.9533, 'predic..."
1,MP-807,Can Figueres,Petalite,,,0.520289,50.391808,6.222907,26.851988,0.662452,...,0.00052,0.000326,0.00052,0.00052,0.00052,1e-05,0.00052,"Phosphates,Arsenates,Vanadates",Aheylite,"{'prediction_score_Aheylite': 0.962, 'predicti..."
2,MP-790,Can Sadurni,Quartz,,,0.650851,4.771027,91.863289,0.930238,0.185348,...,0.000651,0.000651,0.000651,0.000651,0.000651,1e-05,0.000651,Silicates,Illite,"{'prediction_score_Illite': 0.6142, 'predictio..."
3,MP-792,Can Sadurni,Hematite,,,0.731535,19.859779,25.031332,0.822605,0.30165,...,0.000732,0.000732,0.000732,0.000732,0.000732,1e-05,0.000732,Oxides,Berlinite,"{'prediction_score_Berlinite': 0.9227, 'predic..."
4,MP-793,Can Sadurni,Fe oxide,,,0.720164,15.499458,43.52985,0.392935,0.058853,...,0.00072,0.00072,0.00072,0.00072,0.00072,8.2e-05,0.00072,Silicates,Berlinite,"{'prediction_score_Berlinite': 0.9653, 'predic..."
5,MP-796,Can Sadurni,Fe oxide,,,0.612001,9.388919,12.987021,1.192824,0.587567,...,0.000612,0.000612,0.000612,0.000612,0.000612,1e-05,0.000612,Carbonates(Nitrates),Calcite,"{'prediction_score_Calcite': 0.9658, 'predicti..."
6,MP-798,Can Sadurni,Alunite,Variscite,Quartz,0.537475,27.42293,36.912857,26.012712,3.379084,...,0.007307,0.000537,0.000537,0.000537,0.000537,1e-05,0.000537,"Phosphates,Arsenates,Vanadates",Variscite,"{'prediction_score_Variscite': 0.9697, 'predic..."
7,MP-885,Can Sadurni,Trolleite,Quartz,,0.600805,16.558767,29.152895,1.051515,14.015098,...,0.000601,0.000601,0.000601,0.000601,0.000601,1e-05,0.000601,Sulfates,Quartz,"{'prediction_score_Quartz': 0.9637, 'predictio..."
8,MP-1000,Cova Cassimanya,Antigorite,,,0.598263,51.716003,17.943424,0.584472,0.93509,...,0.000598,0.000598,0.000598,0.000598,0.000598,0.000125,0.000598,Silicates,Clinochlore,"{'prediction_score_Clinochlore': 0.7816, 'pred..."
9,MP-958,Cova de MontnÃ¡s,Braunite,Cinnabar,,0.614064,4.316564,2.808493,0.48413,0.739137,...,0.000614,0.000614,0.000614,0.000614,0.000614,1e-05,0.000614,Carbonates(Nitrates),Aragonite,"{'prediction_score_Aragonite': 0.9361, 'predic..."


#### The dataframe with predictions is stored in intermediate_data folder

In [8]:

preds2.to_csv("./intermediate_data/proof_of_concept_predictions.csv") 