In [1]:
from pycaret.classification import *

import pandas as pd
import numpy as np
import re

### Load the data

In [2]:
# Load the proof-of-concept table; 'latin' selects Python's Latin-1 codec.
# An Excel export can be loaded instead with
# pd.read_excel(<path to .xlsx>, engine='openpyxl').
df = pd.read_csv(
    "./intermediate_data/proof_of_concept.csv",
    encoding='latin',
)


In [3]:
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2
# and removed in pandas 2.0.
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 50 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id_inv         40 non-null     object 
 1   Mg             40 non-null     float64
 2   Al             40 non-null     float64
 3   Si             40 non-null     float64
 4   P              40 non-null     float64
 5   S              40 non-null     float64
 6   Cl             40 non-null     float64
 7   K              40 non-null     float64
 8   Ca             40 non-null     float64
 9   Sc             40 non-null     float64
 10  Ti             40 non-null     float64
 11  V              40 non-null     float64
 12  Cr             40 non-null     float64
 13  Mn             40 non-null     float64
 14  Fe             40 non-null     float64
 15  Co             40 non-null     float64
 16  Ni             40 non-null     float64
 17  Cu             40 non-null     float64
 18  Zn          

#### This function loads a pre-trained model, performs a prediction on the data, and formats the output dataframe

In [4]:
# Short keys accepted by prediction_function, mapped to the saved-model names
# that it passes to load_model.
MODEL_NAMES = dict(
    model1='strunz_classifier',
    model2='classifier',
)

def prediction_function(df: pd.DataFrame, model_key: str) -> pd.DataFrame:
    """
    Score a dataframe with one of the pre-trained models and summarise the result.

    Steps:
    * Loads the saved model registered under ``model_key`` in ``MODEL_NAMES``.
    * Runs ``predict_model`` on ``df`` with ``raw_score=True``.
    * Collects every column of the prediction output whose name contains
      'score' (case-insensitive).
    * For each row, keeps the three highest values among those columns as a
      ``{column name: value}`` dict, ordered from highest to lowest, and
      stores it in a new 'top_3_probs' column.
    * Drops the individual score columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to score.
    model_key : str
        Key of ``MODEL_NAMES`` selecting the model to load.

    Returns
    -------
    pandas.DataFrame
        The output of ``predict_model`` without the score columns and with
        the additional 'top_3_probs' column. The predicted-label column is
        whatever ``predict_model`` emits (the rest of this notebook refers to
        it as 'prediction_label').

    Raises
    ------
    KeyError
        If ``model_key`` is not a key of ``MODEL_NAMES``.
    """
    if model_key not in MODEL_NAMES:
        raise KeyError(
            f"Unknown model_key {model_key!r}; expected one of {sorted(MODEL_NAMES)}"
        )

    model = load_model(MODEL_NAMES[model_key])
    pred_prob = predict_model(estimator=model, data=df, raw_score=True)

    # NOTE(review): the pattern matches *any* column containing 'score', so an
    # input feature with such a name would be ranked and dropped as well.
    score_pattern = re.compile('.*Score.*', re.IGNORECASE)
    # str() keeps non-string column labels from raising a TypeError in match().
    score_cols = [col for col in pred_prob.columns if score_pattern.match(str(col))]

    def select_top_three_cols(row):
        """Return the row's three largest scores as {column: value}, highest first."""
        return row.nlargest(3).to_dict()

    pred_prob['top_3_probs'] = pred_prob[score_cols].apply(select_top_three_cols, axis=1)

    return pred_prob.drop(columns=score_cols)

### Using the prediction function with the two models

In [5]:
# Model 1: predict the major class for every sample.
# Only the predicted label is kept (under a descriptive name); the top-3
# probability column is dropped, so the frame handed to the next step does
# not contain it. Built as one chain so nothing is mutated in place.
preds = (
    prediction_function(df, 'model1')
    .rename(columns={'prediction_label': 'Predicted Major Class'})
    .drop(columns=['top_3_probs'])
)


Transformation Pipeline and Model Successfully Loaded


In [7]:
# Model 2: score the model-1 output, which carries the
# 'Predicted Major Class' column alongside the original data.
preds2 = prediction_function(df=preds, model_key='model2')
# Optional renaming, currently disabled, so the result keeps the
# 'prediction_label' / 'top_3_probs' column names.
# NOTE(review): the earlier draft of this line mapped 'top_3_probs' to
# 'top_3_Major Class', which looks like a copy-paste slip - confirm the name.
# preds2 = preds2.rename(columns={'prediction_label': 'Predicted Mineral Species',
#                                 'top_3_probs': 'top_3_Mineral Species'})
preds2

Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,id_inv,Mg,Al,Si,P,S,Cl,K,Ca,Sc,...,Tl,Pb,Th,xrd1,xrd2,xrd3,posible nueva,Predicted Major Class,prediction_label,top_3_probs
0,MP-700,0.021383,42.869385,12.344677,28.192818,0.519786,6.801766,1.001314,4.038676,0.000116,...,0.002339,3e-06,2e-06,Perovskite,Spinel,FeO,Variscita?,"Phosphates,Arsenates,Vanadates",Variscite,"{'prediction_score_Variscite': 0.9707, 'predic..."
1,MP-799,0.534567,39.314144,11.827674,26.687469,1.101149,1.310557,0.892404,9.142787,0.005346,...,0.000535,1e-05,0.000535,Pectolite,Lopezite,,Turquoise/Planerite Cu,Carbonates(Nitrates),Planerite,"{'prediction_score_Planerite': 0.9533, 'predic..."
2,MP-807,0.520289,50.391808,6.222907,26.851988,0.662452,2.396853,0.005203,5.367243,0.005203,...,0.00052,1e-05,0.00052,Petalite,,,No determined,"Phosphates,Arsenates,Vanadates",Aheylite,"{'prediction_score_Aheylite': 0.962, 'predicti..."
3,MP-790,0.650851,4.771027,91.863289,0.930238,0.185348,0.584022,0.006509,0.559617,0.006509,...,0.000651,1e-05,0.000651,Quartz,,,,Silicates,Illite,"{'prediction_score_Illite': 0.6142, 'predictio..."
4,MP-792,0.731535,19.859779,25.031332,0.822605,0.30165,2.50414,1.814898,1.83988,0.007315,...,0.000732,1e-05,0.000732,Hematite,,,OxFE/Quartz,Oxides,Berlinite,"{'prediction_score_Berlinite': 0.9227, 'predic..."
5,MP-793,0.720164,15.499458,43.52985,0.392935,0.058853,0.84957,1.789634,0.717145,0.007202,...,0.00072,8.2e-05,0.00072,Fe oxide,,,,Silicates,Berlinite,"{'prediction_score_Berlinite': 0.9653, 'predic..."
6,MP-796,0.612001,9.388919,12.987021,1.192824,0.587567,1.88745,1.255997,70.84436,0.00612,...,0.000612,1e-05,0.000612,Fe oxide,,,No determined,Carbonates(Nitrates),Calcite,"{'prediction_score_Calcite': 0.9658, 'predicti..."
7,MP-798,0.537475,27.42293,36.912857,26.012712,3.379084,1.356347,1.942432,1.084257,0.005375,...,0.000537,1e-05,0.000537,Alunite,Variscite,Quartz,,"Phosphates,Arsenates,Vanadates",Variscite,"{'prediction_score_Variscite': 0.9697, 'predic..."
8,MP-885,0.600805,16.558767,29.152895,1.051515,14.015098,2.870984,4.120258,20.448328,0.006008,...,0.000601,1e-05,0.000601,Trolleite,Quartz,,Iron Phosphate,Sulfates,Quartz,"{'prediction_score_Quartz': 0.9637, 'predictio..."
9,MP-1000,0.598263,51.716003,17.943424,0.584472,0.93509,3.225559,0.005983,19.582903,0.005983,...,0.000598,0.000125,0.000598,Antigorite,,,???,Silicates,Clinochlore,"{'prediction_score_Clinochlore': 0.7816, 'pred..."


#### The dataframe with the predictions is stored in the `intermediate_data` folder

In [8]:

# Persist the combined predictions. With the default index=True the row index
# is also written, as the first (unnamed) CSV column.
preds2.to_csv(path_or_buf="./intermediate_data/proof_of_concept_predictions.csv")