## Proof-of-Concept
This notebook presents a use case of the pre-trained models developed [here](./Model_development.ipynb).


In [None]:
from pycaret.classification import *

import pandas as pd
import numpy as np
import re

### Load the data

In [2]:
# Read the proof-of-concept samples from the intermediate CSV (decoded as Latin-1).
# An alternative source used during development was the "proof_of_work" sheet of
# ./DATA/MACLAS_DATASET.xlsx (pd.read_excel with engine='openpyxl').
df = pd.read_csv(
    "./intermediate_data/proof_of_concept.csv",
    encoding="latin",
)


In [3]:
# Print the full per-column summary (name, non-null count, dtype).
# `show_counts` replaces the `null_counts` keyword, which pandas deprecated in
# 1.2 and removed in 2.0.
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 52 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  20 non-null     int64  
 1   id_inv      20 non-null     object 
 2   site        20 non-null     object 
 3   id_yac      20 non-null     object 
 4   xrd1        20 non-null     object 
 5   xrd2        6 non-null      object 
 6   xrd3        2 non-null      object 
 7   Mg          20 non-null     float64
 8   Al          20 non-null     float64
 9   Si          20 non-null     float64
 10  P           20 non-null     float64
 11  S           20 non-null     float64
 12  Cl          20 non-null     float64
 13  K           20 non-null     float64
 14  Ca          20 non-null     float64
 15  Sc          20 non-null     float64
 16  Ti          20 non-null     float64
 17  V           20 non-null     float64
 18  Cr          20 non-null     float64
 19  Mn          20 non-null     flo

### This function loads the pre-trained models, performs a prediction on the data and formats the output dataframe

In [3]:
# Maps the short keys accepted by `prediction_function` to the names of the
# saved models handed to `load_model`.
MODEL_NAMES = {
    'model1': 'strunz_classifier',
    'model2': 'classifier'
}

# Any column whose name contains "score" (case-insensitive) is treated as a
# per-class score column produced by the prediction step.
_SCORE_COLUMN_PATTERN = re.compile('.*Score.*', re.IGNORECASE)


def _top_scores(row, top_n):
    """Return the `top_n` largest entries of `row` as a {column: value} dict, largest first."""
    best = row.nlargest(top_n)
    return dict(zip(best.index, best))


def prediction_function(df, model_key, top_n=3):
    """
    Predict with one of the pre-trained models and condense the class scores.

    Steps:
    * Loads the model registered under `model_key` in `MODEL_NAMES`.
    * Calls `predict_model(..., raw_score=True)` on `df`.
    * Collects every output column whose name contains "score" (any case).
    * For each row, keeps the `top_n` highest of those scores as a dict
      ``{score_column_name: score}`` ordered from highest to lowest, stored in
      a new column named ``top_<top_n>_probs`` (``top_3_probs`` by default).
    * Drops the individual score columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Samples to classify. It is passed to `predict_model` as-is; the new
        column is added to the frame returned by `predict_model`.
    model_key : str
        One of the keys of `MODEL_NAMES` ('model1' or 'model2').
    top_n : int, default 3
        How many of the highest scores to keep per row.

    Returns
    -------
    pandas.DataFrame
        The frame returned by `predict_model` without its score columns and
        with the extra ``top_<top_n>_probs`` column. The predicted class
        column is whatever `predict_model` emits (the rest of this notebook
        refers to it as 'prediction_label').

    Raises
    ------
    KeyError
        If `model_key` is not a key of `MODEL_NAMES`.
    ValueError
        If `top_n` is smaller than 1.
    """
    if model_key not in MODEL_NAMES:
        raise KeyError(
            f"Unknown model_key {model_key!r}; expected one of {sorted(MODEL_NAMES)}"
        )
    if top_n < 1:
        raise ValueError(f"top_n must be a positive integer, got {top_n!r}")

    model = load_model(MODEL_NAMES[model_key])
    pred_prob = predict_model(estimator=model, data=df, raw_score=True)

    # str() keeps the match from failing on non-string column labels.
    score_columns = [
        column for column in pred_prob.columns
        if _SCORE_COLUMN_PATTERN.match(str(column))
    ]

    pred_prob[f'top_{top_n}_probs'] = pred_prob[score_columns].apply(
        lambda row: _top_scores(row, top_n), axis=1
    )

    # The per-class columns are now summarised in the dict column; remove them.
    return pred_prob.drop(columns=score_columns)

### Using the prediction function with the two models

In [4]:
# Model 1: predict with the 'model1' entry and give the two generic result
# columns model-specific names, which keeps them distinguishable from the
# columns added by the next prediction call.
preds = prediction_function(df, 'model1').rename(
    columns={
        'prediction_label': 'Predicted_Major_Groups',
        'top_3_probs': 'top_3_probs_Major_Groups',
    }
)
# To leave the top-3 score column of this model out of the result, drop
# 'top_3_probs_Major_Groups' from `preds` here.
                                                    


Transformation Pipeline and Model Successfully Loaded


In [5]:
# Model 2: run the second classifier on the frame that already carries the
# model-1 results, then preview the first rows.
preds2 = prediction_function(df=preds, model_key='model2')
preds2.head()

Transformation Pipeline and Model Successfully Loaded


Unnamed: 0.1,Unnamed: 0,id_inv,site,id_yac,xrd1,xrd2,xrd3,Mg,Al,Si,...,Ta,W,Au,Hg,Tl,Pb,Th,Predicted_strunz,prediction_label,top_3_probs
0,0,MP-799,Can Figueres,CAT01034,Pectolite,Lopezite,,0.534567,39.314144,11.827674,...,0.000535,0.00023,0.000535,0.000535,0.000535,1e-05,0.000535,Carbonates(Nitrates),Planerite,"{'prediction_score_Planerite': 0.9533, 'predic..."
1,1,MP-807,Can Figueres,CAT01034,Petalite,,,0.520289,50.391808,6.222907,...,0.00052,0.000326,0.00052,0.00052,0.00052,1e-05,0.00052,"Phosphates,Arsenates,Vanadates",Aheylite,"{'prediction_score_Aheylite': 0.962, 'predicti..."
2,2,MP-349,Can Sadurni,CAT01005,Sulphur,,,0.402388,5.578874,4.186923,...,0.000402,0.000402,0.04192,0.000402,0.000402,1e-05,0.000402,"Phosphates,Arsenates,Vanadates",Variscite,"{'prediction_score_Variscite': 0.951, 'predict..."
3,3,MP-788,Can Sadurni,CAT01005,Hydroxylapatite,,,0.548531,8.702245,5.506594,...,0.000549,0.000549,0.000549,0.000549,0.000549,1e-05,0.000549,"Phosphates,Arsenates,Vanadates",Aheylite,"{'prediction_score_Aheylite': 0.8919, 'predict..."
4,4,MP-790,Can Sadurni,CAT01005,Quartz,,,0.650851,4.771027,91.863289,...,0.000651,0.000651,0.000651,0.000651,0.000651,1e-05,0.000651,Silicates,Illite,"{'prediction_score_Illite': 0.6142, 'predictio..."


#### The dataframe with the predictions is stored in the `intermediate_data` folder

In [8]:

# Persist the final frame: `preds2` is the output of the second prediction
# pass, which was run on the frame holding the model-1 results.
# index=False keeps the default row index out of the file, so it does not come
# back as an extra "Unnamed: 0" column when the CSV is read again.
preds2.to_csv("./intermediate_data/proof_of_work_predictions.csv", index=False)