In [1]:
from pycaret.classification import *

import pandas as pd
import numpy as np
import re

In [2]:
# Load the real-world validation samples that will be scored below.
df = pd.read_csv("./intermediate_DATA/evaluation_DATA/real_world_validation.csv")
# Alternative input (MACLAS preprocessed dataset), kept for reference only:
#df = pd.read_csv("/home/dsg/META_MODEL_APP/FINAL/intermediate_DATA/MACLAS_preprocessed_dataset.csv")

In [3]:
# `null_counts` was deprecated in pandas 1.2 and removed in 2.0;
# `show_counts` is the replacement and prints the same non-null counts.
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 54 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  21 non-null     int64  
 1   id_inv      21 non-null     object 
 2   site        21 non-null     object 
 3   id_yac      21 non-null     object 
 4   xrd1        21 non-null     object 
 5   xrd2        0 non-null      float64
 6   xrd3        0 non-null      float64
 7   DANA8       21 non-null     object 
 8   Strunz      21 non-null     object 
 9   Mg          21 non-null     float64
 10  Al          21 non-null     float64
 11  Si          21 non-null     float64
 12  P           21 non-null     float64
 13  S           21 non-null     float64
 14  Cl          21 non-null     float64
 15  K           21 non-null     float64
 16  Ca          21 non-null     float64
 17  Sc          21 non-null     float64
 18  Ti          21 non-null     float64
 19  V           21 non-null     flo

In [4]:
def prediction_function(df: pd.DataFrame, model_name: str = 'deployment_strunz_20230731') -> pd.DataFrame:
    """
    Predict on ``df`` and summarise the three highest class scores per row.

    Steps performed:
    * Loads the saved model called ``model_name`` with ``load_model``.
    * Runs ``predict_model`` on ``df`` with ``raw_score=True``.
    * Collects every resulting column whose name contains 'Score'
      (case-insensitive).
    * For each row keeps the three largest scores, in descending order, as a
      ``{column_name: score}`` dict stored in a new 'top_3_probs' column.
    * Drops the individual score columns so only the summary remains.

    Parameters
    ----------
    df : pandas.DataFrame
        Samples to score; passed as ``data`` to ``predict_model``.
    model_name : str, optional
        Name handed to ``load_model``. Defaults to
        'deployment_strunz_20230731'. Names that appeared in earlier,
        commented-out versions of this function:
        'deployment_XRD_Only_23features_20230707' and
        'deployment_XRD_Only_20230607'.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``predict_model`` without the score columns and
        with the extra 'top_3_probs' column. In this notebook's recorded
        output the prediction itself appears in a 'Label' column.
    """
    model = load_model(model_name)
    pred_prob = predict_model(estimator=model, data=df, raw_score=True)

    # Select the score columns; non-string labels cannot be regex-matched
    # and are never score columns, so they are skipped instead of raising.
    score_pattern = re.compile('.*Score.*', re.IGNORECASE)
    score_cols = [
        col for col in pred_prob.columns
        if isinstance(col, str) and score_pattern.match(col)
    ]

    def select_top_three_cols(row):
        # nlargest returns the values already sorted in descending order.
        top_three = row.nlargest(3)
        return dict(zip(top_three.index, top_three))

    pred_prob['top_3_probs'] = pred_prob[score_cols].apply(select_top_three_cols, axis=1)

    return pred_prob.drop(columns=score_cols)
    

In [5]:
# Score the real-world validation frame; the result carries the extra 'top_3_probs' column.
preds = prediction_function(df)

Transformation Pipeline and Model Successfully Loaded


In [6]:
# Preview the first rows of the predictions.
preds.head()

Unnamed: 0.1,Unnamed: 0,id_inv,site,id_yac,xrd1,xrd2,xrd3,DANA8,Strunz,Mg,Al,Si,P,S,Cl,K,Ca,Sc,Ti,V,Cr,Mn,Fe,Co,Ni,Cu,Zn,Ga,Ge,As,Se,Br,Rb,Sr,Y,Zr,Nb,Mo,Ru,Pd,Ag,Cd,In,Sn,Sb,Te,Ba,Ta,W,Au,Hg,Tl,Pb,Th,Label,top_3_probs
0,0,MP-799,"CAN FIGUERES, COVA",CAT01034,Pectolite,,,Inosilicates,Silicates,0.534567,39.314145,11.827674,26.68747,1.101149,1.310557,0.892404,9.142787,0.005346,0.17176,0.005346,0.184029,0.005346,1.778445,0.005346,0.005346,6.586235,0.109726,0.000535,0.000535,0.043309,0.0044,0.000535,0.002127,0.170034,0.000535,0.003457,0.000535,0.000535,0.000535,0.000535,0.000535,0.007561,0.000535,0.000535,0.000535,0.000535,0.091047,0.000535,0.00023,0.000535,0.000535,0.000535,1e-05,0.000535,Silicates,"{'Score_Silicates': 0.9994, 'Score_Carbonates(..."
1,1,MP-807,"CAN FIGUERES, COVA",CAT01034,Petalite,,,Phyllosilicates,Silicates,0.520289,50.391806,6.222907,26.851988,0.662452,2.396853,0.005203,5.367243,0.005203,0.083369,0.005203,0.124179,0.005203,0.661017,0.005203,0.005203,6.293889,0.285371,0.00052,0.00052,0.00052,0.00052,0.00052,0.00052,0.008848,0.00052,0.00052,0.00052,0.003904,0.00052,0.00052,0.00052,0.00052,0.00052,0.010344,0.00052,0.00052,0.072552,0.00052,0.000326,0.00052,0.00052,0.00052,1e-05,0.00052,Silicates,"{'Score_Silicates': 0.9987, 'Score_Phosphates,..."
2,2,MP-349,"CAN SADURNI, COVA",CAT01005,Sulphur,,,no_correspondence,no_correspondence,0.402388,5.578874,4.186923,6.533375,32.651131,49.715018,0.004024,0.186846,0.004024,0.004024,0.004024,0.142085,0.004024,0.344636,0.004024,0.004024,0.000402,0.030219,0.000402,0.000402,0.000402,0.121015,0.000402,0.000402,0.000402,0.000402,0.000402,0.000402,0.000402,0.000402,0.000402,0.027343,0.000402,0.000402,0.000402,0.000402,0.000402,0.000402,0.000402,0.000402,0.04192,0.000402,0.000402,1e-05,0.000402,"Phosphates,Arsenates,Vanadates","{'Score_Phosphates,Arsenates,Vanadates': 0.970..."
3,3,MP-788,"CAN SADURNI, COVA",CAT01005,Hydroxylapatite,,,"Anhydrous Phosphates, etc. Containing Hydroxyl...","Phosphates,Arsenates,Vanadates",0.548531,8.702245,5.506594,23.838427,0.736051,2.312648,0.005485,53.597718,0.005485,0.087092,0.005485,0.005485,0.282962,0.739527,0.005485,0.005485,3.491826,0.091366,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.017842,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,1e-05,0.000549,Silicates,"{'Score_Silicates': 0.4571, 'Score_Carbonates(..."
4,4,MP-790,"CAN SADURNI, COVA",CAT01005,Quartz,,,no_correspondence,Oxides,0.650851,4.771027,91.86329,0.930238,0.185348,0.584022,0.006509,0.559617,0.006509,0.062275,0.028491,0.006509,0.006509,0.284256,0.002761,0.006509,0.000651,0.000651,0.000651,0.000651,0.000651,0.000651,0.000651,0.000914,0.000651,0.000651,0.001427,0.000651,0.000651,0.000651,0.000651,0.000651,0.000651,0.000651,0.000651,0.000651,0.000651,0.026019,0.000651,0.000651,0.000651,0.000651,0.000651,1e-05,0.000651,Oxides,"{'Score_Oxides': 0.9987, 'Score_Silicates': 0...."


In [7]:
# Persist the validation predictions (the DataFrame index is written as the first CSV column).
preds.to_csv("./intermediate_DATA/evaluation_DATA/predictions_strunz_model.csv") 

**Deleted cases prediction**


In [8]:
# Load the deleted cases ("casos borrados") for a separate prediction run.
# NOTE(review): this is an absolute, machine-specific path, and the recorded run of
# this cell raised FileNotFoundError. Presumably the file belongs under a
# project-relative data directory — confirm its location before re-running.
deleted_cases = pd.read_csv('/home/dsg/META_MODEL_APP/EVALUATION/casos_borrados_XRD_only_test.csv', encoding='latin-1')

FileNotFoundError: [Errno 2] No such file or directory: '/home/dsg/META_MODEL_APP/EVALUATION/casos_borrados_XRD_only_test.csv'

In [8]:
# Score the deleted cases with the same prediction routine used for the validation set.
deleted_cases_pred = prediction_function(deleted_cases)

Transformation Pipeline and Model Successfully Loaded


In [9]:
# Write to a dedicated file: the original target, predictions_strunz_model.csv, is the
# file the validation predictions (`preds`) were saved to, so reusing it overwrote them.
deleted_cases_pred.to_csv("./intermediate_DATA/evaluation_DATA/predictions_strunz_model_deleted_cases.csv")