# <center> GRANITES CLASSIFICATION

In [None]:
# Standard library
import os

# Utilities
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict


# Models
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


# SMOTE
from imblearn.over_sampling import SMOTE

# Bayesian Optimization
from bayes_opt import BayesianOptimization

# Metrics
from sklearn.metrics import f1_score, precision_score, recall_score, \
    classification_report, accuracy_score, confusion_matrix, \
        ConfusionMatrixDisplay

from sklearn.utils import compute_sample_weight
from sklearn.model_selection import cross_val_score
import shap

# Saving and loading trained models
import joblib

# Silence every Python warning and xgboost's own logging so cell outputs stay readable.
# NOTE: this also hides deprecation/convergence warnings from all libraries.
import warnings
warnings.filterwarnings("ignore")
xgb.set_config(verbosity=0)

# Global matplotlib defaults, followed by the seaborn "ticks" theme.
plt.rcParams['font.size'] = 14
plt.rcParams['axes.grid'] = False
plt.rcParams['xtick.bottom'] = True
sns.set_style("ticks")

In [None]:
###################################################################################################
#funções usadas no notebook
###################################################################################################

def treino_de_classificadores(classificadores, X_treino, y_treino, pesos=None):
    '''Fit each classifier and print its 10-fold cross-validated training metrics.

    Every estimator in ``classificadores`` is fitted in place on the full
    training data (optionally with ``pesos`` as ``sample_weight``), so the
    objects are ready for prediction afterwards. Weighted precision, recall
    and F1 are then computed from ``cross_val_predict`` out-of-fold
    predictions and printed under the estimator's class name.

    Note: ``pesos`` is only forwarded to the full-data ``fit``; the
    cross-validation call itself receives no sample weights.

    Parameters
    ----------
    classificadores : iterable of estimators exposing ``fit(X, y, sample_weight=...)``.
    X_treino : training features.
    y_treino : training labels (flattened with ``np.ravel`` for ``fit``).
    pesos : optional per-sample weights passed to ``fit``.
    '''
    for estimador in classificadores:
        # Final fit on all training rows; cross_val_predict works on clones.
        estimador.fit(X_treino, np.ravel(y_treino), sample_weight=pesos)
        y_fora_da_dobra = cross_val_predict(estimador, X_treino, y_treino, cv=10)
        precisao, revocacao, pontuacao_f1 = (
            metrica(y_treino, y_fora_da_dobra, average='weighted')
            for metrica in (precision_score, recall_score, f1_score)
        )
        print(type(estimador).__name__)
        print(f'precisão:{precisao}, revocação:{revocacao}, F1 score: {pontuacao_f1}')
        
def teste_de_modelos(modelos, X_teste, y_teste):
    for modelo in modelos:
        previsoes_teste = modelo.predict(X_teste)
        print(modelo.__class__.__name__)
        print(classification_report(y_teste, previsoes_teste))
        print('_________________________________________________________________________________________________________')
        
def metricas_score(modelos, X_teste, y_teste, imb_treatment='No_treatment'):
    '''Evaluate fitted models on a test set and tabulate the scores.

    Parameters
    ----------
    modelos : iterable of fitted estimators exposing ``predict``.
    X_teste : test features passed to ``predict``.
    y_teste : true test labels.
    imb_treatment : label stored in the ``imb_treatment`` column of every row
        (e.g. which imbalance treatment / model family was evaluated).

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by its position in ``modelos``, with the
        columns ``Modelo`` (class name), ``F1`` and ``Precisão`` (both
        weighted averages), ``Acurácia`` and ``imb_treatment``.
    '''
    colunas = ['Modelo', 'F1', 'Acurácia', 'Precisão', 'imb_treatment']
    linhas = {}
    for posicao, estimador in enumerate(modelos):
        y_previsto = estimador.predict(X_teste)
        linhas[posicao] = (
            type(estimador).__name__,
            f1_score(y_teste, y_previsto, average='weighted'),
            accuracy_score(y_teste, y_previsto),
            precision_score(y_teste, y_previsto, average='weighted'),
            imb_treatment,
        )

    return pd.DataFrame.from_dict(linhas, orient='index', columns=colunas)

## <center> LOADING AND PREPARING DATA

In [None]:
# Load the curated dataset
database = pd.read_csv('dataset/GDB_Bonin(2020)_update.csv')

# Split the frame into predictors (oxide columns) and the target column
colunas_preditoras = ['SiO2', 'TiO2', 'Al2O3', 'FeOt', 'MnO', 'MgO', 'CaO',
                      'K2O', 'Na2O']
coluna_alvo = ['Group']

database_preditores = database[colunas_preditoras]

# Kept as a one-column DataFrame (not a Series)
database_alvo = database[coluna_alvo]

## <center> EXPLORATORY DATA ANALYSIS

In [None]:
# 3x3 grid: one boxplot per predictor column, split by granite group.
ordem_grupos = ['1- CPG & MPG', '2- Archaean TTG', '3- RTG', '4- ACG & ATG',
       '5- KCG', '6-PAG and syenites']

fig, axes = plt.subplots(3, 3, figsize=(15,15))
plt.subplots_adjust(wspace=0.40, hspace=0.1)

n_linhas, n_colunas = axes.shape
for (linha, coluna), painel in np.ndenumerate(axes):
    # Panels are filled row by row, following the predictor column order.
    oxido = database_preditores.columns[linha * n_colunas + coluna]
    sns.boxplot(ax=painel, data=database_preditores, x=database_alvo['Group'],
                y=oxido, palette="Paired", order=ordem_grupos)
    if linha < n_linhas - 1:
        # Upper rows: blank out the group labels to avoid clutter.
        painel.set_xticklabels("", rotation=45, ha='right')
    else:
        # Bottom row: keep the group labels, rotated for legibility.
        painel.set_xticklabels(painel.get_xticklabels(), rotation=45, ha='right')

plt.setp(axes, xlabel=None);
plt.savefig('boxplot.jpeg', bbox_inches='tight')

# <center> DATA PREPROCESSING

In [None]:
encoder.classes_

In [None]:
#Realizando um amostragem estratificada
X_train, X_test, y_train, y_test = train_test_split(database_preditores, database_alvo, 
                                                    test_size=0.30, stratify=database_alvo, random_state=42)

scaler = StandardScaler()
encoder = LabelEncoder()
X_train_scaled = scaler.fit_transform(X_train)
y_train_encoded = encoder.fit_transform(y_train)

# <center> TRAINING BASE MODELS

In [None]:
# Linear Discriminant Analysis baseline: fit on the scaled training data and
# keep its 3-component projection of the training set for plotting.
# (LinearDiscriminantAnalysis is imported in the import cell at the top.)
lda_clf = LinearDiscriminantAnalysis(n_components=3)
lda_model = lda_clf.fit(X_train_scaled, y_train_encoded).transform(X_train_scaled)

# Test data goes through the transformers fitted on the training split
models_base = [lda_clf]
X_test_scaled = scaler.transform(X_test)
y_test_encoded = encoder.transform(y_test)


metricas_modelos = metricas_score(models_base, X_test_scaled, y_test_encoded, 'lda')
metricas_modelos

In [None]:
# First two LDA components of the training set, coloured by encoded class
lda_fig, lda_ax = plt.subplots(figsize=(10,10))
sns.scatterplot(x=lda_model[:,0], y=lda_model[:,1], hue=y_train_encoded, palette="deep", ax=lda_ax)
lda_ax.legend(loc='upper left')
# Switches the global seaborn style; it is set after this figure's axes were created
sns.set_style("whitegrid");


# <center> TRAINING AND TESTING THE BEST MODELS

In [None]:
# Reference models with library-default hyperparameters (fixed seed only)
rf, xgb_clf, lgbm = (
    RandomForestClassifier(random_state=42),
    XGBClassifier(random_state=42),
    LGBMClassifier(random_state=42),
)

# Fits each model in place and prints its cross-validated training metrics
modelos_base = [rf, xgb_clf, lgbm]
treino_de_classificadores(modelos_base, X_train_scaled, y_train_encoded);

In [None]:
# Tuned models
# NOTE(review): hyperparameters presumably come from a separate Bayesian
# optimization run that is not part of this notebook - confirm.

# max_depth must be an integer: recent scikit-learn releases validate
# parameter types and reject the float 10.0.
best_rf = RandomForestClassifier(max_depth=10, max_features=4, 
                                 min_samples_split=4, n_estimators=150, 
                                 random_state=42)

best_xgb = XGBClassifier(colsample_bylevel=0.5,
              colsample_bynode=0.5, colsample_bytree=0.999,
              enable_categorical=False, learning_rate=0.1, max_delta_step=None, max_depth=6,
              n_estimators=193, scale_pos_weight=None, subsample=0.5, verbosity=None, random_state=42)


best_lgbm = LGBMClassifier(colsample_bytree=0.5, max_depth=9,
                           n_estimators=141,subsample=0.999, random_state=42)

modelos_best = [best_rf, best_xgb, best_lgbm]
# Optional class balancing: pass pesos=compute_sample_weight("balanced", y_train_encoded)
treino_de_classificadores(modelos_best, X_train_scaled, y_train_encoded);

In [None]:
# Tuned models trained on a SMOTE-oversampled training set.
# SMOTE is seeded so the synthetic samples (and every downstream score) are
# reproducible across runs.
smote = SMOTE(random_state=42)
scaler_smt = StandardScaler()
encoder_smt = LabelEncoder()

# Oversample the raw training split first, then fit a dedicated scaler and
# encoder on the resampled data.
X_train_smt, y_train_smt = smote.fit_resample(X_train, y_train)
X_train_smt_scaled = scaler_smt.fit_transform(X_train_smt)
y_train_smt_encoded = encoder_smt.fit_transform(y_train_smt)

# max_depth must be an integer: recent scikit-learn releases reject 10.0.
# NOTE(review): n_estimators is 100 here but 150 for best_rf - confirm this
# difference is intentional.
best_rf_sm = RandomForestClassifier(max_depth=10, max_features=4, min_samples_split=4, 
                                    n_estimators=100, random_state=42)

best_xgb_sm = XGBClassifier(colsample_bylevel=0.5,
              colsample_bynode=0.5, colsample_bytree=0.999,
              enable_categorical=False, learning_rate=0.1, max_delta_step=None, max_depth=6,
              n_estimators=193, scale_pos_weight=None, subsample=0.5, verbosity=None,
                           random_state=42)


best_lgbm_sm = LGBMClassifier(colsample_bytree=0.5, max_depth=9,
                           n_estimators=141,subsample=0.999, random_state=42)


# NOTE: the cross-validated scores printed here are computed on the
# oversampled data, so validation folds contain synthetic samples.
modelos_best_smote = [best_rf_sm, best_xgb_sm, best_lgbm_sm]
treino_de_classificadores(modelos_best_smote, X_train_smt_scaled, y_train_smt_encoded)

# Persist every fitted model; create the output folder first so a fresh
# checkout does not fail on the first dump.
os.makedirs('models', exist_ok=True)

modelos_para_salvar = {
    # Base models
    'models/RF_base.joblib': rf,
    'models/XGB_base.joblib': xgb_clf,
    'models/LGBM_base.joblib': lgbm,
    # Best (tuned) models
    'models/RF_opt.joblib': best_rf,
    'models/XGB_opt.joblib': best_xgb,
    'models/LGBM_opt.joblib': best_lgbm,
    # Best + SMOTE models
    'models/RF_smote.joblib': best_rf_sm,
    'models/XGB_smote.joblib': best_xgb_sm,
    'models/LGBM_smote.joblib': best_lgbm_sm,
}
for caminho, modelo in modelos_para_salvar.items():
    joblib.dump(modelo, caminho)

# <center> TRAINING AND TESTING THE BEST MODELS (SMOTE)

# <center> MODEL EVALUATION

In [None]:
# Apply the transformers fitted on the SMOTE training data to the test split.
# The encoder is only applied here (transform), never re-fitted on test labels.
X_test_scaled_smt = scaler_smt.transform(X_test)
y_test_encoded_smt = encoder_smt.transform(y_test)

In [None]:
# Scale test features / encode test labels with the transformers fitted on the training split
X_test_scaled, y_test_encoded = scaler.transform(X_test), encoder.transform(y_test)

# Score each model family on the test set; the SMOTE models use their own scaled/encoded test data
test_base, test_best, test_smote = (
    metricas_score(familia, X_avaliacao, y_avaliacao, rotulo)
    for familia, X_avaliacao, y_avaliacao, rotulo in (
        (modelos_base, X_test_scaled, y_test_encoded, 'Base'),
        (modelos_best, X_test_scaled, y_test_encoded, 'Best'),
        (modelos_best_smote, X_test_scaled_smt, y_test_encoded_smt, 'SMOTE'),
    )
)

In [None]:
# Stack the three score tables and renumber rows 0..n-1.
# `drop` expects a boolean; drop=True discards the old per-table index.
metrics_all_models = pd.concat([test_base, test_best, test_smote]).reset_index(drop=True)
metrics_all_models

In [None]:
# Test-set predictions of the three tuned models
random_forest_predictions, XGBboost_predictions, LGBM_predictions = (
    modelo.predict(X_test_scaled) for modelo in (best_rf, best_xgb, best_lgbm)
)

# One confusion matrix per model
conf_matrix1, conf_matrix2, conf_matrix3 = (
    confusion_matrix(y_test_encoded, previsoes)
    for previsoes in (random_forest_predictions, XGBboost_predictions, LGBM_predictions)
)

# Side-by-side panels: (matrix, title, colormap, axis properties applied after plotting).
# Only the first panel keeps its y ticks/label; only the middle one keeps its x label.
paineis = [
    (conf_matrix1, "Random Forest", 'Blues', {'xlabel': None}),
    (conf_matrix2, "XGBoost", 'YlOrBr', {'yticks': [], 'ylabel': None}),
    (conf_matrix3, "LGBM", 'YlGn', {'yticks': [], 'ylabel': None, 'xlabel': None}),
]

fig, ax = plt.subplots(1,3,figsize=(15, 10))
for eixo, (matriz, titulo, mapa_cores, propriedades) in zip(ax, paineis):
    eixo.set_title(titulo)
    eixo.grid(False)
    cm = ConfusionMatrixDisplay(matriz, display_labels=encoder.classes_)
    cm.plot(ax=eixo, xticks_rotation=270, cmap=mapa_cores, colorbar=False)
    plt.setp(eixo, **propriedades)

plt.subplots_adjust(wspace=0.250, hspace=0.1)
plt.savefig('confusion_matrix_best_models.jpeg', bbox_inches='tight')

In [None]:
# Test-set F1 per model, one panel per model family.
# Each entry: (score table, panel title, y tick positions).
paineis_f1 = [
    (test_base, "Base Models", [0.6,0.7,0.8,0.9,1]),
    (test_best, "Opt. Models", [0.7,0.8,0.9,1]),
    (test_smote, "Opt. Models + SMOTE", [0.7,0.8,0.9,1]),
]

fig, ax = plt.subplots(1,3,figsize=(15, 5))
for posicao, (tabela, titulo, marcas_y) in enumerate(paineis_f1):
    ax[posicao].grid(False)
    ax[posicao] = sns.barplot(ax=ax[posicao], x=tabela['Modelo'], y=tabela['F1'], data=tabela, palette="Paired")
    # Write the F1 value on top of every bar
    for container in ax[posicao].containers:
        ax[posicao].bar_label(container)
    ax[posicao].tick_params(axis='x', labelrotation=45)
    ax[posicao].set_yticks(marcas_y)
    ax[posicao].tick_params(axis='y', colors='black')
    # Common y range so the three panels are visually comparable
    ax[posicao].set_ylim(0.7, 1)
    ax[posicao].set_title(titulo)

plt.subplots_adjust(wspace=0.30, hspace=0.1)
plt.savefig('f1_scores.jpeg', bbox_inches='tight')

In [None]:
# SHAP values of the tuned Random Forest on the test set.
# The values must come from this cell's own explainer (explainer_rf).
explainer_rf = shap.TreeExplainer(best_rf)
shap_values_rf = explainer_rf.shap_values(X_test_scaled)

# The features passed to the plot are the same rows the SHAP values were computed on.
shap.summary_plot(shap_values_rf, X_test_scaled, feature_names=X_train.columns,
                  class_names=encoder.classes_,class_inds='original',
                  color=plt.get_cmap("Paired"), show=False)
plt.title("SHAP Random Forest")
plt.savefig('shap_rf.jpeg', bbox_inches='tight')

In [None]:
# SHAP values of the tuned XGBoost model on the test set.
# The values must come from this cell's own explainer (explainer_xgb).
explainer_xgb = shap.TreeExplainer(best_xgb)
shap_values_xgb = explainer_xgb.shap_values(X_test_scaled)

# The features passed to the plot are the same rows the SHAP values were computed on.
shap.summary_plot(shap_values_xgb, X_test_scaled, feature_names=X_train.columns,
                  class_names=encoder.classes_,class_inds='original',
                  color=plt.get_cmap("Paired"), show=False)
plt.title("SHAP XGBoost")
plt.savefig('shap_xgb.jpeg', bbox_inches='tight')

In [None]:
# SHAP values of the tuned LightGBM model on the test set.
explainer_lgbm = shap.TreeExplainer(best_lgbm)
shap_values_lgbm = explainer_lgbm.shap_values(X_test_scaled)

# The features passed to the plot are the same rows the SHAP values were computed on.
shap.summary_plot(shap_values_lgbm, X_test_scaled, feature_names=X_train.columns,
                  class_names=encoder.classes_,class_inds='original',
                  color=plt.get_cmap("Paired"), show=False)
plt.tight_layout()
plt.title("SHAP LGBM")
plt.savefig('shap_lgbm.jpeg', bbox_inches='tight')