# <center> GRANITE CLASSIFICATION

In [None]:
# Utilities
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing and model selection
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV

# Models
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb

# SMOTE
from imblearn.over_sampling import SMOTE

# Evaluation
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils import compute_sample_weight

# Saving and loading trained models
import joblib

# Silence Python warnings and XGBoost logging so cell outputs stay readable
import warnings
warnings.simplefilter("ignore")
xgb.set_config(verbosity=0)

# Global matplotlib settings
plt.rcParams['font.size'] = 14

In [None]:
#funções usadas no notebook

def treino_de_classificadores(classificadores, X_treino, y_treino, pesos=None):
    """Fit every classifier on the training data and print its 10-fold CV scores.

    Each estimator is fitted in place on the full training set and then
    evaluated with ``cross_val_predict`` (``cv=10``). Macro-averaged precision,
    recall and F1 of the out-of-fold predictions are printed per estimator.

    Parameters
    ----------
    classificadores : iterable of estimators
        Objects exposing ``fit(X, y, sample_weight=...)``.
    X_treino : array-like
        Training features.
    y_treino : array-like
        Training labels; flattened to 1-D before use.
    pesos : array-like or None, default None
        Optional per-sample weights. They are used for the final fit and are
        also forwarded to the fits performed inside the cross-validation, so
        the printed scores describe the same training procedure as the fitted
        model.

    Returns
    -------
    None
    """
    import inspect  # local import: only needed to pick the keyword below

    y_treino = np.ravel(y_treino)

    # cross_val_predict refits clones of the estimator, so the weights must be
    # passed explicitly. Newer scikit-learn releases call the argument
    # `params`, older ones `fit_params`; use whichever this version accepts.
    argumentos_cv = {}
    if pesos is not None:
        parametros_aceitos = inspect.signature(cross_val_predict).parameters
        nome_argumento = 'params' if 'params' in parametros_aceitos else 'fit_params'
        argumentos_cv[nome_argumento] = {'sample_weight': pesos}

    for classificador in classificadores:
        classificador.fit(X_treino, y_treino, sample_weight=pesos)
        y_previsao = cross_val_predict(classificador, X_treino, y_treino, cv=10, **argumentos_cv)
        precisao = precision_score(y_treino, y_previsao, average = 'macro')
        revocacao = recall_score(y_treino, y_previsao, average = 'macro')
        pontuacao_f1 = f1_score(y_treino, y_previsao, average = 'macro')
        print(classificador.__class__.__name__)
        print(f'precisão:{precisao}, revocação:{revocacao}, F1 score: {pontuacao_f1}')
        
def teste_de_modelos(modelos, X_teste, y_teste):
    for modelo in modelos:
        previsoes_teste = modelo.predict(X_teste)
        print(modelo.__class__.__name__)
        print(classification_report(y_teste, previsoes_teste))
        print('_________________________________________________________________________________________________________')
        
def metricas_score(modelos, X_teste, y_teste, imb_treatment='No_treatment'):
    """Score fitted models on test data and return the results as a table.

    Parameters
    ----------
    modelos : iterable of fitted estimators
        Objects exposing ``predict``.
    X_teste, y_teste : array-like
        Test features and true labels.
    imb_treatment : str, default 'No_treatment'
        Free-form tag copied into every row of the result.

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by its position in ``modelos``, with the
        columns 'Modelo' (class name), 'F1' (macro), 'Acurácia', 'Precisão'
        (macro) and 'imb_treatment'.
    """
    colunas = ['Modelo', 'F1', 'Acurácia', 'Precisão', 'imb_treatment']

    def _avaliar(modelo):
        # Build one result row for a single model
        previsao = modelo.predict(X_teste)
        f1 = f1_score(y_teste, previsao, average='macro')
        precisao = precision_score(y_teste, previsao, average='macro')
        acuracia = accuracy_score(y_teste, previsao)
        return modelo.__class__.__name__, f1, acuracia, precisao, imb_treatment

    linhas = {posicao: _avaliar(modelo) for posicao, modelo in enumerate(modelos)}
    return pd.DataFrame.from_dict(linhas, orient='index', columns=colunas)

## <center> LOADING AND PREPARING DATA

In [None]:
# Load the pre-processed dataset (semicolon-delimited CSV)
database = pd.read_csv(
    'dataset/db_bonin(2020)_update.csv',
    delimiter=';',
)

In [None]:
# Split the table into predictor variables and the target variable
colunas_preditoras = [
    'SiO2', 'TiO2', 'Al2O3',
    'FeOt', 'MnO', 'MgO',
    'CaO', 'K2O', 'Na2O',
]

database_preditores = database.loc[:, colunas_preditoras]
database_alvo = database.loc[:, ['Group']]

## <center> EXPLORATORY DATA ANALYSIS

In [None]:
# One boxplot per predictor column, split by the 'Group' column, on a 3x3 grid
fig, axes = plt.subplots(3, 3, figsize=(15,15))
plt.subplots_adjust(wspace=0.40, hspace=0.1)

# Draw all nine panels in one pass; axes.flat is row-major, so the panels
# follow the column order of database_preditores.
for eixo, coluna in zip(axes.flat, database_preditores.columns):
    sns.boxplot(ax=eixo, data=database_preditores, x=database_alvo['Group'], y=coluna, palette="pastel")

# Hide the x ticks on the upper rows only, and do it after plotting so the
# result does not depend on the order in which the panels were drawn.
for eixo in axes[:-1].flat:
    eixo.set_xticks([])

# The bottom row keeps its group labels, rotated for legibility
for eixo in axes[-1]:
    plt.setp(eixo.get_xticklabels(), rotation=45, ha='right')

plt.setp(axes, xlabel=None);
plt.savefig('boxplot.jpeg')

# <center> PREPROCESSING DATA

In [None]:
# Stratified train/test split (30% of the samples held out for testing)
X_train, X_test, y_train, y_test = train_test_split(database_preditores, database_alvo, test_size=0.30, stratify=database_alvo, random_state=42)

# Fit the scaler and the label encoder on the training split only
scaler = StandardScaler()
encoder = LabelEncoder()
X_train_scaled = scaler.fit_transform(X_train)
# y_train is a one-column DataFrame; LabelEncoder expects a 1-D array
y_train_encoded = encoder.fit_transform(np.ravel(y_train))

In [None]:
# Oversample the training split with SMOTE, then fit a dedicated scaler and
# label encoder on the resampled data.
smote = SMOTE(random_state=42)  # fixed seed so the synthetic samples are reproducible
scaler_smt = StandardScaler()
encoder_smt = LabelEncoder()

X_train_smt, y_train_smt = smote.fit_resample(X_train, y_train)
X_train_smt_scaled = scaler_smt.fit_transform(X_train_smt)
# LabelEncoder expects a 1-D array, so flatten the resampled target first
y_train_smt_encoded = encoder_smt.fit_transform(np.ravel(y_train_smt))

# <center> TRAINING BASE MODELS

In [None]:
# Baseline model: Linear Discriminant Analysis fitted on the standardized
# training data (LinearDiscriminantAnalysis is imported in the import cell).
lda_clf = LinearDiscriminantAnalysis(n_components=3)
# Despite its name, lda_model holds the training data projected by the fitted
# LDA (the output of .transform), not an estimator.
lda_model = lda_clf.fit(X_train_scaled, y_train_encoded).transform(X_train_scaled)

models_base = [lda_clf]
X_test_scaled = scaler.transform(X_test)
# y_test is a one-column DataFrame; LabelEncoder expects a 1-D array
y_test_encoded = encoder.transform(np.ravel(y_test))

metricas_modelos = metricas_score(models_base, X_test_scaled, y_test_encoded, 'lda')
metricas_modelos

In [None]:
# Scatter of the first two LDA components of the training data, coloured by class
sns.set_style("whitegrid")  # set before the axes are created so it applies to this figure
LEGENDA = encoder.classes_
fig, ax = plt.subplots(figsize=(10,10))
# Colour by the decoded class names so every legend entry is tied to its own
# class, instead of relabelling the legend handles by position afterwards.
sns.scatterplot(x=lda_model[:,0], y=lda_model[:,1],
                hue=encoder.inverse_transform(y_train_encoded), hue_order=list(LEGENDA),
                palette="deep", alpha=0.7, ax=ax)
ax.legend(loc='upper left');


# <center> TRAINING MODELS AND PERFORMING HYPERPARAMETER TUNING

In [None]:
#RANDOM FOREST's

# Base estimator for the search; it was referenced here without being defined
# in the cells above. NOTE(review): confirm this matches the intended base
# configuration (seeded with 42 like the other models in this notebook).
rnd_clf = RandomForestClassifier(random_state=42)

param_grid_forest = [{'bootstrap': [False],
                      'criterion': ['entropy'],
                      'n_estimators': [50, 100], 
                      'max_features': [3, 4],
                      'max_depth': [None, 5, 8, 10]}]

# GridSearchCV is imported in the import cell at the top of the notebook
grid_search_forest = GridSearchCV(rnd_clf, param_grid_forest, cv = 5, scoring = 'accuracy', return_train_score = True, n_jobs=-1)
grid_search_forest.fit(X_train_scaled, np.ravel(y_train))



#ADABOOSTING

# The last learning rate was written as `0,80` (decimal comma), which Python
# reads as the two values 0 and 80; the intended value is 0.80.
param_grid_ADA = [{'n_estimators': [100],
                   'algorithm': ['SAMME'],
                   'learning_rate': [0.70, 0.75, 0.80],
                  }]

grid_search_ada = GridSearchCV(AdaBoostClassifier(DecisionTreeClassifier()), param_grid_ADA, 
                               cv = 5, scoring = 'accuracy', return_train_score = True, verbose = 2, n_jobs=-1) 
grid_search_ada.fit(X_train_scaled, np.ravel(y_train))

#XGBoost

...

# <center> TRAINING AND TESTING THE BEST MODELS

In [None]:
# Model configurations kept after the grid search

best_rnd_clf = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_features=3,
    bootstrap=False,
    random_state=42,
)

# XGBoost settings collected in one mapping so each value is easy to locate
xgb_params = {
    # objective and evaluation
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'base_score': 0.5,
    # booster and tree construction
    'booster': 'gbtree',
    'tree_method': 'exact',
    'n_estimators': 100,
    'num_parallel_tree': 5,
    'max_depth': 6,
    'min_child_weight': 1,
    'max_delta_step': 0,
    'gamma': 0,
    'learning_rate': 0.2,
    # row / column subsampling
    'subsample': 0.9,
    'colsample_bytree': 1,
    'colsample_bylevel': 0.3,
    'colsample_bynode': 1,
    # regularisation and weighting
    'reg_alpha': 0,
    'reg_lambda': 1,
    'scale_pos_weight': 1.5,
    # constraints
    'interaction_constraints': '',
    'monotone_constraints': '()',
    # runtime / miscellaneous
    'enable_categorical': False,
    'importance_type': None,
    'gpu_id': -1,
    'predictor': 'auto',
    'n_jobs': 8,
    'random_state': 42,
    'validate_parameters': 1,
    'verbosity': None,
}
xgb_best = xgb.XGBClassifier(**xgb_params)

best_ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=10),
    n_estimators=100,
    learning_rate=0.75,
    algorithm='SAMME',
    random_state=42,
)

modelos = [best_rnd_clf, xgb_best, best_ada_clf]

In [None]:
# Train the classifiers with "balanced" per-sample weights computed from y_train
pesos = compute_sample_weight(class_weight="balanced", y=y_train)
treino_de_classificadores(modelos, X_train_scaled, y_train_encoded, pesos=pesos)

In [None]:
# Apply the scaler and encoder fitted on the original training split to the test split
X_test_scaled, y_test_encoded = scaler.transform(X_test), encoder.transform(y_test)

# Score the class-weighted models on the held-out data
metricas_modelos = metricas_score(modelos, X_teste=X_test_scaled, y_teste=y_test_encoded, imb_treatment='wgt')

In [None]:
# Bar chart of the F1 column of metricas_modelos, one bar per model
sns.set(style="ticks")
plt.figure(figsize=(7, 7))
plt.ylim(0.5, 1)
ax = sns.barplot(data=metricas_modelos, x='Modelo', y='F1', palette="pastel")

# Write each bar's value above it
for barras in ax.containers:
    ax.bar_label(barras)

plt.xticks(rotation=45)
ax.set_yticks([0.5, 0.6, 0.7, 0.8, 0.9, 1])
ax.tick_params(axis='y', colors='black')
plt.savefig('Scores.jpg', dpi=600, format='jpg', bbox_inches='tight')

In [None]:
# Test-set predictions of the random forest and AdaBoost models
random_forest_predictions = best_rnd_clf.predict(X_test_scaled)
ada_boost_predictions = best_ada_clf.predict(X_test_scaled)

# Confusion matrices against the encoded test labels
conf_matrix1 = confusion_matrix(y_test_encoded, random_forest_predictions)
conf_matrix2 = confusion_matrix(y_test_encoded, ada_boost_predictions)

# Side-by-side display: one panel per model, each with its own colour map
fig, ax = plt.subplots(1, 2, figsize=(15, 15))
paineis = [
    (ax[0], "Random Forest", conf_matrix1, 'Blues'),
    (ax[1], "ADAboost", conf_matrix2, 'YlOrBr'),
]
for eixo, titulo, matriz, mapa_cores in paineis:
    eixo.set_title(titulo)
    cm = ConfusionMatrixDisplay(matriz, display_labels=encoder.classes_)
    cm.plot(ax=eixo, xticks_rotation=45, cmap=mapa_cores, colorbar=False)

plt.subplots_adjust(wspace=0.40, hspace=0.1)
plt.show()



# <center> TRAINING AND TESTING THE BEST MODELS (SMOTE)

In [None]:
# Re-fit the same estimators on the SMOTE-resampled training data (no sample weights)
treino_de_classificadores(
    classificadores=modelos,
    X_treino=X_train_smt_scaled,
    y_treino=y_train_smt_encoded,
)

In [None]:
# Transform the test split with the scaler/encoder fitted on the SMOTE data.
# NOTE(review): this overwrites X_test_scaled / y_test_encoded from the earlier
# cells, so re-running those cells afterwards would use these values instead.
X_test_scaled, y_test_encoded = scaler_smt.transform(X_test), encoder_smt.transform(y_test)

# Score the SMOTE-trained models on the held-out data
metricas_modelos_smote = metricas_score(modelos, X_teste=X_test_scaled, y_teste=y_test_encoded, imb_treatment='SMOTE')
metricas_modelos_smote