In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix
from sklearn.inspection import permutation_importance

### Carrega dataset com as features

In [None]:
# Read the enriched judgments table (tab-separated despite the .csv suffix),
# keep the label plus the five feature columns, and give three of the
# features more descriptive names.
clean_dataset = (
    pd.read_csv(
        "../1_enrich_results/queries_train_judged_expanded_enriched.csv",
        sep='\t',
    )[
        [
            "label",
            "relevant_count",
            "spearman",
            "words_similarity",
            "expansion_idf",
            "expansion_idf_difference",
        ]
    ]
    .rename(
        columns={
            'words_similarity': 'words_semantic_similarity',
            'relevant_count': 'k_relevance_judgments',
            'spearman': 'spearman_rank_correlation',
        }
    )
)

# Bare last expression so the notebook renders the resulting frame.
clean_dataset

### Avaliação k-fold usando XGBoost

In [None]:
# 5-fold cross-validation of an XGBoost classifier over `clean_dataset`.
# Collects per-fold metrics, draws one confusion matrix per fold plus their
# mean, and persists the fold model that achieved the best precision.
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

accuracy_scores = []
precision_scores = []
recall_scores = []
auc_scores = []
f1_micro_scores = []
f1_macro_scores = []

# Element-wise running sum of the per-fold 2x2 confusion matrices.
cumulative_cm = np.zeros((2, 2))

# Out-of-fold labels and predictions, appended fold by fold. Because the
# folds are shuffled, this is NOT the row order of `clean_dataset`.
all_true_labels = []
all_predictions = []

fig, axs = plt.subplots(1, n_folds, figsize=(20, 5))
fig.suptitle("Matriz de confusão para cada fold")

# Split features and target once instead of re-dropping the column per fold.
features = clean_dataset.drop('label', axis=1)
target = clean_dataset['label']

# Highest precision seen so far. The saved model is only replaced when a fold
# matches or beats it, so a weaker later fold can no longer overwrite a
# stronger earlier model (the previous code compared against the preceding
# fold only).
best_precision_score = 0

# Iterate over the folds
for i, (train_index, test_index) in enumerate(kf.split(clean_dataset)):
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    print(f"Fold {i} -> Tamanho X_train: {len(X_train)}, Tamanho X_test: {len(X_test)}, Tamanho y_train: {len(y_train)}, Tamanho y_test: {len(y_test)}")

    initial_model = xgb.XGBClassifier(
        eval_metric='logloss',
        n_estimators=500,
        max_depth=5,
        learning_rate=0.5,
    )
    initial_model.fit(X_train, y_train)

    y_pred = initial_model.predict(X_test)
    y_pred_proba = initial_model.predict_proba(X_test)[:, 1]

    all_true_labels.extend(y_test)
    all_predictions.extend(y_pred)

    # Computed once and reused for both the metric list and model selection.
    fold_precision = precision_score(y_test, y_pred)

    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(fold_precision)
    recall_scores.append(recall_score(y_test, y_pred))
    auc_scores.append(roc_auc_score(y_test, y_pred_proba))
    f1_micro_scores.append(f1_score(y_test, y_pred, average='micro'))
    f1_macro_scores.append(f1_score(y_test, y_pred, average='macro'))

    # Pin the label set so the matrix is always 2x2 and can be accumulated,
    # even if a fold happens to contain (or predict) a single class.
    # Assumes the label column is encoded as 0/1.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axs[i])
    axs[i].set_title(f'Fold {i+1}')
    axs[i].set_xlabel('Predição do modelo')
    axs[i].set_ylabel('Label real')

    cumulative_cm += cm

    # Persist the model only when its precision is the best seen so far.
    # NOTE: after the loop, `initial_model` is the model of the LAST fold,
    # which is not necessarily the one stored in xgboost_model.json.
    if fold_precision >= best_precision_score:
        initial_model.save_model("xgboost_model.json")
        print(f"modelo {i} salvo")
        best_precision_score = fold_precision

# Average the accumulated confusion matrix over the folds
mean_cm = cumulative_cm / n_folds

plt.figure(figsize=(8, 6))
sns.heatmap(mean_cm, annot=True, fmt=".2f", cmap='Blues')
plt.title("Média da matriz de confusão para todos os folds")
plt.xlabel("Predição do modelo")
plt.ylabel("Rótulo real")
plt.show()

# Mean of each metric across folds. AUC is collected in `auc_scores` but its
# report line was left disabled by the author.
print(f"Acurácia: {sum(accuracy_scores) / n_folds}")
print(f"Precisão: {sum(precision_scores) / n_folds}")
print(f"Recall: {sum(recall_scores) / n_folds}")
#print(f"AUC: {sum(auc_scores) / n_folds}")
print(f"F1 Micro: {sum(f1_micro_scores) / n_folds}")
print(f"F1 Macro: {sum(f1_macro_scores) / n_folds}")



In [None]:
# Permutation importance measured on accuracy. `initial_model`, `X_test` and
# `y_test` are whatever the cross-validation loop left behind, i.e. the model
# and held-out split of the last fold only.
results = permutation_importance(
    initial_model,
    X_test,
    y_test,
    n_repeats=10,
    random_state=42,
    scoring='accuracy',
)

# Ascending order, so the most important feature ends up at the top of the
# horizontal bar chart.
sorted_idx = np.argsort(results.importances_mean)

bar_labels = []
bar_lengths = []
for feature_pos in sorted_idx:
    bar_labels.append(X_test.columns[feature_pos])
    bar_lengths.append(results.importances_mean[feature_pos])

plt.barh(bar_labels, bar_lengths, height=0.4)
plt.xlabel('Contribuição de cada feature para o modelo')
plt.title('Permutação de features')
plt.show()

In [None]:
# Importance of each feature as mean +/- standard deviation, listed from the
# most to the least important. Only features whose mean importance stays
# above zero after subtracting two standard deviations are reported.
importance_means = results.importances_mean
importance_stds = results.importances_std

for feature_pos in np.argsort(importance_means)[::-1]:
    mean_value = importance_means[feature_pos]
    std_value = importance_stds[feature_pos]
    if not (mean_value - 2 * std_value > 0):
        continue
    print(f"{initial_model.feature_names_in_[feature_pos]:<8}"
          f"\t\t {mean_value:.3f}"
          f" +/- {std_value:.3f}")

### Avaliação de falsos positivos e negativos

In [None]:
# Error analysis: compare the feature distributions of true/false
# positives/negatives using the out-of-fold predictions.

# `all_true_labels` / `all_predictions` were appended fold by fold, so entry j
# belongs to the j-th row of the concatenated (shuffled) test folds, not to
# row j of `clean_dataset`. Rebuild the dataset row position of every entry by
# replaying the splitter: `kf` was created with an integer random_state, so
# each call to split() yields the same folds as the evaluation loop.
test_positions = np.concatenate(
    [test_index for _, test_index in kf.split(clean_dataset)]
)
assert len(test_positions) == len(all_true_labels) == len(all_predictions), (
    "out-of-fold predictions do not match the folds produced by `kf`; "
    "re-run the cross-validation cell"
)

# Analysis frame indexed by the ORIGINAL row position of each sample, so the
# index of any subset can be used directly to look up its features.
results_df = pd.DataFrame(
    {
        'True Labels': all_true_labels,
        'Predictions': all_predictions,
    },
    index=test_positions,
)

# Split the predictions into the four confusion-matrix categories
false_positives = results_df[(results_df['True Labels'] == 0) & (results_df['Predictions'] == 1)]
false_negatives = results_df[(results_df['True Labels'] == 1) & (results_df['Predictions'] == 0)]
true_positives = results_df[(results_df['True Labels'] == 1) & (results_df['Predictions'] == 1)]
true_negatives = results_df[(results_df['True Labels'] == 0) & (results_df['Predictions'] == 0)]

# Join back to the original dataset to recover the feature values
X_full = clean_dataset.drop('label', axis=1)

false_positive_indices = false_positives.index
fp_feature_values = X_full.iloc[false_positive_indices]
false_negative_indices = false_negatives.index
fn_feature_values = X_full.iloc[false_negative_indices]
true_positive_indices = true_positives.index
tp_feature_values = X_full.iloc[true_positive_indices]
true_negative_indices = true_negatives.index
tn_feature_values = X_full.iloc[true_negative_indices]

# Plot the comparison: one panel per category, one box per feature
fig, axs = plt.subplots(2, 2, figsize=(16, 12))

category_panels = [
    (axs[0, 0], tp_feature_values, 'Verdadeiros positivos'),
    (axs[0, 1], fp_feature_values, 'Falsos positivos'),
    (axs[1, 0], fn_feature_values, 'Falsos negativos'),
    (axs[1, 1], tn_feature_values, 'Verdadeiros Negativos'),
]
for panel_ax, category_features, panel_title in category_panels:
    category_features.boxplot(ax=panel_ax, rot=45)
    panel_ax.set_title(panel_title)

fig.tight_layout()
plt.show()




## Hipóteses de teste para falsos positivos
- Similaridade não é boa o suficiente
- A palavra substituída é genérica demais
- O idf dela é alto demais

In [None]:
# Side-by-side boxplots of the semantic-similarity feature for true positives
# (position 1, blue) and false positives (position 2, salmon).
fig, ax = plt.subplots(figsize=(16, 12))

for group_frame, slot, fill_color in (
    (tp_feature_values, 1, "skyblue"),
    (fp_feature_values, 2, "salmon"),
):
    group_frame.boxplot(
        column="words_semantic_similarity",
        ax=ax,
        positions=[slot],
        widths=0.6,
        patch_artist=True,
        boxprops=dict(facecolor=fill_color),
    )

ax.set_title('Comparação da similaridade semântica entre verdadeiros positivos e falsos positivos')

# Each boxplot call rewrites the x ticks, so set them after both are drawn.
ax.set_xticks([1, 2])
ax.set_xticklabels(['Verdadeiros positivos', 'Falsos positivos'])

# Fixed y-range for this plot; values outside 0.6-1.0 are not shown.
ax.set_ylim(0.6, 1.0)

plt.show()


In [None]:
# Side-by-side boxplots of the Spearman rank-correlation feature for true
# positives (position 1, blue) and false positives (position 2, salmon).
fig, ax = plt.subplots(figsize=(16, 12))

for group_frame, slot, fill_color in (
    (tp_feature_values, 1, "skyblue"),
    (fp_feature_values, 2, "salmon"),
):
    group_frame.boxplot(
        column="spearman_rank_correlation",
        ax=ax,
        positions=[slot],
        widths=0.6,
        patch_artist=True,
        boxprops=dict(facecolor=fill_color),
    )

ax.set_title('Comparação da correlação Spearman entre verdadeiros positivos e falsos positivos')

# Each boxplot call rewrites the x ticks, so set them after both are drawn.
ax.set_xticks([1, 2])
ax.set_xticklabels(['Verdadeiros positivos', 'Falsos positivos'])

# The y-axis is left on autoscale here (no fixed limits for this feature).

plt.show()

In [None]:
# Side-by-side boxplots of the expansion-word IDF feature for true positives
# (position 1, blue) and false positives (position 2, salmon).
fig, ax = plt.subplots(figsize=(16, 12))

for group_frame, slot, fill_color in (
    (tp_feature_values, 1, "skyblue"),
    (fp_feature_values, 2, "salmon"),
):
    group_frame.boxplot(
        column="expansion_idf",
        ax=ax,
        positions=[slot],
        widths=0.6,
        patch_artist=True,
        boxprops=dict(facecolor=fill_color),
    )

ax.set_title('Comparação de idf da palavra expandida entre verdadeiros positivos e falsos positivos')

# Each boxplot call rewrites the x ticks, so set them after both are drawn.
ax.set_xticks([1, 2])
ax.set_xticklabels(['Verdadeiros positivos', 'Falsos positivos'])

# The y-axis is left on autoscale here (no fixed limits for this feature).

plt.show()