In [1]:
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
import joblib

# Load the unlabeled test set that predictions will be generated for.
test_data = pd.read_csv('../data/projeto_va1_base_de_dados_teste_sem_target.csv')

categorical_columns = ['UF', 'SEXO', 'INSTRU', 'RACA_COR', 'BODY_SYSTEM']

# The labeled train/validation file is the reference for the feature layout:
# its one-hot encoded columns (minus the target) define which columns the
# test matrix must have, and in which order.
# NOTE(review): presumably this mirrors the encoding used when the scaler and
# model were fitted -- confirm against the training notebook.
train_val_data = pd.read_csv('../data/projeto_va1_base_de_dados_train_val.csv')
train_val_data = pd.get_dummies(train_val_data, columns=categorical_columns, drop_first=True)
feature_columns = train_val_data.columns.drop('H_COST')

# Encode the test set WITHOUT drop_first. With drop_first=True the dummy that
# gets dropped is the first category present in *this* file; if the test file
# lacks the reference file's first category, a different dummy would be
# dropped and those rows would silently be encoded as the baseline category.
# Keeping every dummy and letting the reindex below discard the ones the
# reference layout does not have yields the reference encoding in all cases.
test_data = pd.get_dummies(test_data, columns=categorical_columns)

# Align to the reference layout in one step: columns absent from the test set
# are created and filled with 0, columns unknown to the reference are dropped,
# and the column order matches the reference.
test_data = test_data.reindex(columns=feature_columns, fill_value=0)

# Load the scaler and the trained model from the outputs directory.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load artifacts
# that come from a trusted source.
scaler = joblib.load('../outputs/scaler.pkl')
model = joblib.load('../outputs/modelo_arvore_decisao.pkl')

# Scale the aligned test features with the loaded scaler, then predict.
test_data_scaled = scaler.transform(test_data)
test_predictions = model.predict(test_data_scaled)

# Save the predictions as a single-column CSV (no index column).
result = pd.DataFrame(data=test_predictions, columns=['H_COST_PRED'])
result.to_csv(path_or_buf='../outputs/resultado_final_com_predicoes.csv', index=False)


# Score the model on the labeled train/validation file.
# NOTE(review): these metrics are computed over the whole train_val file; if
# the model was fitted on (part of) this same file the numbers are optimistic
# and should not be read as held-out performance -- confirm how it was split.
y_val = train_val_data['H_COST']
X_val = train_val_data.drop(columns=['H_COST'])
X_val_scaled = scaler.transform(X_val)
y_val_pred = model.predict(X_val_scaled)

# Individual metrics; the F1 score is the support-weighted average over classes.
final_accuracy = accuracy_score(y_val, y_val_pred)
final_f1_score = f1_score(y_val, y_val_pred, average='weighted')
classification_rep = classification_report(y_val, y_val_pred)
conf_matrix = confusion_matrix(y_val, y_val_pred)

# Single summary number: arithmetic mean of accuracy and weighted F1.
mean_score = (final_accuracy + final_f1_score) / 2

# Print every metric as "<label> <value>", in a fixed order.
for label, value in (
    ("Acurácia final:", final_accuracy),
    ("F1-score final:", final_f1_score),
    ("Relatório de classificação final:\n", classification_rep),
    ("Matriz de confusão final:\n", conf_matrix),
    ("\nMédia final entre Acurácia e F1-score:", mean_score),
):
    print(label, value)


Acurácia final: 0.979333563750697
F1-score final: 0.977873450567276
Relatório de classificação final:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99   3512575
           1       0.90      0.66      0.76    185063

    accuracy                           0.98   3697638
   macro avg       0.94      0.83      0.88   3697638
weighted avg       0.98      0.98      0.98   3697638

Matriz de confusão final:
 [[3498232   14343]
 [  62074  122989]]

Média final entre Acurácia e F1-score: 0.9786035071589865
