In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
import joblib


# Load the labelled train/validation set and the unlabelled test set.
train_val_data = pd.read_csv('../data/projeto_va1_base_de_dados_train_val.csv')
test_data = pd.read_csv('../data/projeto_va1_base_de_dados_teste_sem_target.csv')

# Impute missing values the same way on BOTH frames. Filling only the training
# frame would let NaNs in the test set reach the scaler and the model, and would
# make the categorical encoding below treat missing values differently per frame.
train_val_data = train_val_data.fillna(0)
test_data = test_data.fillna(0)

categorical_columns = ['UF', 'SEXO', 'INSTRU', 'RACA_COR', 'BODY_SYSTEM']

# The training frame defines the feature layout; drop_first removes one
# reference level per categorical column.
train_val_data = pd.get_dummies(train_val_data, columns=categorical_columns, drop_first=True)

# Encode every level present in the test frame (no drop_first here): which level
# is the dropped reference must be decided by the training data only. Dropping
# the test frame's own first level could remove a level the model was trained on.
test_data = pd.get_dummies(test_data, columns=categorical_columns)

# Align the test frame to the training feature columns (target excluded):
# dummy columns absent from the test set are created as all-zero, columns the
# model never saw are discarded, and the column order matches training.
# NOTE(review): this relies on each categorical column having the same dtype in
# both CSVs so the generated dummy names match -- confirm.
feature_columns = train_val_data.columns.drop('H_COST')
test_data = test_data.reindex(columns=feature_columns, fill_value=0)

# Separate the target column (y) from the feature matrix (X).
y = train_val_data['H_COST']
X = train_val_data.drop(columns=['H_COST'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features. The scaler is fitted on the training rows only and
# the fitted scaler is then applied to the training, validation and test matrices.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
test_data_scaled = scaler.transform(test_data)

# Train a 100-tree random forest on the scaled training rows (seeded for reproducibility).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the model on the held-out validation rows.
y_val_pred = model.predict(X_val)
final_accuracy = accuracy_score(y_val, y_val_pred)
final_f1_score = f1_score(y_val, y_val_pred, average='weighted')
classification_rep = classification_report(y_val, y_val_pred)
conf_matrix = confusion_matrix(y_val, y_val_pred)
# Single summary figure: arithmetic mean of accuracy and weighted F1.
mean_score = (final_accuracy + final_f1_score) / 2

# Emit the metrics as (label, value) pairs, in a fixed order.
metric_lines = (
    ("Acurácia final:", final_accuracy),
    ("F1-score final:", final_f1_score),
    ("Relatório de classificação final:\n", classification_rep),
    ("Matriz de confusão final:\n", conf_matrix),
    ("\nMédia final entre Acurácia e F1-score:", mean_score),
)
for label, value in metric_lines:
    print(label, value)

# Persist the fitted model and the fitted scaler.
for artifact, artifact_path in (
    (model, '../outputs/modelo_arvore_decisao.pkl'),
    (scaler, '../outputs/scaler.pkl'),
):
    joblib.dump(artifact, artifact_path)

# Predict the target for the scaled, unlabelled test set.
test_predictions = model.predict(test_data_scaled)

# Save the preliminary predictions as a single-column CSV (no index column).
result = pd.DataFrame({'H_COST_PRED': test_predictions})
result.to_csv('../outputs/pre_resultado_com_predicoes.csv', index=False)


Acurácia final: 0.9595633971938858
F1-score final: 0.9559360470928457
Relatório de classificação final:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98    702293
           1       0.64      0.44      0.52     37235

    accuracy                           0.96    739528
   macro avg       0.81      0.71      0.75    739528
weighted avg       0.95      0.96      0.96    739528

Matriz de confusão final:
 [[693226   9067]
 [ 20837  16398]]

Média final entre Acurácia e F1-score: 0.9577497221433657
