In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
import joblib
from sklearn.utils.class_weight import compute_sample_weight


df_dados = pd.read_csv(r"C:\Users\Igor\Documents\GitHub\DATATHON_2026\DATATHON_2026\data\dados_processados\BASE_DE_DADOS_PEDE_2024_DATATHON_LIMPO.csv", sep=';')

# #Variavel de risco
def classificar_defasagem(x):
    if x >= 0:
        return 0   # Em fase
    elif x >= -2:
        return 1   # Moderado
    else:
        return 2   # Severo

df_dados['Risco'] = df_dados['Defasagem'].apply(classificar_defasagem)



features = ['IDA','IEG','IPS','IPP','IAA','IPV','Idade']
X = df_dados[features]
y = df_dados['Risco']


# Treino e Teste
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,stratify=y 
)

############################### modelo de classificação ###############################


# Treinar modelo
################################### RANDON ##############################################
model_random = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_leaf=5,
    max_features='sqrt',
    class_weight='balanced',  
    random_state=42,
    n_jobs=-1             
)
model_random.fit(X_train, y_train)

##################################### XGBOOST ########################################

weights = compute_sample_weight(class_weight='balanced', y=y_train)
model_xg = XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    eval_metric='mlogloss'
)

model_xg.fit(X_train, y_train, sample_weight=weights)

######################################## GRADIENT #########################################

model_gradient = GradientBoostingClassifier()
model_gradient.fit(X_train, y_train)


# --- Avaliação Random Forest ---
pred_rf = model_random.predict(X_test)
print("### RELATÓRIO RANDOM FOREST ###")
print(classification_report(y_test, pred_rf))

# --- Avaliação XGBoost ---
pred_xg = model_xg.predict(X_test)
print("\n### RELATÓRIO XGBOOST ###")
print(classification_report(
    y_test, pred_xg,
    target_names=["Em fase", "Moderado", "Severo"]
))

# --- Avaliação Gradient Boosting ---
pred_gb = model_gradient.predict(X_test)
print("\n### RELATÓRIO GRADIENT BOOSTING ###")
print(classification_report(y_test, pred_gb))

### RELATÓRIO RANDOM FOREST ###
              precision    recall  f1-score   support

           0       0.69      0.82      0.75       195
           1       0.81      0.69      0.74       235
           2       0.50      0.43      0.46         7

    accuracy                           0.74       437
   macro avg       0.67      0.64      0.65       437
weighted avg       0.75      0.74      0.74       437


### RELATÓRIO XGBOOST ###
              precision    recall  f1-score   support

     Em fase       0.76      0.78      0.77       195
    Moderado       0.79      0.77      0.78       235
      Severo       0.12      0.14      0.13         7

    accuracy                           0.76       437
   macro avg       0.56      0.56      0.56       437
weighted avg       0.76      0.76      0.76       437


### RELATÓRIO GRADIENT BOOSTING ###
              precision    recall  f1-score   support

           0       0.73      0.76      0.74       195
           1       0.77      0.76

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Optamos por utilizar o modelo Random Forest, pois apresentou um desempenho superior em comparação aos outros modelos testados, com uma melhor precisão e recall para a classe de risco. Além disso, o Random Forest é conhecido por sua robustez e capacidade de lidar com dados complexos, o que o torna uma escolha adequada para este tipo de problema.

In [2]:
##################### Escolha do Modelo #################

joblib.dump(model_random, r"C:\Users\Igor\Documents\GitHub\DATATHON_2026\DATATHON_2026\app\modelo_risco_random.pkl")

['C:\\Users\\Igor\\Documents\\GitHub\\DATATHON_2026\\DATATHON_2026\\app\\modelo_risco_random.pkl']