In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# 1. Data loading
df = pd.read_csv('Obesity.csv')

# 2. Data cleaning
# The columns below carry decimal noise, so each one is rounded to the
# nearest integer.
colunas_com_ruido = ['FCVC', 'NCP', 'CH20', 'FAF', 'TUE']

# Alternative spelling tried only when the expected column name is absent
# from the CSV (e.g. CH2O instead of CH20).
nomes_alternativos = {'CH20': 'CH2O', 'TUE': 'TER'}

for esperado in colunas_com_ruido:
    candidatos = (esperado, nomes_alternativos.get(esperado))
    # First candidate actually present in the frame; None when neither the
    # expected name nor its alternative exists.
    alvo = next(
        (nome for nome in candidatos if nome is not None and nome in df.columns),
        None,
    )
    if alvo is None:
        # Best effort: a column missing under every known name is skipped.
        continue
    df[alvo] = df[alvo].round().astype(int)

# 3. Feature engineering
# IMC (body mass index): weight divided by the square of height.
# NOTE(review): the usual BMI formula assumes kilograms and metres - confirm
# the units used in the CSV.
df['IMC'] = df['Weight'].div(df['Height'].pow(2))

# 4. Feature / target separation
# 'Obesity' is the target; X keeps every other column, including the IMC
# column created above.
y = df['Obesity']
X = df.drop(columns='Obesity')

# Encode the target labels as integers. The fitted encoder is kept in `le`
# so the integer codes can be mapped back to the original labels later.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# 5. Preprocessing configuration
# Two columns may appear under either of two names (CH20/CH2O and TUE/TER).
# Pick whichever is present so the feature lists match the actual frame.
if 'CH20' in df.columns:
    agua_col = 'CH20'
else:
    agua_col = 'CH2O'

if 'TUE' in df.columns:
    tech_col = 'TUE'
else:
    tech_col = 'TER'

# Numeric columns are standardised; categorical columns are one-hot encoded.
num_features = [
    'Age', 'Height', 'Weight', 'IMC',
    'FCVC', 'NCP', agua_col, 'FAF', tech_col,
]
cat_features = [
    'Gender', 'family_history', 'FAVC', 'CAEC',
    'SMOKE', 'SCC', 'CALC', 'MTRANS',
]

escalonador = StandardScaler()
# handle_unknown='ignore': categories not seen during fit encode as all zeros
# instead of raising at prediction time.
codificador = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', escalonador, num_features),
        ('cat', codificador, cat_features),
    ],
)

# 6. Pipeline construction
# Preprocessing and the classifier are chained so the transformers are
# fitted only on the data passed to pipeline.fit().
classificador = RandomForestClassifier(
    n_estimators=150,
    random_state=42,
    n_jobs=-1,
)
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', classificador),
    ],
)

# 7. Train/test split (80% train, 20% test)
# Stratified on the encoded target so both partitions keep the class
# proportions; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# 8. Model training
pipeline.fit(X_train, y_train)

# 9. Evaluation
# Predict on the held-out split and report overall accuracy plus per-class
# precision / recall / F1.
y_pred = pipeline.predict(X_test)
acuracia = accuracy_score(y_test, y_pred)

print(f"🎯 Acurácia do Modelo: {acuracia * 100:.2f}%\n")
print("📊 Relatório de Classificação:")

# le.classes_ holds the original labels ordered by their integer code.
target_names = le.classes_

# `labels` is passed explicitly so every row lines up with target_names even
# when a class is missing from y_test / y_pred. Without it,
# classification_report raises ValueError if the number of classes found in
# the data differs from len(target_names).
print(
    classification_report(
        y_test,
        y_pred,
        labels=np.arange(len(target_names)),
        target_names=target_names,
    )
)

# 10. Export artefacts for deployment
# The fitted pipeline (preprocessing + classifier) and the label encoder are
# saved separately: the encoder is needed to turn predicted integer codes
# back into class names.
# NOTE(review): both paths are relative, so the files are written to the
# current working directory; the message below assumes that directory is
# fiap_challenge4 - confirm.
joblib.dump(pipeline, 'pipeline_obesidade.pkl')
joblib.dump(le, 'label_encoder.pkl')
print("\n✅ Arquivos 'pipeline_obesidade.pkl' e 'label_encoder.pkl' salvos com sucesso na pasta fiap_challenge4!")

🎯 Acurácia do Modelo: 98.35%

📊 Relatório de Classificação:
                     precision    recall  f1-score   support

Insufficient_Weight       1.00      0.98      0.99        54
      Normal_Weight       0.93      0.98      0.96        58
     Obesity_Type_I       0.99      1.00      0.99        70
    Obesity_Type_II       0.98      0.98      0.98        60
   Obesity_Type_III       1.00      0.98      0.99        65
 Overweight_Level_I       0.98      0.95      0.96        58
Overweight_Level_II       1.00      1.00      1.00        58

           accuracy                           0.98       423
          macro avg       0.98      0.98      0.98       423
       weighted avg       0.98      0.98      0.98       423


✅ Arquivos 'pipeline_obesidade.pkl' e 'label_encoder.pkl' salvos com sucesso na pasta fiap_challenge4!
