In [None]:

# ====================================================
# NUTRITIONAL CATEGORY CLASSIFICATION
# ====================================================

# 1. Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# ====================================================
# 2. Load the dataset
# ====================================================
# On Google Colab, upload the archive first:
# from google.colab import files
# uploaded = files.upload()

# Unpack every member of the archive into the current working directory.
zip_file_path = "/food.csv (1) (1).zip"
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=".")

# Read food.csv from the working directory
# (presumably the file the archive just produced — confirm the member name).
df = pd.read_csv("food.csv")

# Report the (rows, columns) shape and preview the first rows.
dataset_shape = df.shape
print(" Dimensões do dataset:", dataset_shape)
display(df.head())

# ====================================================
# 3. Basic analysis
# ====================================================
print("\n Informações gerais:")
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only appended a stray "None" to the cell output.
df.info()

# Count of missing values for each column.
print("\n Valores ausentes por coluna:")
display(df.isnull().sum())

# ====================================================
# 4. Define features (X) and target (y)
# ====================================================
# List the available columns so the target name can be checked against the file.
print("\nColunas disponíveis no DataFrame:")
print(df.columns.tolist())

# 'Category' is the label column of this dataset.
target_column = 'Category'

# Columns that identify a record instead of describing its nutrients.
# 'Description' is free text (e.g. "BUTTER,WITH SALT"); one-hot encoding it
# would add a dummy column for nearly every distinct description and, because
# the text embeds the category name, would let the model memorise labels.
# 'Nutrient Data Bank Number' looks like a record identifier — TODO confirm.
non_feature_columns = ['Description', 'Nutrient Data Bank Number']

y = df[target_column]
# errors='ignore' keeps the cell working if the file lacks one of these columns.
X = (
    df.drop(columns=[target_column])
      .drop(columns=non_feature_columns, errors='ignore')
)

# ====================================================
# 5. Pre-processing
# ====================================================

# One-hot encode the categorical feature columns.
X = pd.get_dummies(X, drop_first=True)

# Drop rows that still contain NaN and restrict y to the same rows.
X = X.dropna()
y = y.loc[X.index]

# Encode the target as integers when it holds text labels.
if y.dtype == 'object':
    le = LabelEncoder()
    y = le.fit_transform(y)

# Classes with a single sample are removed: the stratified split performed
# later needs at least two members per class.
# LabelEncoder returns a bare NumPy array, so X's index is re-attached
# explicitly. Building the Series without it would give a fresh 0..n-1 index,
# and the boolean mask below would no longer align with X once dropna() has
# removed any row.
y_series = pd.Series(np.asarray(y), index=X.index)
class_counts = y_series.value_counts()
single_occurrence_classes = class_counts[class_counts < 2].index

# Keep only the rows whose class occurs at least twice, in both X and y.
indices_to_keep = ~y_series.isin(single_occurrence_classes)
X = X.loc[indices_to_keep]
y = y_series.loc[indices_to_keep].to_numpy()  # back to a NumPy array for consistency

# ====================================================
# 6. Train/test split and standardisation
# ====================================================
# Split first. Fitting the scaler on the full dataset would let the test rows
# influence the mean/variance used to transform the training rows (leakage).
# The split arguments are unchanged, so the same rows land in each partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix under the training-fitted scaler; kept so that any code still
# referring to X_scaled continues to work.
X_scaled = scaler.transform(X)

# ====================================================
# 7. Train the model
# ====================================================
# Depth-limited decision tree with a fixed seed. fit() returns the estimator
# itself, so construction and training are chained.
tree_params = {"max_depth": 4, "random_state": 42}
model = DecisionTreeClassifier(**tree_params).fit(X_train, y_train)

# ====================================================
# 8. Evaluation
# ====================================================
y_pred = model.predict(X_test)

print("\n Relatório de Classificação:")
# Classes that are never predicted have undefined precision. zero_division=0
# reports them as 0 explicitly instead of emitting a warning for each metric.
print(classification_report(y_test, y_pred, zero_division=0))

print(f"Acurácia geral: {accuracy_score(y_test, y_pred):.2f}")

# ====================================================
# 9. Confusion matrix
# ====================================================
cm = confusion_matrix(y_test, y_pred)

# Draw on an explicit Axes rather than relying on pyplot's current-figure state.
fig_cm, ax_cm = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g', ax=ax_cm)
ax_cm.set_title("Matriz de Confusão ½")
ax_cm.set_xlabel("Previsto")  # columns: predicted class
ax_cm.set_ylabel("Real")      # rows: true class
plt.show()

# ====================================================
# 10. Feature importances
# ====================================================
# Pair each importance with its column name and sort ascending, so the largest
# values sit at the end of the Series.
importances = pd.Series(
    model.feature_importances_, index=X.columns
).sort_values(ascending=True)

# The last 15 entries are the 15 most important features.
top_features = importances.tail(15)

fig_imp, ax_imp = plt.subplots(figsize=(8, 6))
top_features.plot(kind='barh', color='mediumseagreen', ax=ax_imp)
ax_imp.set_title("Importância das Principais Variáveis § ")
ax_imp.set_xlabel("Importância")
ax_imp.set_ylabel("Atributos")
plt.show()

# ====================================================
# 11. Decision tree visualisation
# ====================================================
plt.figure(figsize=(18,10))
plot_tree(model,
          filled=True,
          rounded=True,
          fontsize=8,
          # Passed as a plain list: older scikit-learn releases document this
          # parameter as a list of str rather than any array-like.
          feature_names=list(X.columns),
          # Names must line up with the classes the tree was trained on, in
          # ascending order. model.classes_ is exactly that set, whereas
          # np.unique(y) could include a class absent from the training split
          # and shift every label.
          class_names=[str(c) for c in model.classes_])
plt.title("Árvore de Decisão ³")
plt.show()

 Dimensões do dataset: (7413, 48)


Unnamed: 0,Category,Description,Nutrient Data Bank Number,Data.Alpha Carotene,Data.Ash,Data.Beta Carotene,Data.Beta Cryptoxanthin,Data.Carbohydrate,Data.Cholesterol,Data.Choline,...,Data.Major Minerals.Potassium,Data.Major Minerals.Sodium,Data.Major Minerals.Zinc,Data.Vitamins.Vitamin A - IU,Data.Vitamins.Vitamin A - RAE,Data.Vitamins.Vitamin B12,Data.Vitamins.Vitamin B6,Data.Vitamins.Vitamin C,Data.Vitamins.Vitamin E,Data.Vitamins.Vitamin K
0,BUTTER,"BUTTER,WITH SALT",1001,0,2.11,158,0,0.06,215,19,...,24,576,0.09,2499,684,0.17,0.003,0.0,2.32,7.0
1,BUTTER,"BUTTER,WHIPPED,WITH SALT",1002,0,2.11,158,0,0.06,219,19,...,26,827,0.05,2499,684,0.13,0.003,0.0,2.32,7.0
2,BUTTER OIL,"BUTTER OIL,ANHYDROUS",1003,0,0.0,193,0,0.0,256,22,...,5,2,0.01,3069,840,0.01,0.001,0.0,2.8,8.6
3,CHEESE,"CHEESE,BLUE",1004,0,5.11,74,0,2.34,75,15,...,256,1395,2.66,763,198,1.22,0.166,0.0,0.25,2.4
4,CHEESE,"CHEESE,BRICK",1005,0,3.18,76,0,2.79,94,15,...,136,560,2.6,1080,292,1.26,0.065,0.0,0.26,2.5



 Informações gerais:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7413 entries, 0 to 7412
Data columns (total 48 columns):
 #   Column                                                   Non-Null Count  Dtype  
---  ------                                                   --------------  -----  
 0   Category                                                 7413 non-null   object 
 1   Description                                              7413 non-null   object 
 2   Nutrient Data Bank Number                                7413 non-null   int64  
 3   Data.Alpha Carotene                                      7413 non-null   int64  
 4   Data.Ash                                                 7413 non-null   float64
 5   Data.Beta Carotene                                       7413 non-null   int64  
 6   Data.Beta Cryptoxanthin                                  7413 non-null   int64  
 7   Data.Carbohydrate                                        7413 non-null   float64
 8   Dat

None


 Valores ausentes por coluna:


Unnamed: 0,0
Category,0
Description,0
Nutrient Data Bank Number,0
Data.Alpha Carotene,0
Data.Ash,0
Data.Beta Carotene,0
Data.Beta Cryptoxanthin,0
Data.Carbohydrate,0
Data.Cholesterol,0
Data.Choline,0



Colunas disponíveis no DataFrame:
['Category', 'Description', 'Nutrient Data Bank Number', 'Data.Alpha Carotene', 'Data.Ash', 'Data.Beta Carotene', 'Data.Beta Cryptoxanthin', 'Data.Carbohydrate', 'Data.Cholesterol', 'Data.Choline', 'Data.Fiber', 'Data.Kilocalories', 'Data.Lutein and Zeaxanthin', 'Data.Lycopene', 'Data.Manganese', 'Data.Niacin', 'Data.Pantothenic Acid', 'Data.Protein', 'Data.Refuse Percentage', 'Data.Retinol', 'Data.Riboflavin', 'Data.Selenium', 'Data.Sugar Total', 'Data.Thiamin', 'Data.Water', 'Data.Fat.Monosaturated Fat', 'Data.Fat.Polysaturated Fat', 'Data.Fat.Saturated Fat', 'Data.Fat.Total Lipid', 'Data.Household Weights.1st Household Weight', 'Data.Household Weights.1st Household Weight Description', 'Data.Household Weights.2nd Household Weight', 'Data.Household Weights.2nd Household Weight Description', 'Data.Major Minerals.Calcium', 'Data.Major Minerals.Copper', 'Data.Major Minerals.Iron', 'Data.Major Minerals.Magnesium', 'Data.Major Minerals.Phosphorus', 'Data

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  fig.canvas.print_figure(bytes_io, **kw)
  fig.canvas.print_figure(bytes_io, **kw)
