# Formatação das tabelas


## Setup

### Imports

In [None]:
import pandas as pd
import re 
import unicodedata 

### Baixar o CSV do site kaggle.com

In [None]:
import kagglehub

# Kaggle "<owner>/<dataset>" slug of the TBCA food-composition dataset.
TBCA_DATASET = "claudiapastorello/tbca-tabela-brasileira-de-composio-alimentos"

# Fetch the latest published version and report where the files were stored.
# NOTE(review): the CSV-loading cells visible in this notebook read from
# relative "../../../docs" paths rather than from `path` — confirm whether the
# downloaded files are expected to be copied there manually.
path = kagglehub.dataset_download(TBCA_DATASET)
print("Path to dataset files:", path)

### Função para remover acentuação


In [None]:
def remove_accents(input_str):
    """Return ``input_str`` with accents stripped, keeping only ASCII characters.

    The text is decomposed with Unicode NFKD normalization so that accented
    letters split into a base letter plus combining marks; encoding to ASCII
    with ``errors='ignore'`` then discards the marks, along with any other
    character that has no ASCII form.

    Args:
        input_str: The value to clean. Anything that is not a ``str`` is
            returned unchanged.

    Returns:
        The ASCII-only string, or the original object when it is not a ``str``.
    """
    if isinstance(input_str, str):
        decomposed = unicodedata.normalize('NFKD', input_str)
        return decomposed.encode('ASCII', 'ignore').decode('utf-8')

    # Non-string values (numbers, NaN, None, ...) pass through untouched.
    return input_str

# Tabela TBCA

## Hands On

### Carregando CSV

In [None]:
# Load the TBCA export:
#   sep     - the file uses ';' as the column separator
#   usecols - keep only the first 39 columns
tbca = pd.read_csv(
    "../../../docs/TBCA_20223103csv.csv",
    sep=";",
    usecols=range(39),
)

# Preview the first rows (the first 39 columns are the relevant ones).
tbca.head(10)

### Tratando os dados

In [None]:
# Build the cleaned table from the raw `tbca` frame without modifying it.
# Missing cells and the "tr" marker (presumably "trace amount" — confirm
# against the dataset notes) both become the string "0". `replace` returns a
# new frame, so its result has to be assigned for the change to take effect.
formatted_tbca = tbca.replace([pd.NA, "tr"], "0")

# Strip accents from every cell. remove_accents already returns non-string
# values unchanged, so no extra type check is needed around it.
formatted_tbca = formatted_tbca.map(remove_accents)

# Normalize the headers: lowercase, trimmed, spaces -> underscores, no accents.
formatted_tbca.columns = (
    formatted_tbca.columns
    .str.lower()
    .str.strip()
    .str.replace(" ", "_")
    .map(remove_accents)
)

# Food names are lowercased; the step is skipped when the column is absent.
if "alimento" in formatted_tbca.columns:
    formatted_tbca["alimento"] = formatted_tbca["alimento"].str.lower()

# Persist with the same ';' separator used by the source file.
formatted_tbca.to_csv("../../../docs/formattedTBCA.csv", index=False, sep=";")

# Show the first cleaned row as a quick sanity check.
formatted_tbca.iloc[0, :]

# Tabela TACO

## Hands On  

### Carregando CSV

In [None]:
# Load the comma-separated TACO table, keeping only its first 13 columns.
taco = pd.read_csv(
    "../../../docs/datasets/TACO.csv",
    sep=",",
    usecols=range(13),
)

# Preview the first rows.
taco.head(10)

### Tratando os dados

In [None]:
# Cell values that stand for "no measurable value" in the raw TACO file.
# The string markers are matched exactly, including their leading space.
# NOTE(review): the leading spaces suggest the CSV has a space after each
# comma; other string cells probably keep that leading space too — confirm
# whether they should be stripped as well.
TACO_EMPTY_MARKERS = [pd.NA, " NA", " Tr", " ", " *"]

# Build the cleaned table from the raw `taco` frame without modifying it.
# Every marker becomes the string "0" in a single frame-wide pass, then
# accents are stripped from each cell (remove_accents leaves non-string
# values unchanged).
formatted_taco = taco.replace(TACO_EMPTY_MARKERS, "0").map(remove_accents)

# Normalize the headers: lowercase, trimmed, spaces -> underscores, no accents.
formatted_taco.columns = (
    formatted_taco.columns
    .str.lower()
    .str.strip()
    .str.replace(" ", "_")
    .map(remove_accents)
)

# Food names are lowercased; the step is skipped when the column is absent.
if "nome" in formatted_taco.columns:
    formatted_taco["nome"] = formatted_taco["nome"].str.lower()

# Persist the cleaned table (default ',' separator) and preview it.
formatted_taco.to_csv("../../../docs/datasets/formattedTACO.csv", index=False)
formatted_taco.head(20)