## Importando bibliotecas

In [None]:
import json

import numpy as np
import pandas as pd
import requests as r

## Pegando dados do arquivo json com método GET

In [None]:
# Download the food database JSON.
# The timeout keeps the cell from hanging forever on a stalled connection,
# and raise_for_status() fails here on an HTTP error instead of letting a
# later .json() call choke on an error page.
dados = r.get(
    'https://raw.githubusercontent.com/gazzola/dataset_students/main/food_database.json',
    timeout=30,
)
dados.raise_for_status()

In [None]:
# Display the response object; its repr includes the HTTP status code.
dados

<Response [200]>

In [None]:
# Inspect the Python type of the object returned by r.get().
type(dados)

## Coletando JSON

In [None]:
# Parse the response body as JSON into native Python objects.
bd = dados.json()

## Informando tamanho da base de dados

In [None]:
# Number of top-level entries in the parsed JSON.
len(bd)

6636

## Mostrando nutrientes do primeiro alimento

In [None]:
# Nutrient records stored under the "nutrients" key of the first entry.
bd[0]["nutrients"]

[{'value': 25.18,
  'units': 'g',
  'description': 'Protein',
  'group': 'Composition'},
 {'value': 29.2,
  'units': 'g',
  'description': 'Total lipid (fat)',
  'group': 'Composition'},
 {'value': 3.06,
  'units': 'g',
  'description': 'Carbohydrate, by difference',
  'group': 'Composition'},
 {'value': 3.28, 'units': 'g', 'description': 'Ash', 'group': 'Other'},
 {'value': 376.0, 'units': 'kcal', 'description': 'Energy', 'group': 'Energy'},
 {'value': 39.28,
  'units': 'g',
  'description': 'Water',
  'group': 'Composition'},
 {'value': 1573.0, 'units': 'kJ', 'description': 'Energy', 'group': 'Energy'},
 {'value': 0.0,
  'units': 'g',
  'description': 'Fiber, total dietary',
  'group': 'Composition'},
 {'value': 673.0,
  'units': 'mg',
  'description': 'Calcium, Ca',
  'group': 'Elements'},
 {'value': 0.64,
  'units': 'mg',
  'description': 'Iron, Fe',
  'group': 'Elements'},
 {'value': 22.0,
  'units': 'mg',
  'description': 'Magnesium, Mg',
  'group': 'Elements'},
 {'value': 490.0,

## Criando Dataframe de Nutrientes do primeiro alimento

In [None]:
# Build a DataFrame from the first entry's nutrient records and display it.
df_exp1 = pd.DataFrame(bd[0]["nutrients"])
df_exp1

Unnamed: 0,value,units,description,group
0,25.180,g,Protein,Composition
1,29.200,g,Total lipid (fat),Composition
2,3.060,g,"Carbohydrate, by difference",Composition
3,3.280,g,Ash,Other
4,376.000,kcal,Energy,Energy
...,...,...,...,...
157,1.472,g,Serine,Amino Acids
158,93.000,mg,Cholesterol,Other
159,18.584,g,"Fatty acids, total saturated",Other
160,8.275,g,"Fatty acids, total monounsaturated",Other


## Criando o DataFrame 2 com colunas específicas e exibindo as primeiras linhas com `head()`

In [None]:
# Keep only the descriptive, top-level fields of each entry; the selection
# drops every other key of the records from the frame.
colunas_resumo = ["description", "group", "id", "manufacturer"]
df_exp2 = pd.DataFrame(bd).loc[:, colunas_resumo]
df_exp2.head()

Unnamed: 0,description,group,id,manufacturer
0,"Cheese, caraway",Dairy and Egg Products,1008,
1,"Cheese, cheddar",Dairy and Egg Products,1009,
2,"Cheese, edam",Dairy and Egg Products,1018,
3,"Cheese, feta",Dairy and Egg Products,1019,
4,"Cheese, mozzarella, part skim milk",Dairy and Egg Products,1028,


In [None]:
# Display the frame itself (the rendered output is truncated by pandas).
df_exp2

Unnamed: 0,description,group,id,manufacturer
0,"Cheese, caraway",Dairy and Egg Products,1008,
1,"Cheese, cheddar",Dairy and Egg Products,1009,
2,"Cheese, edam",Dairy and Egg Products,1018,
3,"Cheese, feta",Dairy and Egg Products,1019,
4,"Cheese, mozzarella, part skim milk",Dairy and Egg Products,1028,
...,...,...,...,...
6631,"Bologna, beef, low fat",Sausages and Luncheon Meats,42161,
6632,"Turkey and pork sausage, fresh, bulk, patty or...",Sausages and Luncheon Meats,42173,
6633,"Babyfood, juice, pear",Baby Foods,43408,
6634,"Babyfood, dessert, banana yogurt, strained",Baby Foods,43539,


## Calculando memória em uso para o dataframe df_exp2

In [None]:
# Summarize dtypes, non-null counts and memory footprint.
# memory_usage="deep" makes pandas measure the actual size of the Python
# objects held in object-dtype columns; the default only reports a lower
# bound (shown with a trailing "+").
df_exp2.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6636 entries, 0 to 6635
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   description   6636 non-null   object
 1   group         6636 non-null   object
 2   id            6636 non-null   int64 
 3   manufacturer  5195 non-null   object
dtypes: int64(1), object(3)
memory usage: 207.5+ KB


## Procurando dados faltantes no Dataframe

In [None]:
# Normalize the different "empty" markers (None, empty string, single space)
# to NaN so that a later isnull() counts all of them as missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0, and the
# name is rebound instead of mutating the frame with inplace=True.
df_exp2 = df_exp2.replace([None, "", " "], np.nan)

In [None]:
# Count missing values per column, then print only the columns that
# have at least one missing value.
missing_data = df_exp2.isna().sum()

print(missing_data.loc[missing_data.gt(0)])

manufacturer    5573
dtype: int64


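A coluna `manufacturer` é a única com dados faltantes: 5573 dos 6636 registros ficam nulos depois que strings vazias passam a ser tratadas como ausentes.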

> Adicionar aspas



## Distribuição dos grupos de alimentos considerando a coluna Group

In [None]:
# Number of rows in each food group, taken from the "group" column.
group_distribution = df_exp2.loc[:, "group"].value_counts()

print(group_distribution)

group
Vegetables and Vegetable Products    812
Beef Products                        618
Baked Products                       496
Breakfast Cereals                    403
Legumes and Legume Products          365
Fast Foods                           365
Lamb, Veal, and Game Products        345
Sweets                               341
Fruits and Fruit Juices              328
Pork Products                        328
Beverages                            278
Soups, Sauces, and Gravies           275
Finfish and Shellfish Products       255
Baby Foods                           209
Cereal Grains and Pasta              183
Ethnic Foods                         165
Snacks                               162
Nut and Seed Products                128
Poultry Products                     116
Sausages and Luncheon Meats          111
Dairy and Egg Products               107
Fats and Oils                         97
Meals, Entrees, and Sidedishes        57
Restaurant Foods                      51
Spices and

## Convertendo os nutrientes de cada alimento em um DataFrame, adicionando a coluna `id` (atributo identificador do alimento) e concatenando tudo em um único DataFrame

In [None]:
# One DataFrame per entry: its nutrient records plus an "id" column holding
# the entry's own id, so each nutrient row can be traced back to its food.
nutrientes = [
    pd.DataFrame(registro["nutrients"]).assign(id=registro["id"])
    for registro in bd
]

# Stack the per-entry frames into a single frame with a fresh 0..n-1 index.
df_analise = pd.concat(nutrientes, ignore_index=True)

df_analise

Unnamed: 0,value,units,description,group,id
0,25.180,g,Protein,Composition,1008
1,29.200,g,Total lipid (fat),Composition,1008
2,3.060,g,"Carbohydrate, by difference",Composition,1008
3,3.280,g,Ash,Other,1008
4,376.000,kcal,Energy,Energy,1008
...,...,...,...,...,...
389350,0.000,mcg,"Vitamin B-12, added",Vitamins,43546
389351,0.000,mg,Cholesterol,Other,43546
389352,0.072,g,"Fatty acids, total saturated",Other,43546
389353,0.028,g,"Fatty acids, total monounsaturated",Other,43546


## Pegando quantidade de Linhas x Colunas do dataset

In [None]:
# (rows, columns) of the concatenated nutrient frame.
df_analise.shape

(389355, 5)

## Capturando linhas duplicadas do Dataframe

In [None]:
# Count rows that repeat an earlier row across all columns
# (keep="first" is the default: the first occurrence is not flagged).
duplicated_rows = df_analise.duplicated(keep="first").sum()

print(f"Existem {duplicated_rows} linhas duplicadas no DataFrame.")

Existem 14179 linhas duplicadas no DataFrame.


## Deduplicando dados -> Removendo duplicatas do Dataset

In [None]:
# Drop repeated rows into a new frame, keeping the first occurrence of each;
# df_analise itself is left untouched.
df_analise_cleaned = df_analise.drop_duplicates(keep="first")

# Re-count on the cleaned frame to confirm the removal.
duplicated_rows = df_analise_cleaned.duplicated(keep="first").sum()

print(f"Existem {duplicated_rows} linhas duplicadas no DataFrame.")

Existem 0 linhas duplicadas no DataFrame.


In [None]:
# (rows, columns) of the frame after removing duplicate rows.
df_analise_cleaned.shape

(375176, 5)