In [85]:
import pandas as pd
import json
import numpy as np

In [86]:
df1 = pd.DataFrame({'id': [1,2,3], 'nome': ['ALAN', 'BRENDA', 'SANDRA']})

df2 = pd.DataFrame({'id': [2, 3, 4], 'idade': ['7', '36', '37']})

In [87]:
print(df1)
print(df2)

   id    nome
0   1    ALAN
1   2  BRENDA
2   3  SANDRA
   id idade
0   2     7
1   3    36
2   4    37


In [88]:
df_inner = pd.merge(df1, df2, on='id', how='inner') # Mostra somente as chaves com valores iguais
df_inner

Unnamed: 0,id,nome,idade
0,2,BRENDA,7
1,3,SANDRA,36


In [89]:
df_left = pd.merge(df1, df2, on='id', how='left') # Mostra todos os valores da df1, mesmo que não tenha correspondência na df2
df_left

Unnamed: 0,id,nome,idade
0,1,ALAN,
1,2,BRENDA,7.0
2,3,SANDRA,36.0


In [90]:
df_right = pd.merge(df1, df2, on='id', how='right') # Mostra todos os valores da df2, mesmo que não tenha correspondência na df1
df_right

Unnamed: 0,id,nome,idade
0,2,BRENDA,7
1,3,SANDRA,36
2,4,,37


In [91]:
df_outer = pd.merge(df1, df2, on='id', how='outer') # Mostra todos os valores das duas df, mesmo que não tenha correspondência
df_outer

Unnamed: 0,id,nome,idade
0,1,ALAN,
1,2,BRENDA,7.0
2,3,SANDRA,36.0
3,4,,37.0


In [92]:
df_vendas = pd.DataFrame({
    'loja': ['A', 'A', 'B', 'C', 'B', 'C'],
    'categoria': ['Eletro', 'Comida', 'Eletro', 'comida', 'Eletro', 'Eletro'],
    'valor': [100, 300, 400, 100, 200, 2000],
    'estoque': [12, 32, 15, 1231, 923, 512]
})

df_media = df_vendas.groupby('loja')['valor'].sum() # Calcula a soma dos valores por loja
df_media

df_grouped = df_vendas.groupby(['loja', 'categoria'])['valor'].sum() # Calcula a soma dos valores por loja e categoria
df_grouped

loja  categoria
A     Comida        300
      Eletro        100
B     Eletro        600
C     Eletro       2000
      comida        100
Name: valor, dtype: int64

In [93]:
df_grouped = df_vendas.groupby('loja').agg({ # Calcula a soma, média e máximo dos valores por loja
    'valor': ['sum', 'mean', 'max'],  # Metodos a aplicar para cada coluna 'valor'
    'estoque': ['count', 'min'] # Calcula a contagem, mínimo e máximo do estoque por loja
})

df_grouped

Unnamed: 0_level_0,valor,valor,valor,estoque,estoque
Unnamed: 0_level_1,sum,mean,max,count,min
loja,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
A,400,200.0,300,2,12
B,600,300.0,400,2,15
C,2100,1050.0,2000,2,512


In [94]:
with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [95]:
df_raiz = pd.DataFrame([{
    'bid': d.get('business_id'),
    'name': d.get('name'),
    'cidade': d.get('city'),
    'stars': d.get('stars'),
    'revs': d.get('review_count'),
    'estado': d.get('state'),
    'categoria': d.get('categories')
} for d in data])

In [96]:
df_raiz.fillna('Desconhecido', inplace= True)
df_raiz.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   bid        150346 non-null  object 
 1   name       150346 non-null  object 
 2   cidade     150346 non-null  object 
 3   stars      150346 non-null  float64
 4   revs       150346 non-null  int64  
 5   estado     150346 non-null  object 
 6   categoria  150346 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 8.0+ MB


In [97]:
df_atributos = pd.DataFrame([{
    'bid': d.get('business_id'),
    'latitude': d.get('latitude'),
    'longitude': d.get('longitude')
} for d in data])

In [98]:
df_atributos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   bid        150346 non-null  object 
 1   latitude   150346 non-null  float64
 2   longitude  150346 non-null  float64
dtypes: float64(2), object(1)
memory usage: 3.4+ MB


In [99]:
weekdays = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
df_horas = pd.DataFrame([{
    'bid': d.get('business_id'),
    **{day: (d.get('hours') or {}) .get(day, '') for day in weekdays} 
    # Busca o horário para cada dia da semana no dicionário 'hours' ou usa '' caso não seja encontrado.
} for d in data])

df_horas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   bid        150346 non-null  object
 1   Sunday     150346 non-null  object
 2   Monday     150346 non-null  object
 3   Tuesday    150346 non-null  object
 4   Wednesday  150346 non-null  object
 5   Thursday   150346 non-null  object
 6   Friday     150346 non-null  object
 7   Saturday   150346 non-null  object
dtypes: object(8)
memory usage: 9.2+ MB


In [100]:
df_completa  = df_raiz.merge(df_atributos, on='bid', how='left').merge(df_horas, on= 'bid', how='left') 
# Junta os dataframes por 'bid' (business_id)
df_completa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 16 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   bid        150346 non-null  object 
 1   name       150346 non-null  object 
 2   cidade     150346 non-null  object 
 3   stars      150346 non-null  float64
 4   revs       150346 non-null  int64  
 5   estado     150346 non-null  object 
 6   categoria  150346 non-null  object 
 7   latitude   150346 non-null  float64
 8   longitude  150346 non-null  float64
 9   Sunday     150346 non-null  object 
 10  Monday     150346 non-null  object 
 11  Tuesday    150346 non-null  object 
 12  Wednesday  150346 non-null  object 
 13  Thursday   150346 non-null  object 
 14  Friday     150346 non-null  object 
 15  Saturday   150346 non-null  object 
dtypes: float64(3), int64(1), object(12)
memory usage: 18.4+ MB


In [101]:
df_completa['categoria']
df_completasplit = df_completa.assign(categoria= df_completa['categoria'].str.split(', ')) 
# Transforma a coluna 'categoria' em uma lista de strings

df_completasplit.head()

Unnamed: 0,bid,name,cidade,stars,revs,estado,categoria,latitude,longitude,Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ",Santa Barbara,5.0,7,CA,"[Doctors, Traditional Chinese Medicine, Naturo...",34.426679,-119.711197,,,,,,,
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,Affton,3.0,15,MO,"[Shipping Centers, Local Services, Notaries, M...",38.551126,-90.335695,,0:0-0:0,8:0-18:30,8:0-18:30,8:0-18:30,8:0-18:30,8:0-14:0
2,tUFrWirKiKi_TAnsVWINQQ,Target,Tucson,3.5,22,AZ,"[Department Stores, Shopping, Fashion, Home & ...",32.223236,-110.880452,8:0-22:0,8:0-22:0,8:0-22:0,8:0-22:0,8:0-22:0,8:0-23:0,8:0-23:0
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,4.0,80,PA,"[Restaurants, Food, Bubble Tea, Coffee & Tea, ...",39.955505,-75.155564,7:0-21:0,7:0-20:0,7:0-20:0,7:0-20:0,7:0-20:0,7:0-21:0,7:0-21:0
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,Green Lane,4.5,13,PA,"[Brewpubs, Breweries, Food]",40.338183,-75.471659,12:0-18:0,,,14:0-22:0,16:0-22:0,12:0-22:0,12:0-22:0


In [102]:
df_exp = df_completasplit.explode('categoria') # Explode a coluna 'categoria' em uma nova coluna para cada item da lista

df_exp.head()


Unnamed: 0,bid,name,cidade,stars,revs,estado,categoria,latitude,longitude,Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ",Santa Barbara,5.0,7,CA,Doctors,34.426679,-119.711197,,,,,,,
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ",Santa Barbara,5.0,7,CA,Traditional Chinese Medicine,34.426679,-119.711197,,,,,,,
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ",Santa Barbara,5.0,7,CA,Naturopathic/Holistic,34.426679,-119.711197,,,,,,,
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ",Santa Barbara,5.0,7,CA,Acupuncture,34.426679,-119.711197,,,,,,,
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ",Santa Barbara,5.0,7,CA,Health & Medical,34.426679,-119.711197,,,,,,,


In [103]:
df_counts = df_exp['categoria'].value_counts() # Conta a quantidade de vezes cada categoria aparece
df_counts.head(20)

categoria
Restaurants                  52268
Food                         27781
Shopping                     24395
Home Services                14356
Beauty & Spas                14292
Nightlife                    12281
Health & Medical             11890
Local Services               11198
Bars                         11065
Automotive                   10773
Event Planning & Services     9895
Sandwiches                    8366
American (Traditional)        8139
Active Life                   7687
Pizza                         7093
Coffee & Tea                  6703
Fast Food                     6472
Breakfast & Brunch            6239
American (New)                6097
Hotels & Travel               5857
Name: count, dtype: int64

In [104]:
df_exp['latitude'] = pd.to_numeric(df_exp['latitude'], errors='coerce') 
# Converte os valores da coluna 'latitude' para numéricos, ignorando valores inválidos
df_exp['latitude'].info() 

<class 'pandas.core.series.Series'>
Index: 668695 entries, 0 to 150345
Series name: latitude
Non-Null Count   Dtype  
--------------   -----  
668695 non-null  float64
dtypes: float64(1)
memory usage: 10.2 MB


In [105]:
df_exp['latround'] = df_exp['latitude'].round(0) 
# Arredonda os valores da coluna 'latitude' para o inteiro mais próximo

df_exp['lonround'] = df_exp['longitude'].round(0) 
# Arredonda os valores da coluna 'longitude' para o inteiro mais próximo

In [106]:
df_exp[['latround', 'lonround', 'cidade' ]].value_counts().head(20) 
# Conta a quantidade de restaurantes por cidade e latitude/longitude arredondados

latround  lonround  cidade          
40.0      -75.0     Philadelphia        63824
32.0      -111.0    Tucson              40204
40.0      -86.0     Indianapolis        33491
36.0      -87.0     Nashville           32373
30.0      -90.0     New Orleans         28142
28.0      -82.0     Tampa               22400
39.0      -90.0     Saint Louis         20347
28.0      -83.0     Tampa               18885
34.0      -120.0    Santa Barbara       18014
40.0      -120.0    Reno                15276
44.0      -116.0    Boise               13233
39.0      -120.0    Reno                11593
28.0      -83.0     Clearwater           9966
54.0      -114.0    Edmonton             9120
28.0      -83.0     Saint Petersburg     7472
30.0      -90.0     Metairie             7464
40.0      -120.0    Sparks               7210
54.0      -113.0    Edmonton             6454
36.0      -87.0     Franklin             6452
40.0      -76.0     Wilmington           5969
Name: count, dtype: int64

In [107]:
df_filter = df_exp[df_exp['revs'] > 100] # Filtra os restaurantes com mais de 100 reviews
df_filter

Unnamed: 0,bid,name,cidade,stars,revs,estado,categoria,latitude,longitude,Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,latround,lonround
15,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,Philadelphia,4.0,245,PA,Sushi Bars,39.953949,-75.143226,13:30-22:0,,13:30-22:0,13:30-22:0,13:30-22:0,13:30-23:0,13:30-23:0,40.0,-75.0
15,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,Philadelphia,4.0,245,PA,Restaurants,39.953949,-75.143226,13:30-22:0,,13:30-22:0,13:30-22:0,13:30-22:0,13:30-23:0,13:30-23:0,40.0,-75.0
15,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,Philadelphia,4.0,245,PA,Japanese,39.953949,-75.143226,13:30-22:0,,13:30-22:0,13:30-22:0,13:30-22:0,13:30-23:0,13:30-23:0,40.0,-75.0
19,ROeacJQwBeh05Rqg7F6TCg,BAP,Philadelphia,4.5,205,PA,Korean,39.943223,-75.162568,,11:30-20:30,11:30-20:30,11:30-20:30,11:30-20:30,11:30-20:30,11:30-20:30,40.0,-75.0
19,ROeacJQwBeh05Rqg7F6TCg,BAP,Philadelphia,4.5,205,PA,Restaurants,39.943223,-75.162568,,11:30-20:30,11:30-20:30,11:30-20:30,11:30-20:30,11:30-20:30,11:30-20:30,40.0,-75.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150323,w_4xUt-1AyY2ZwKtnjW0Xg,Bittercreek Alehouse,Boise,4.5,998,ID,Bars,43.616590,-116.202383,11:0-22:0,0:0-0:0,11:0-22:0,11:0-22:0,11:0-23:0,11:0-23:30,11:0-23:0,44.0,-116.0
150323,w_4xUt-1AyY2ZwKtnjW0Xg,Bittercreek Alehouse,Boise,4.5,998,ID,Gastropubs,43.616590,-116.202383,11:0-22:0,0:0-0:0,11:0-22:0,11:0-22:0,11:0-23:0,11:0-23:30,11:0-23:0,44.0,-116.0
150323,w_4xUt-1AyY2ZwKtnjW0Xg,Bittercreek Alehouse,Boise,4.5,998,ID,Sandwiches,43.616590,-116.202383,11:0-22:0,0:0-0:0,11:0-22:0,11:0-22:0,11:0-23:0,11:0-23:30,11:0-23:0,44.0,-116.0
150323,w_4xUt-1AyY2ZwKtnjW0Xg,Bittercreek Alehouse,Boise,4.5,998,ID,Nightlife,43.616590,-116.202383,11:0-22:0,0:0-0:0,11:0-22:0,11:0-22:0,11:0-23:0,11:0-23:30,11:0-23:0,44.0,-116.0


In [117]:
df_regiao = df_filter[df_filter['latround']<31]
df_regiao = df_regiao[df_regiao['lonround']> -88]
df_filter['estado'].value_counts()

estado
PA    17484
FL    13394
LA     7732
TN     7240
MO     5980
IN     5898
NV     4856
AZ     4684
CA     4137
NJ     2414
ID     1812
DE      691
AB      483
IL      386
Name: count, dtype: int64

In [119]:
df_regiao['estado'].value_counts()

estado
FL    13394
Name: count, dtype: int64