In [4]:
# Importando Bibliotecas
import os
from google.cloud import bigquery
from dotenv import load_dotenv
import pandas as pd

In [5]:
# Carrega variáveis do .env
load_dotenv()

credencial = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
project_id = os.getenv("PROJECT_ID")
table_id = os.getenv("TABLE_PIZZA")
table_stg = os.getenv("TABLE_STG_OTIMZADO")



In [6]:
# Cria cliente BigQuery
client = bigquery.Client.from_service_account_json(credencial, project=project_id)


query = f"""
SELECT *
FROM `{table_stg}`
"""

In [7]:
# Executa e converte pra DataFrame
resultado = client.query(query)
df = resultado.to_dataframe()



In [8]:
# Verificando as 5 primeiras linhas
df.head()

Unnamed: 0,pizza_id,order_id,pizza_name_id,quantity,order_date,order_time,unit_price,total_price,pizza_size,pizza_category,pizza_ingredients,pizza_name
0,19687,8656,pepperoni_s,1,2015-05-25,15:16:37,9.75,9.75,S,Classic,"Mozzarella Cheese, Pepperoni",The Pepperoni Pizza
1,43343,19056,pepperoni_s,1,2015-11-21,19:19:55,9.75,9.75,S,Classic,"Mozzarella Cheese, Pepperoni",The Pepperoni Pizza
2,35723,15776,pepperoni_s,1,2015-09-21,11:57:50,9.75,9.75,S,Classic,"Mozzarella Cheese, Pepperoni",The Pepperoni Pizza
3,5149,2280,pepperoni_s,1,2015-02-07,19:14:00,9.75,9.75,S,Classic,"Mozzarella Cheese, Pepperoni",The Pepperoni Pizza
4,22415,9847,pepperoni_s,1,2015-06-14,18:10:13,9.75,9.75,S,Classic,"Mozzarella Cheese, Pepperoni",The Pepperoni Pizza


In [9]:
# Verificando as informações do df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48620 entries, 0 to 48619
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pizza_id           48620 non-null  Int64  
 1   order_id           48620 non-null  Int64  
 2   pizza_name_id      48620 non-null  object 
 3   quantity           48620 non-null  Int64  
 4   order_date         48620 non-null  dbdate 
 5   order_time         48620 non-null  dbtime 
 6   unit_price         48620 non-null  float64
 7   total_price        48620 non-null  float64
 8   pizza_size         48620 non-null  object 
 9   pizza_category     48620 non-null  object 
 10  pizza_ingredients  48620 non-null  object 
 11  pizza_name         48620 non-null  object 
dtypes: Int64(3), dbdate(1), dbtime(1), float64(2), object(5)
memory usage: 4.6+ MB


In [10]:
# Subistituindo o tipo de pizza_id
df["pizza_id"] = df["pizza_id"].astype("object")

In [11]:
# criando um dataframe df_dim_pizza
df_dim_pizza = df[['pizza_name_id', 'pizza_name', 'pizza_category', 'pizza_size', 'unit_price']]

In [12]:
# Verificando as informações do df
df_dim_pizza.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48620 entries, 0 to 48619
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   pizza_name_id   48620 non-null  object 
 1   pizza_name      48620 non-null  object 
 2   pizza_category  48620 non-null  object 
 3   pizza_size      48620 non-null  object 
 4   unit_price      48620 non-null  float64
dtypes: float64(1), object(4)
memory usage: 1.9+ MB


In [13]:
# Verificando os valores duplicados
df_dim_pizza.duplicated().value_counts()

True     48529
False       91
Name: count, dtype: int64

In [14]:
# Removendo os duplicados 
df_dim_pizza = df_dim_pizza.drop_duplicates(subset=['pizza_name_id'])

In [15]:
# Conferindo os valores duplicados
df_dim_pizza.duplicated().value_counts()

False    91
Name: count, dtype: int64

In [16]:
# Visualizando o df
df_dim_pizza

Unnamed: 0,pizza_name_id,pizza_name,pizza_category,pizza_size,unit_price
0,pepperoni_s,The Pepperoni Pizza,Classic,S,9.75
16,pep_msh_pep_s,"The Pepperoni, Mushroom, and Peppers Pizza",Classic,S,11.00
27,big_meat_s,The Big Meat Pizza,Classic,S,12.00
53,classic_dlx_s,The Classic Deluxe Pizza,Classic,S,12.00
71,green_garden_s,The Green Garden Pizza,Veggie,S,12.00
...,...,...,...,...,...
963,the_greek_l,The Greek Pizza,Classic,L,20.50
966,the_greek_xl,The Greek Pizza,Classic,XL,25.50
1130,ckn_alfredo_s,The Chicken Alfredo Pizza,Chicken,S,12.75
1504,ckn_alfredo_l,The Chicken Alfredo Pizza,Chicken,L,20.75


In [17]:
# adicionando a coluna de data_atualização
from datetime import datetime
df_dim_pizza["data_atualizacao"] = datetime.now()

In [18]:
# Criando a coluna SK
df_dim_pizza = df_dim_pizza.reset_index(drop=True)
df_dim_pizza["sk_pizza"] = df_dim_pizza.index + 1


In [19]:
# Organizando a ordem das colunas

df_dim_pizza = df_dim_pizza[[
    "sk_pizza",
    "pizza_name_id",
    "pizza_name",
    "pizza_category",
    "pizza_size",
    "unit_price",
    "data_atualizacao"
]]

In [20]:
# Visualizando as 5 primeiras linhas
df_dim_pizza.head()

Unnamed: 0,sk_pizza,pizza_name_id,pizza_name,pizza_category,pizza_size,unit_price,data_atualizacao
0,1,pepperoni_s,The Pepperoni Pizza,Classic,S,9.75,2025-12-12 16:17:23.950440
1,2,pep_msh_pep_s,"The Pepperoni, Mushroom, and Peppers Pizza",Classic,S,11.0,2025-12-12 16:17:23.950440
2,3,big_meat_s,The Big Meat Pizza,Classic,S,12.0,2025-12-12 16:17:23.950440
3,4,classic_dlx_s,The Classic Deluxe Pizza,Classic,S,12.0,2025-12-12 16:17:23.950440
4,5,green_garden_s,The Green Garden Pizza,Veggie,S,12.0,2025-12-12 16:17:23.950440


In [21]:
# Pega todos os dados da tabela de dimensão atual
query = f"SELECT * FROM `{table_id}`"
df_dim_atual = client.query(query).to_dataframe()

print(f"Registros atuais na dimensão: {len(df_dim_atual)}")
df_dim_atual.head()




Registros atuais na dimensão: 91


Unnamed: 0,sk_pizza,pizza_name_id,pizza_name,pizza_category,pizza_size,unit_price,data_atualizacao
0,5,thai_ckn_l,The Thai Chicken Pizza,Chicken,L,20.75,2025-11-21 23:41:01.595129
1,9,southw_ckn_l,The Southwest Chicken Pizza,Chicken,L,20.75,2025-11-21 23:41:01.595129
2,14,ckn_pesto_l,The Chicken Pesto Pizza,Chicken,L,20.75,2025-11-21 23:41:01.595129
3,15,ckn_alfredo_l,The Chicken Alfredo Pizza,Chicken,L,20.75,2025-11-21 23:41:01.595129
4,16,cali_ckn_l,The California Chicken Pizza,Chicken,L,20.75,2025-11-21 23:41:01.595129


In [22]:
# Concatenar o novo dataframe com o existente
df_final = pd.concat([df_dim_atual, df_dim_pizza])

# Manter apenas o registro mais recente por pizza_name_id
df_final = df_final.sort_values("data_atualizacao").drop_duplicates(
    subset=["pizza_name_id"], keep="last"
)

# Recriar surrogate key incremental
df_final = df_final.reset_index(drop=True)
df_final["sk_pizza"] = df_final.index + 1

df_final.head()


Unnamed: 0,sk_pizza,pizza_name_id,pizza_name,pizza_category,pizza_size,unit_price,data_atualizacao
0,1,spinach_supr_l,The Spinach Supreme Pizza,Supreme,L,20.75,2025-12-12 16:17:23.950440
1,2,thai_ckn_l,The Thai Chicken Pizza,Chicken,L,20.75,2025-12-12 16:17:23.950440
2,3,ital_veggie_l,The Italian Vegetables Pizza,Veggie,L,21.0,2025-12-12 16:17:23.950440
3,4,ital_supr_s,The Italian Supreme Pizza,Supreme,S,12.5,2025-12-12 16:17:23.950440
4,5,five_cheese_l,The Five Cheese Pizza,Veggie,L,18.5,2025-12-12 16:17:23.950440


In [23]:
# Comparar apenas as colunas relevantes (ignora sk_pizza e timestamp)
colunas_relevantes = ["pizza_name_id", "pizza_name", "pizza_category", "pizza_size", "unit_price"]

df_final_check = df_final[colunas_relevantes].sort_values("pizza_name_id").reset_index(drop=True)
df_dim_atual_check = df_dim_atual[colunas_relevantes].sort_values("pizza_name_id").reset_index(drop=True)

# Verificar se houve mudança
mudou = not df_final_check.equals(df_dim_atual_check)

if mudou:
    job_config = bigquery.LoadJobConfig(write_disposition="WRITE_TRUNCATE")
    job = client.load_table_from_dataframe(df_final, table_id, job_config=job_config)
    job.result()
    print("Carga SCD1 aplicada, dados atualizados!")
else:
    print("Nenhuma alteração detectada, tabela não foi atualizada.")


Nenhuma alteração detectada, tabela não foi atualizada.


In [24]:
# Conta registros na tabela
query_check = f"SELECT COUNT(*) AS total FROM `{table_id}`"
df_check = client.query(query_check).to_dataframe()
print(df_check)




   total
0     91


In [25]:
# Criar um novo registro de pizza
#nova_pizza = pd.DataFrame([{
#    "pizza_name_id": "P999",
#    "pizza_name": "Test Pizza",
#    "pizza_category": "Especial",
#    "pizza_size": "M",
#    "unit_price": 99.90,
#    "data_atualizacao": datetime.now()
#}])

# Adicionar ao df_dim_pizza
#df_dim_pizza = pd.concat([df_dim_pizza, nova_pizza], ignore_index=True)

# Visualizar
#df_dim_pizza.tail()