In [1]:
# Install the BigQuery client libraries into the environment of the running kernel.
# The explicit %pip magic is used because the bare `pip ...` form relies on
# automagic, which IPython only resolves for single-line cells.
%pip install google-cloud-bigquery pandas-gbq



In [1]:
import os

import pandas as pd
from google.cloud import bigquery
from google.cloud.exceptions import NotFound

## Environment Configuration

In [6]:
# Credentials: keep any GOOGLE_APPLICATION_CREDENTIALS value already exported in
# the environment and only fall back to the local credentials file when it is unset.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    r"D:\GAMIC\PORTFOLIO\CLOTHING RETAIL - PRS POWERED BY LLM\credentials.json",
)

# BigQuery project and dataset that the cells below read from and write to.
project_id = "clothing-retail-prs-llm"
dataset_id = "fashion_retail_dataset"

# Local folder containing the CSV files that are loaded into BigQuery.
csv_folder = r"D:\GAMIC\PORTFOLIO\CLOTHING RETAIL - PRS POWERED BY LLM\01_Clothing_Retail_Synthetic_Data_Creation"

location = "us-central1"  # Adjust to your region


## Client Start

In [7]:
# Initialize the BigQuery client from the values defined in the configuration
# cell, so the project and region are specified in exactly one place.
client = bigquery.Client(project=project_id, location=location)

## Creation of Datasets in BigQuery

In [8]:
# Reference to the target dataset inside the client's project; the load cell
# uses it to build table references.
dataset_ref = bigquery.DatasetReference(client.project, dataset_id)

try:
    client.get_dataset(dataset_ref)
    print("Dataset ya existe.")
except NotFound:
    # Only a missing dataset triggers creation. Authentication, permission or
    # network errors propagate instead of being mistaken for "not found".
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = location
    client.create_dataset(dataset)
    print(f"Dataset {dataset_id} creado.")

Dataset ya existe.


## Load tables from CSV

In [18]:
# Maps each BigQuery table name to the CSV file (inside csv_folder) that feeds it.
# Every file is named "<table name>.csv", so the mapping is derived from the names.
tables = {
    name: f"{name}.csv"
    for name in (
        "customers",
        "products",
        "interactions",
        "transactions",
        "inventory_history",
        "customer_segments",
    )
}

In [None]:
# One load configuration is shared by every table: CSV input with a single
# header row to skip, schema autodetection enabled, and WRITE_TRUNCATE as the
# write disposition. It does not depend on the table, so it is built once.
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,
    autodetect=True,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

for table_name, csv_file in tables.items():
    # Build the full path to the CSV file inside csv_folder
    full_path = os.path.join(csv_folder, csv_file)

    # Submit the file contents as a load job targeting <dataset>.<table_name>
    with open(full_path, "rb") as source_file:
        job = client.load_table_from_file(
            source_file,
            dataset_ref.table(table_name),
            job_config=job_config,
        )

    # Wait for the load job to finish before reporting the row count
    job.result()
    print(f"Table {table_name} load. Rows: {job.output_rows}")

Tabla customers cargada. Filas: 5000
Tabla products cargada. Filas: 1500
Tabla interactions cargada. Filas: 100000
Tabla transactions cargada. Filas: 15000
Tabla inventory_history cargada. Filas: 85500
Tabla customer_segments cargada. Filas: 5000


## Partition by purchase_date

In [None]:
# Sets day-granularity time partitioning on the "purchase_date" field of `job_config`.
# NOTE(review): `job_config` is the object left over from the CSV load cell, and
# every load job submitted there has already been waited on before this runs.
# As written, this assignment can only affect load jobs submitted afterwards with
# this same `job_config`; none are visible in this section. TODO confirm whether
# the partitioning was meant to be configured before loading the table that
# contains purchase_date.
job_config.time_partitioning = bigquery.TimePartitioning(
    type_=bigquery.TimePartitioningType.DAY,
    field="purchase_date"
)

## Compare row counts between CSV and BigQuery

In [None]:

# Validate each load: the number of data rows in the CSV must equal the number
# of rows BigQuery reports for the corresponding table.
for table_name, csv_file in tables.items():
    # Same path construction as the load cell
    csv_path = os.path.join(csv_folder, csv_file)

    # Count lines in binary mode so the result does not depend on the platform's
    # default text encoding, and close the file deterministically.
    with open(csv_path, "rb") as csv_handle:
        csv_rows = sum(1 for _ in csv_handle) - 1  # Subtract the header line

    # Fully qualified, backtick-quoted table name (the project id contains hyphens)
    query = f"SELECT COUNT(*) AS total FROM `{client.project}.{dataset_id}.{table_name}`"
    bq_rows = next(iter(client.query(query).result()))["total"]

    assert csv_rows == bq_rows, f"Error on {table_name}: CSV={csv_rows} vs BQ={bq_rows}"