In [None]:
import gc
import sys

import polars as pl
from google.cloud import bigquery

sys.path.append("../../")
from curation_tools import upload_parquet_to_bq, create_bq_table

# Define BigQuery project and dataset details

In [None]:
# BigQuery project and the dataset inside it that this notebook works with.
project_id = "prj-ext-dev-pertcat-437314"
dataset_name = "unified_data"

# Fully-qualified dataset reference in "<project>.<dataset>" form.
dataset_id = ".".join((project_id, dataset_name))

In the cells below, provide the `parquet_file_path` and `table_name`.

`parquet_file_path` is the path to the parquet file you want to upload to BigQuery.
`table_name` is the name of the table you want to create in BigQuery.

For metadata, 4 tables are created:
- `test_metadata_pc` - with partitioning and clustering
- `test_metadata_p` - partitioning only
- `test_metadata_c` - clustering only
- `test_metadata_none` - no partitioning or clustering

Since the tables already exist in BigQuery, this code shouldn't be run again, unless you delete the tables first (see last cell).

To upload data to existing tables, you can use the `upload_parquet_to_bq` function. It uploads the data to a temporary table with the suffix `_staging` appended and, if the data is unique, merges it into the main table. This prevents accidentally uploading the same data multiple times.

In [None]:
# Path to the local parquet file that will be uploaded to BigQuery.
parquet_file_path = "../CRISPR/curated/parquet/biogrid_1161_curated_metadata.parquet"

# Name of the BigQuery table that the cells below create and upload into.
table_name = "metadata"

# Lazy scan: the file's rows are not loaded here, only the schema is needed.
df = pl.scan_parquet(parquet_file_path)

# On polars >= 1.0, reading `LazyFrame.schema` emits a PerformanceWarning and
# `collect_schema()` is the recommended way to resolve the schema. Fall back
# to the property on older polars releases that lack the method.
schema = df.collect_schema() if hasattr(df, "collect_schema") else df.schema
schema

## Create tables in BigQuery

In [None]:
# Clustering only: no partition column is supplied (`partition_column=None`),
# while three cluster columns are passed to the helper.
create_bq_table(
    project_id=project_id,
    dataset_name=dataset_name,
    table_name=table_name,
    schema=schema,
    partition_column=None,
    cluster_columns=["dataset_id", "sample_id", "perturbed_target_symbol"],
)

## Upload parquet files to BigQuery

This uploads the parquet files to BigQuery tables.

In [None]:
# Key columns handed to the upload helper.
# NOTE(review): presumably these identify a row when the helper checks that the
# staged data is unique before merging — confirm against `curation_tools`.
upload_key_columns = ["dataset_id", "sample_id"]

upload_parquet_to_bq(
    bq_dataset_id=dataset_id,
    bq_table_name=table_name,
    parquet_path=parquet_file_path,
    key_columns=upload_key_columns,
)

# Delete tables in BigQuery

In [None]:
# Names of the tables (relative to `dataset_id`) to delete. Every entry is
# commented out by default, so running this cell as-is deletes nothing;
# uncomment only the tables you really want to drop.
tables_to_delete = [
    # "test_metadata_pc",
    # "test_metadata_p",
    # "test_metadata_c",
    # "test_metadata_none",
    # "test_metadata_pc_staging",
    # "test_metadata_p_staging",
    # "test_metadata_c_staging",
    # "test_metadata_none_staging"
]

client = bigquery.Client()

# The loop variable is deliberately NOT called `table_name`: that notebook-level
# variable holds the create/upload target set in an earlier cell, and clobbering
# it here would make a later re-run of those cells hit the wrong table.
for table_to_delete in tables_to_delete:
    table_id = f"{dataset_id}.{table_to_delete}"
    try:
        client.delete_table(table_id)
        print(f"Deleted table {table_id}.")
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining tables.
        print(f"Failed to delete table {table_id}: {e}")