# Import

In [None]:
import pandas as pd

from curation_tools.depmap_curation_tools import process_depmap
from curation_tools.curation_tools import download_file, upload_parquet_to_bq, concatenate_parquet_files

# Download

In [None]:
# Source URLs for the three DepMap CSV files used by this notebook.
# Each link carries `Expires=` and `Signature=` query parameters, i.e. it looks like a
# time-limited signed URL — presumably it has to be regenerated from the DepMap
# download page once the `Expires` timestamp has passed (TODO confirm).
# NOTE(review): the gene-dependency and model URLs point at "25q2" release folders,
# while the ScreenSequenceMap URL points at a "25q3" folder — confirm that mixing
# releases is intended.
data_url = "https://storage.googleapis.com/depmap-external-downloads/downloads-by-canonical-id/25q2-public-557c.3/CRISPRGeneDependency.csv?response-content-disposition=attachment%3B+filename%3D%22CRISPRGeneDependency.csv%22&GoogleAccessId=depmap-external-downloads%40broad-achilles.iam.gserviceaccount.com&Expires=1759420900&Signature=QAmD1kGQLxXOPsFkbY6UGFeULn%252FWEdoscywsoA3GyQ%252F1ZJZ6oP4Wx31WelMVEo%252BU5H3XV91cFtYKEATr9yp7pMRUWtQXCSjiKNipm4L6mcg4q1s6%252Ftb6f7QHn53vjOL4gs%252BDIZYaY13gMEo3s6V2ISCm%252BASbNKEmnOZyTkZeB89IUy6t7%252B9dwIHF3mDR5n1y%252FENCIro%252BjIP3eUiO8YPBQCn0JRzIebqSOJx4sokrSNl6zS2wrTjuOwzMcNJZj9bDnf87Px0QRpo%252Ft3ZbMXUyZLqn%252BdgVbZ4oKggtV4giPWUp4pTcLjoBI9kKxGD55DE9j%252Fpxxigu6sZUds1pJOj1sg%3D%3D&userProject=broad-achilles"
metadata_url = "https://storage.googleapis.com/depmap-external-downloads/downloads-by-canonical-id/public-25q2-c5ef.75/Model.csv?response-content-disposition=attachment%3B+filename%3D%22Model.csv%22&GoogleAccessId=depmap-external-downloads%40broad-achilles.iam.gserviceaccount.com&Expires=1759420682&Signature=Vk9WFcawtXDhFsxnLhcbM7WkQolkummb%252FgJMpowL8%252BbaHr%252FMRMOgWch1kZRZ3pSkq2pVMi%252Bywypw7Glf47qqRnoBWp5EMg5gttX5hQ8dN6rY8PTyhSzpv2jJONNeZwsarUeAeaiiH7bANRKww9dN%252BwyEVI3tvn9XF1AkdHKPd55%252F3Rz%252FQt8nPiqKB8Bn9ugX3NCK6PZZAwDbU8q5sJfi7aeRiBSLo0PEavw5mFTaQT89KEjwiEHbZR3842U5pDBkI59YQlwCJrgpB79TOWw%252FP97rxnKgd7c08ie5Lzj8b%252BBC8vE95rC%252Fqf2cHXrp%252Bwvu73sKg8VcJ8ukTUkNzsSj7A%3D%3D&userProject=broad-achilles"
metadata_days_library_url = "https://storage.googleapis.com/depmap-external-downloads/downloads-by-canonical-id/25q3-public-6202.1/ScreenSequenceMap.csv?response-content-disposition=attachment%3B+filename%3D%22ScreenSequenceMap.csv%22&GoogleAccessId=depmap-external-downloads%40broad-achilles.iam.gserviceaccount.com&Expires=1759834961&Signature=SEa%252F93hQJwts5Q1rbTQtV4hjQwuPj%252BkDUBDCVF%252BdxeN%252FefFoBs739Y2tk9mVXmBrXXmFP0SFp5pbMyVnDXLpwTEQ3OG1w8sPCec8TE3iqbVY52zlx2A6JqnJ0RuNqVdvn%252BEIsfRsGKNDI77gDZcuQF%252FVZM9%252BK%252BT04iZeAUHgdS1Jt0UmHC2D1APIzjIEKfAeUHt0tEmwHw5HMo6s9UqVdDJEQx75pmAvZeDoQDeYubiRja2oOIDRrTDoizB7KZQOyZoBpbSgItLDJrvwpLYMnDXxs8BiXwdipZ1YBDC4zU71y1Aph2tmqzgD63tCHVmBYRvYARd9nHeOSfd5E6RHOw%3D%3D&userProject=broad-achilles"

# Local destinations for the downloaded files (relative paths, so they resolve
# against the kernel's working directory).
data_path = "../supplementary/depmap/CRISPRGeneDependency.csv"
metadata_path = "../supplementary/depmap/Model.csv"
metadata_days_library_path = "../supplementary/depmap/ScreenSequenceMap.csv"

Download data and metadata files

In [None]:
# Fetch the model table, the gene-dependency matrix and the screen map, in that order.
# `overwrite=False` is passed for every file — presumably an existing local copy is
# kept as-is (verify against `download_file`).
for source_url, target_path in (
    (metadata_url, metadata_path),
    (data_url, data_path),
    (metadata_days_library_url, metadata_days_library_path),
):
    download_file(source_url, target_path, overwrite=False)

Read data and metadata files

In [None]:
# Gene-dependency matrix: the first CSV column becomes the index, then the frame is
# transposed so that the original row labels end up as the columns.
depmap_data = pd.read_csv(data_path, index_col=0).transpose()

# The model table and the screen map are read with default options, in this order.
depmap_metadata, depmap_metadata_days_library = map(
    pd.read_csv,
    (metadata_path, metadata_days_library_path),
)

In [None]:
# Keep one row per distinct (ModelID, Days, Library) combination from the screen map.
screen_columns = ["ModelID", "Days", "Library"]
depmap_metadata_days_library = (
    depmap_metadata_days_library.loc[:, screen_columns].drop_duplicates()
)

# Left join: every row of the model table is kept, whether or not a screen entry exists.
# NOTE(review): a ModelID with more than one distinct (Days, Library) pair produces
# several rows in the merged table — confirm `process_depmap` expects that.
# NOTE(review): both names are reassigned in place, so running this cell twice without
# re-reading the CSVs merges again and yields suffixed `Days_x`/`Days_y` columns.
depmap_metadata = pd.merge(
    depmap_metadata,
    depmap_metadata_days_library,
    how="left",
    on="ModelID",
)

In [None]:
# Arguments that are the same for every call are collected once.
shared_inputs = {
    "depmap_data": depmap_data,
    "depmap_metadata": depmap_metadata,
}

# One `process_depmap` call per column of the transposed dependency matrix.
# Presumably each column label is a DepMap model ID — TODO confirm against the CSV.
for line_id in depmap_data.columns:
    print(f"ℹ️ Processing: {line_id}")
    process_depmap(depmap_dataset_id=line_id, **shared_inputs)

# Concatenate all parquet files and save

In [None]:
# Directory holding the curated parquet files; both combined outputs are written
# into this same directory.
parquet_dir = "../curated/parquet"
output_path_metadata = f"{parquet_dir}/depmap_all_curated_metadata.parquet"
output_path_data = f"{parquet_dir}/depmap_all_curated_data.parquet"

## Metadata

In [None]:
# Combine the per-dataset metadata parquet files into `output_path_metadata`.
# NOTE(review): the output file name ("depmap_all_curated_metadata.parquet") itself
# matches this pattern and lives in `parquet_dir`, so on a re-run the previous combined
# file may be picked up as an input — confirm `concatenate_parquet_files` excludes it.
concatenate_parquet_files(
    pattern="depmap_*_curated_metadata.parquet",
    parquet_dir=parquet_dir,
    output_path=output_path_metadata,
)

## Data

In [None]:
# Combine the per-dataset data parquet files into `output_path_data`.
# NOTE(review): the output file name ("depmap_all_curated_data.parquet") itself matches
# this pattern and lives in `parquet_dir`, so on a re-run the previous combined file
# may be picked up as an input — confirm `concatenate_parquet_files` excludes it.
concatenate_parquet_files(
    pattern="depmap_*_curated_data.parquet",
    parquet_dir=parquet_dir,
    output_path=output_path_data,
)

# Upload to BigQuery

In [None]:
# Destination dataset and table names.
bq_dataset_id = "prj-ext-dev-pertcat-437314.crispr"
bq_metadata_table_name = "metadata"
bq_data_table_name = "data"

# Combined parquet files to upload (spelled out here, so this section does not rely
# on the path variables defined for the concatenation step).
curated_parquet_metadata_path = "../curated/parquet/depmap_all_curated_metadata.parquet"
curated_parquet_data_path = "../curated/parquet/depmap_all_curated_data.parquet"

## Metadata

In [62]:
# Upload the combined metadata parquet to the metadata table.
# The recorded output shows the helper loading into a `_staging` table, adding an
# `ingested_at` column, merging into the target table and deleting the staging table.
# `key_columns` are presumably the merge keys — verify against `upload_parquet_to_bq`.
upload_parquet_to_bq(
    bq_dataset_id=bq_dataset_id,
    bq_table_name=bq_metadata_table_name,
    parquet_path=curated_parquet_metadata_path,
    key_columns=["dataset_id", "sample_id"],
)

Staging table: loading `.parquet` file ../curated/parquet/depmap_all_curated_metadata.parquet to prj-ext-dev-pertcat-437314.crispr.metadata_staging...
Staging table: loaded 21194628 rows to prj-ext-dev-pertcat-437314.crispr.metadata_staging
Staging table: added ingested_at timestamp column to prj-ext-dev-pertcat-437314.crispr.metadata_staging
Merge completed: staging data merged into prj-ext-dev-pertcat-437314.crispr.metadata
Staging table: deleted prj-ext-dev-pertcat-437314.crispr.metadata_staging


## Data

In [52]:
# Upload the combined data parquet to the data table.
# The recorded output shows the helper loading into a `_staging` table, adding an
# `ingested_at` column, merging into the target table and deleting the staging table.
# `key_columns` are presumably the merge keys — verify against `upload_parquet_to_bq`.
upload_parquet_to_bq(
    bq_dataset_id=bq_dataset_id,
    bq_table_name=bq_data_table_name,
    parquet_path=curated_parquet_data_path,
    key_columns=["dataset_id", "sample_id"],
)

Staging table: loading `.parquet` file ../curated/parquet/depmap_all_curated_data.parquet to prj-ext-dev-pertcat-437314.crispr.data_staging...
Staging table: loaded 21194628 rows to prj-ext-dev-pertcat-437314.crispr.data_staging
Staging table: added ingested_at timestamp column to prj-ext-dev-pertcat-437314.crispr.data_staging
Merge completed: staging data merged into prj-ext-dev-pertcat-437314.crispr.data
Staging table: deleted prj-ext-dev-pertcat-437314.crispr.data_staging
