# Import

In [None]:
from pathlib import Path
import pandas as pd

from curation_tools.biogrid_curation_tools import (
    process_biogrid_screen,
    compare_metadata_biogrid
)

top-level pandera module will be **removed in a future version of pandera**.
If you're using pandera to validate pandas objects, we highly recommend updating
your import:

```
# old import
import pandera as pa

# new import
import pandera.pandas as pa
```

If you're using pandera to validate objects from other compatible libraries
like pyspark or polars, see the supported libraries section of the documentation
for more information on how to import pandera:

https://pandera.readthedocs.io/en/stable/supported_libraries.html


```
```



📝 Define the ID (PMID) of the BioGRID paper and the paths/directories

In [None]:
# --- Study under curation ---------------------------------------------------
# PMID of the Biogrid paper
gc_fname = "25307932"

# --- BigQuery target --------------------------------------------------------
# BigQuery dataset id
bq_dataset_id = "prj-ext-dev-pertcat-437314.crispr"

# --- Input locations (relative to this notebook) ----------------------------
crispr_studies_path = Path("../crispr_studies.xlsx")
biogrid_metadata_path = Path("../../text_extraction/biogrid_curated_metadata.csv")
curated_gemini_dir = Path("../../text_extraction/curated_gemini")
data_dir = Path("../supplementary/temp/biogrid/data")

# --- Output locations -------------------------------------------------------
non_curated_h5ad_dir = Path("../non_curated/h5ad")
curated_parquet_dir = Path("../curated/parquet")

Read metadata files

In [4]:
# Gemini-curated metadata for this paper. The index is re-labelled as
# "gemini_<original index>" so entries can be referred to by a gemini id.
gc_metadata_df = pd.read_json(curated_gemini_dir / f"{gc_fname}.json")
gc_metadata_df = gc_metadata_df.set_index("gemini_" + gc_metadata_df.index.astype(str))

# Biogrid metadata for all screens, then the rows that belong to this paper.
bg_metadata_all = pd.read_csv(biogrid_metadata_path)
# Compare as strings: gc_fname is a str, whereas read_csv may infer SOURCE_ID
# as an integer column, in which case a direct `==` would match no rows and
# silently produce an empty selection.
bg_metadata_df = bg_metadata_all[bg_metadata_all["SOURCE_ID"].astype(str) == gc_fname]
all_screen_ids = bg_metadata_df["#SCREEN_ID"].values

# Curated study sheet; one row per dataset_id.
crispr_studies_df = pd.read_excel(crispr_studies_path)

print(f"Screen ids in the study:\n{all_screen_ids}")

Screen ids:
[   5    6 1161 1162]


📝 Select the `screen_id` to analyse from above

📝 Define the matching `gemini_id` (use the comparison table below to identify it)

In [5]:
# Gemini entry corresponding to the chosen screen - examine the comparison
# table below to identify the correct gemini id.
gemini_id = "gemini_2"

# Biogrid screen id to analyse - select from the screen ids printed above.
biogrid_screen_id = "biogrid_5"

Create the comparison table between Biogrid and Gemini-curated metadata

**Manually examine the potential discrepancies to validate the curation process!**

In [6]:
# Side-by-side view of the Biogrid and Gemini-curated metadata for the selected
# screen. Pass gemini_id=None to see all gemini entries.
compare_metadata_biogrid(
    bg_metadata_df=bg_metadata_df, gc_metadata_df=gc_metadata_df,
    biogrid_screen_id=biogrid_screen_id, gemini_id=gemini_id,
)

Use the data from the comparison table, the BioGRID page of the screen, and the original study to update the `crispr_studies.xlsx` file.

Then run the cell below to process every dataset listed in `crispr_studies.xlsx` (not only the screen selected above) and upload the results to BigQuery.

In [6]:
# Process every dataset listed in crispr_studies.xlsx: locate its Biogrid screen
# file, pick up its curated metadata row, and hand both to process_biogrid_screen.
biogrid_prefix = "biogrid_"

for dataset_id in crispr_studies_df['dataset_id']:
    print(f"Processing dataset_id: {dataset_id}")

    # Drop the leading "biogrid_" to get the screen number used in the file name.
    # str.lstrip('biogrid_') is deliberately avoided: it strips any leading run of
    # the characters b/i/o/g/r/d/_ rather than the literal prefix, so it only
    # works by accident when the remainder starts with a digit.
    if dataset_id.startswith(biogrid_prefix):
        screen_number = dataset_id[len(biogrid_prefix):]
    else:
        screen_number = dataset_id
    biogrid_screen_path = data_dir / f"BIOGRID-ORCS-SCREEN_{screen_number}-1.1.17.screen.tab.txt"

    # First row of the study sheet whose dataset_id matches, as a plain dict.
    curated_metadata_dict = crispr_studies_df[crispr_studies_df['dataset_id'] == dataset_id].to_dict(orient='records')[0]

    process_biogrid_screen(
        biogrid_dataset_id=dataset_id,
        biogrid_screen_path=biogrid_screen_path,
        biogrid_metadata_df=bg_metadata_all,
        curated_metadata_dict=curated_metadata_dict,
        non_curated_h5ad_dir=non_curated_h5ad_dir,
        upload_to_bq=True, # set to True to upload to BigQuery
        bq_dataset_id=bq_dataset_id,
        bq_metadata_table_name="metadata",
        bq_data_table_name="data"
    )


Processing dataset_id: biogrid_1161
Processing dataset_id: biogrid_5
Processing dataset_id: biogrid_2362
Processing dataset_id: biogrid_2363
Processing dataset_id: biogrid_2364
Processing dataset_id: biogrid_2365
Processing dataset_id: biogrid_2366
Processing dataset_id: biogrid_2367
Processing dataset_id: biogrid_2373
Processing dataset_id: biogrid_6
Processing dataset_id: biogrid_1162
