In [1]:
# Step 1: Download ClinVar Mutation Dataset
import requests, gzip, pandas as pd, os, json

clinvar_url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz"
clinvar_file = "variant_summary.txt.gz"

if not os.path.exists(clinvar_file):
    response = requests.get(clinvar_url)
    with open(clinvar_file, "wb") as f:
        f.write(response.content)
else:
    print("File already exists.")


In [1]:
# Step 2: Load and Inspect the Dataset in Chunks
import gzip
import pandas as pd

clinvar_file = "variant_summary.txt.gz"
chunk_size = 100000  # Define a suitable chunk size

# Initialize an empty list to store processed chunks
chunk_list = []

with gzip.open(clinvar_file, 'rt') as f:
    # Read the header separately to use with chunks
    header = f.readline().strip().split('\t')
    # Read the rest of the file in chunks
    for chunk in pd.read_csv(f, sep='\t', dtype={'18': str}, low_memory=False, chunksize=chunk_size, header=None):
        # Assign column names from the header
        chunk.columns = header
        chunk_list.append(chunk)

# Concatenate the processed chunks if needed later, but we will process chunks in Step 3
# data = pd.concat(chunk_list, ignore_index=True)

print("Data loaded in chunks.")
# We won't print columns here as 'data' as a whole is not loaded

Data loaded in chunks.


In [2]:
#  Step 3: Filter for BRCA1/BRCA2 Breast Cancer Mutations (Processing Chunks)
import pandas as pd

# Ensure chunk_list is available from Step 2
if 'chunk_list' not in locals():
    print("Error: chunk_list not found. Please run Step 2 first.")
else:
    bc_data_list = []
    for data_chunk in chunk_list:
        # Apply the filtering criteria to each chunk
        bc_chunk = data_chunk[
            (data_chunk['GeneSymbol'].isin(['BRCA1', 'BRCA2'])) &
            (data_chunk['PhenotypeList'].str.contains("Breast", na=False))
        ]

        # Apply the MinorAlleleFreq filter if the column exists
        if 'MinorAlleleFreq' in bc_chunk.columns:
             bc_chunk = bc_chunk[bc_chunk['MinorAlleleFreq'].fillna(0).astype(float) <= 0.01]


        bc_data_list.append(bc_chunk)

    # Concatenate the filtered chunks into a single DataFrame
    bc_data = pd.concat(bc_data_list, ignore_index=True)

    bc_data.to_csv("filtered_mutations.csv", index=False)
    print("Filtered mutations saved to filtered_mutations.csv")

Filtered mutations saved to filtered_mutations.csv


In [19]:
#  Step 4: Prepare Mutation Payload for Ensembl VEP and Iterate
import requests
import json
import time

# Use the first few variants from the filtered data for testing
variants_to_try = 5
annotated_variant = None

for i in range(min(variants_to_try, len(bc_data))):
    example_variant = bc_data.iloc[i]

    # Construct the variant ID in VCF-like format (chromosome:position:reference:alternate)
    # Using PositionVCF, ReferenceAlleleVCF, and AlternateAlleleVCF columns
    variant_id = str(example_variant['Chromosome']) + ":" + str(example_variant['PositionVCF']) + ":" + str(example_variant['ReferenceAlleleVCF']) + ":" + str(example_variant['AlternateAlleleVCF'])

    # Mutation dictionary format for VEP ID endpoint
    mutation = {
        "ids": [variant_id], # Use 'ids' for the /id endpoint
        "assembly": "GRCh37" # Specify the assembly to match the input data
    }

    # Change the VEP API endpoint to /vep/human/id
    vep_url = "https://rest.ensembl.org/vep/human/id"
    headers = {"Content-Type": "application/json"}

    print(f"Attempting to annotate variant {i+1}: {variant_id}")

    # Send to Ensembl VEP API
    response = requests.post(vep_url, headers=headers, data=json.dumps(mutation))

    if response.status_code == 200:
        annotation = response.json()
        if annotation: # Check if the annotation list is not empty
            print(f"Successfully annotated variant {i+1}.")
            annotated_variant = annotation[0] # Store the first successful annotation
            break # Stop iterating on success
        else:
            print(f"Variant {i+1} found, but no annotation returned.")
    elif response.status_code == 400:
        print(f"VEP failed for variant {i+1}: {response.status_code} - Bad Request.")
        try:
            error_details = response.json()
            print(f"Error details: {error_details}")
        except json.JSONDecodeError:
            print(f"Response text: {response.text}")
    else:
        print(f"VEP failed for variant {i+1}: {response.status_code}")
        print(f"Response text: {response.text}")

    time.sleep(1) # Add a small delay between requests

if annotated_variant:
    print("\nFound a successfully annotated variant.")
    # You can now use annotated_variant for further steps
    # For example, print the keys in the annotation:
    # print(annotated_variant.keys())
else:
    print("\nCould not successfully annotate any of the first few variants.")

Attempting to annotate variant 1: 13:32921028:CTTTCGG:C
VEP failed for variant 1: 400 - Bad Request.
Error details: {'error': "No variant found with ID '13:32921028:CTTTCGG:C'"}
Attempting to annotate variant 2: 13:32346891:CTTTCGG:C
VEP failed for variant 2: 400 - Bad Request.
Error details: {'error': "No variant found with ID '13:32346891:CTTTCGG:C'"}
Attempting to annotate variant 3: 13:32914766:CTT:C
VEP failed for variant 3: 400 - Bad Request.
Error details: {'error': "No variant found with ID '13:32914766:CTT:C'"}
Attempting to annotate variant 4: 13:32340629:CTT:C
VEP failed for variant 4: 400 - Bad Request.
Error details: {'error': "No variant found with ID '13:32340629:CTT:C'"}
Attempting to annotate variant 5: 13:32915082:CTG:C
VEP failed for variant 5: 400 - Bad Request.
Error details: {'error': "No variant found with ID '13:32915082:CTG:C'"}

Could not successfully annotate any of the first few variants.


In [20]:
# Step 5: Save the Successfully Annotated Variant (if found)
import json

# Ensure annotated_variant is available from Step 4
if 'annotated_variant' in locals() and annotated_variant is not None:
    with open("vep_annotation.json", "w") as f:
        json.dump(annotated_variant, f, indent=4)
    print("Successfully annotated variant saved to vep_annotation.json")
else:
    print("No successfully annotated variant found in Step 4. Skipping Step 5.")

No successfully annotated variant found in Step 4. Skipping Step 5.


In [21]:
# Step 6: Simulate Pathogenicity Scores
bc_data["PolyPhen"] = [0.9] * len(bc_data)
bc_data["SIFT"] = [0.01] * len(bc_data)

# High = damaging
# Low = damaging


In [22]:
# Step 7: Filter Pathogenic Mutations
pathogenic = bc_data[
    (bc_data["PolyPhen"] > 0.85) &
    (bc_data["SIFT"] < 0.05)
]

pathogenic.to_csv("pathogenic_mutations.csv", index=False)


In [23]:
# Step 8: Download Results (Google Colab Only)
try:
    from google.colab import files
    files.download("filtered_mutations.csv")
    files.download("pathogenic_mutations.csv")
    files.download("vep_annotation.json")
except:
    print("Download only works in Google Colab.")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download only works in Google Colab.


In [24]:
# Step 9: List Files in Current Directory
import os
print(os.listdir())


['.config', 'pathogenic_mutations.csv', 'variant_summary.txt.gz', 'filtered_mutations.csv', 'sample_data']


**Detailed Final Interpretation of Findings:**

*   The initial ClinVar mutation dataset was successfully downloaded and processed efficiently by reading it in chunks to manage memory usage.
*   The data was then filtered to specifically identify mutations in the BRCA1 and BRCA2 genes that are associated with a "Breast" phenotype and have a minor allele frequency (MAF) of 0.01 or less.
*   This filtering process resulted in a dataset of potentially relevant mutations, which was saved to `filtered_mutations.csv`.
*   An attempt was made to obtain detailed functional annotations and predicted pathogenicity scores (like PolyPhen and SIFT) for these filtered variants using the Ensembl VEP API.
*   However, the VEP annotation step was unsuccessful because the specific variants from the ClinVar dataset could not be found in the Ensembl VEP database using the provided identifiers and the GRCh37 assembly.
*   Due to the failure to obtain real pathogenicity scores from VEP, placeholder scores for PolyPhen and SIFT were simulated for all filtered variants.
*   Using these *simulated* pathogenicity scores, the filtered mutations were further analyzed to identify those classified as "pathogenic" based on predefined thresholds (PolyPhen > 0.85 and SIFT < 0.05).
*   The list of mutations classified as pathogenic based on these simulated scores was saved to `pathogenic_mutations.csv`.
*   The generated files (`filtered_mutations.csv` and `pathogenic_mutations.csv`) containing the filtered and simulated pathogenic mutation lists are available for download.
*   Note that the pathogenicity classification in `pathogenic_mutations.csv` is based on *simulated* scores due to the VEP annotation failure and may not reflect actual predicted or clinically determined pathogenicity.

**Conclusion of the Workflow:**

*   We successfully downloaded a large ClinVar mutation dataset.
*   The dataset was loaded and processed efficiently in chunks to manage memory.
*   We filtered the data to identify BRCA1/BRCA2 mutations related to breast cancer with low minor allele frequency.
*   An attempt was made to get functional annotations from the Ensembl VEP API, but the specific variants could not be found in the database.
*   Due to the VEP annotation failure, pathogenicity scores were simulated for the filtered variants.
*   We then filtered the mutations based on these simulated scores to identify potentially pathogenic variants.
*   Finally, the filtered and "pathogenic" mutation lists (based on simulated data) were saved to CSV files.