In [1]:
%pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six)
  Downloading cryptography-43.0.0-cp39-abi3-macosx_10_9_universal2.whl.metadata (5.4 kB)
Collecting cffi>=1.12 (from cryptography>=36.0.0->pdfminer.six)
  Downloading cffi-1.17.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (1.5 kB)
Collecting pycparser (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six)
  Downloading pycparser-2.22-py3-none-any.whl.metadata (943 bytes)
Downloading pdfminer.six-20240706-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading cryptography-43.0.0-cp39-abi3-macosx_10_9_universal2.whl (6.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading cffi-1.17.0-cp310-cp310-macosx_11_0_arm64.whl (178

In [2]:
import re
from pdfminer.high_level import extract_text

def extracting_uncertainVariants(file_path):
    """Extract the variants of uncertain significance listed in a PDF report.

    The PDF text is extracted with ``extract_text``; the span that begins at
    the "Variants of Uncertain Significance" heading and ends just before the
    next "References" heading is isolated; every bullet in that span of the
    form ``• GENE (p.Change)``, ``• GENE (Copy Number Gain)`` or
    ``• GENE (Copy Number Loss)`` is collected.

    Args:
        file_path: Path of the PDF report, passed straight to ``extract_text``.

    Returns:
        list[str] | None: ``"GENE (detail)"`` strings in document order.
        An empty list when the section cannot be located or contains no
        matching bullets. ``None`` when any exception was raised while
        processing the file (the error is printed, not re-raised).
    """
    # Headings that delimit the section of interest.
    start_heading = "Variants of Uncertain Significance\n"
    end_heading = "References\n"

    # Bullet, gene symbol, then either a protein change or a copy-number call
    # in parentheses.
    pattern = r'•\s+([A-Za-z0-9]+)\s+\((p\.[A-Za-z0-9_]+|Copy Number Gain|Copy Number Loss)\)'

    try:
        # Extract the full text from the PDF
        full_text = extract_text(file_path)

        start_idx = full_text.find(start_heading)
        if start_idx == -1:
            # No such section in this document: nothing to report.
            return []

        # Search for the end heading only *after* the start heading. A
        # "References" line earlier in the document (for example in a contents
        # page) must not hide the section.
        end_idx = full_text.find(end_heading, start_idx + len(start_heading))
        if end_idx == -1:
            # Without a closing heading the section boundary is unknown; do not
            # guess, to avoid picking up bullets from unrelated sections.
            return []

        section_text = full_text[start_idx:end_idx]

        # Each match is a (gene, detail) tuple from the two capture groups.
        matches = re.findall(pattern, section_text)
        return [f'{gene} ({detail})' for gene, detail in matches]

    except Exception as e:
        # Best-effort batch processing: report the failure and let the caller
        # decide how to record it.
        print(f"Error occurred while processing {file_path}: {e}")
        return None

def main(folder_path, output_path='output.csv'):
    """Extract uncertain variants from every PDF in a folder and save them as CSV.

    Each PDF in ``folder_path`` is passed to ``extracting_uncertainVariants``.
    The file name without its extension is used as the case number. One CSV
    row is written per case, with the variants joined by ``'; '``. Cases for
    which extraction returned ``None`` get the text
    ``"Error occurred in this case"`` instead.

    Args:
        folder_path: Directory containing the PDF reports (not searched
            recursively).
        output_path: Destination CSV file. Defaults to ``'output.csv'`` in the
            current working directory.
    """
    var_dict = {}

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the CSV reproducible from run to run.
    for filename in sorted(os.listdir(folder_path)):
        # Accept ".pdf" in any letter case so "REPORT.PDF" is not skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(folder_path, filename)
        print(f"Processing: {file_path}")
        case_number = os.path.splitext(filename)[0]

        uncertainVariants = extracting_uncertainVariants(file_path)
        if uncertainVariants is not None:
            var_dict[case_number] = uncertainVariants
        else:
            var_dict[case_number] = "Error occurred in this case"

    # Write the dictionary to a CSV file
    fieldnames = ['Case Number', 'Uncertain Variants']
    with open(output_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for case_number, variants in var_dict.items():
            # Lists become a single '; '-separated cell; the error marker is
            # already a plain string and is written unchanged.
            cell = '; '.join(variants) if isinstance(variants, list) else variants
            writer.writerow({'Case Number': case_number, 'Uncertain Variants': cell})

    print(f"CSV file '{output_path}' created successfully.")




In [6]:
import os
import csv

In [7]:
# Run the extraction over every PDF in the reports folder; main() writes the
# results to 'output.csv' in the current working directory.
# NOTE(review): absolute, machine-specific path — consider a configurable
# data directory so the notebook runs on other machines.
main("/Users/diyasaha/Cancer_research/Searchlight Info/SearchlightReports")

Processing: /Users/diyasaha/Cancer_research/Searchlight Info/SearchlightReports/SL23-000862_1_371.pdf
Processing: /Users/diyasaha/Cancer_research/Searchlight Info/SearchlightReports/SL23-000579_1_371.pdf
Processing: /Users/diyasaha/Cancer_research/Searchlight Info/SearchlightReports/SL23-000609_1_371.pdf
Processing: /Users/diyasaha/Cancer_research/Searchlight Info/SearchlightReports/SL23-000924_1_371.pdf
Processing: /Users/diyasaha/Cancer_research/Searchlight Info/SearchlightReports/SL24-000182_1_371.pdf
Processing: /Users/diyasaha/Cancer_research/Searchlight Info/SearchlightReports/SL23-001055_1_371.pdf
Processing: /Users/diyasaha/Cancer_research/Searchlight Info/SearchlightReports/SL23-000187_1_370.pdf
Processing: /Users/diyasaha/Cancer_research/Searchlight Info/SearchlightReports/SL23-000464_1_370.pdf
Processing: /Users/diyasaha/Cancer_research/Searchlight Info/SearchlightReports/SL23-000673_1_371.pdf
Processing: /Users/diyasaha/Cancer_research/Searchlight Info/SearchlightReports/SL

In [None]:
import csv

def get_error_cases(file_path):
    """Return the case numbers whose CSV row is flagged as failed.

    Args:
        file_path: Path of a CSV file with ``'Case Number'`` and
            ``'Uncertain Variants'`` columns.

    Returns:
        list[str]: The ``'Case Number'`` value of every row whose
        ``'Uncertain Variants'`` cell equals ``"Error occurred in this case"``,
        in file order.

    Raises:
        KeyError: If a required column is missing from the file.
    """
    error_marker = "Error occurred in this case"

    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; otherwise line breaks embedded in quoted fields
    # are altered on read.
    with open(file_path, 'r', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        return [
            row['Case Number']
            for row in reader
            if row['Uncertain Variants'] == error_marker
        ]

# main() writes its results to 'output.csv' relative to the current working
# directory, so read the same relative path here instead of a hard-coded
# absolute location that only exists on one machine.
file_path = "output.csv"  # Path to your CSV file
error_cases = get_error_cases(file_path)
print("Case Numbers with Errors:", error_cases)


Case Numbers with Errors: []


In [None]:
# Individual reports to run through extracting_uncertainVariants one at a time.
# NOTE(review): hard-coded '/content/...' locations — presumably an upload
# directory of a hosted runtime; confirm before running elsewhere.
file_paths = [
    '/content/SL23-000532_1_371.pdf',
    '/content/SL23-000335_1_370.pdf',
    '/content/SL23-000336_1_370.pdf',
    '/content/SL23-000334_1_370.pdf',
]

In [None]:
# Print the parsed variant list for each listed report; a report that failed
# to process prints None (the function reports the error itself).
for path in file_paths:
  extracted = extracting_uncertainVariants(path)
  print(extracted)

Variants of Uncertain Significance
The following variants were detected in Jed Ramirez's tumor sample. These variants are considered variants of uncertain 
significance, meaning the functional impact of the alteration on gene function is unknown or the role of the mutation in 
tumor diagnosis, prognosis, or treatment is unknown. Future research may reveal a role for the mutations in cancer.

• KMT2D (p.Gln2023Glu)

• MAPK1 (p.Ala6_Ala7del)

• MAPK1 (Copy Number Gain)

• NOTCH1 (Copy Number Loss)

• POLE (p.Gly1809Ser)

Vidium  I  Toll-Free: 833-794-0318

Page 10 of 15 

SearchLight DNA Clinician Report


['KMT2D (p.Gln2023Glu)', 'MAPK1 (p.Ala6_Ala7del)', 'MAPK1 (Copy Number Gain)', 'NOTCH1 (Copy Number Loss)', 'POLE (p.Gly1809Ser)']
Variants of Uncertain Significance
The following variants were detected in Ellie Mulligan's tumor sample. These variants are considered variants of 
uncertain significance, meaning the functional impact of the alteration on gene function is unknown or the 