In [7]:
!pip install -q Llama-parse
!pip install -q llama-index
!pip install -q nest-asyncio
!pip install -q pdfminer.six

In [53]:
from pdfminer.high_level import extract_text
#from llama_parse import LlamaParse
import os
import nest_asyncio
import pandas as pd
import re
import csv
from io import StringIO


#### 1. To grab all the .pdf files from the given directory

In [61]:
def process_pdfs_in_directory(dir_path):
    """
    Processes the specified number of .pdf files in the given directory by parsing and saving them as CSVs.
    Args:
    dir_path (str): The path to the directory containing .pdf files.
    
    """
    files = os.listdir(dir_path)
    list_files = []
    pdf_files = [f for f in files if f.endswith('.pdf')]
    pdf_count = 0
    for pdf_file in pdf_files:
        pdf_count+=1
        file_path = os.path.join(dir_path, pdf_file)
        file_name = os.path.splitext(pdf_file)[0]
        list_files += [(file_path, file_name)]
    print("Total num of pdfsS: ", pdf_count)
    return list_files

#### 2. Extract the text from a given pdf (including removing footer text)

In [62]:
def extract_text_from_pdf(pdf_path, start_marker, end_marker):
    text = extract_text(pdf_path)
    start_pos = text.find(start_marker)
    end_pos = text.find(end_marker, start_pos)
    if start_pos == -1 or end_pos == -1:
        return None
    extracted_text = text[start_pos:end_pos + len(end_marker)]
    cleaned_text = re.sub(r'.*Vidium.*Toll-Free: 833-794-0318.*\n', '', extracted_text)
    cleaned_text = re.sub(r'Page\s+\d+\s+of\s+\d+\s*\n', '', cleaned_text)
    cleaned_text = re.sub(r'SearchLight DNA Clinician Report\s*', '', cleaned_text)
    cleaned_text = re.sub(r'Detailed Summary:\n', '', cleaned_text)
    cleaned_text = re.sub(r'Please see Link for a detailed summary of this gene as well as information regarding this variant and its associated canine and human data.\n', '', cleaned_text)
    return cleaned_text

#### 3. Convert the text into sensible chunks 

In [63]:
def text_chunking(text):
    if text is None: 
        return None 
    gene_sections = text.split("\n\n")
    chunks = []
    chunk = ""
    marker_pattern = r"Variant Summary:\n\n.*?\n\n"
    link_info_pattern = r'Please see Link for a detailed summary of this gene as well as information regarding this variant and its associated \ncanine and human data.'
    for section in gene_sections:
        if section.strip():  
            chunk += section + "\n\n"
            marker_match = re.search(marker_pattern, chunk, re.DOTALL)
            if marker_match:
                end_marker_index = marker_match.end()
                cleaned_chunk = re.sub(link_info_pattern, '', chunk[:end_marker_index].strip(), flags=re.DOTALL)
                
                chunks.append(cleaned_chunk.strip())
                chunk = chunk[end_marker_index:].strip() + "\n\n"
    if chunk.strip():
        chunks.append(chunk.strip())
    return chunks

#### 4. Extract mutation summary information from the given text chunk  

In [64]:
def extract_gene_info(chunk):
    #print("Input Chunk:\n", chunk)
    #print("\n----------------------------------------------\n")
    #gene
    gene_match = re.search(r"Gene\s*\n\s*(.*?)\n", chunk, re.DOTALL)
    if not gene_match:
        return None
    gene = gene_match.group(1).strip()
    gene = {'gene':gene,'mutation':'', 'roles':'', 'variant_summary':''}
    # Mutation
    mutation_match = re.search(r"Mutation\s*\n\s*(.*?)\n", chunk, re.DOTALL)
    if mutation_match:
        mutation = mutation_match.group(1).strip()
        gene['mutation'] = mutation
        #print(f"EXTRACTED Mutation: {mutation}")
    
    # Roles in the case
    roles_match = re.search(r"Roles in this case:\s*\n\s*(.*?)\s*Variant Summary:", chunk, re.DOTALL)
    if roles_match:
        roles = roles_match.group(1).strip()
        #print(f"EXTRACTED Roles in this case: {roles}")
    else:
        roles = ""
    gene['roles'] = roles
    
    variant_match = re.search(r"Variant Summary:\s*\n\s*(.*?)(?=\n\n|$)", chunk, re.DOTALL)
    if variant_match:
        variant_summary = variant_match.group(1).strip()
        #print(f"EXTRACTED Variant Summary: {variant_summary}")
    else:
        variant_summary = ""
    gene['variant_summary'] = variant_summary
    return gene




#### 5. Creating the .csv for current file being processed

In [65]:
def mutation_csv(gene_dictionary, file_name):
    output_dir = f"{file_name}_clinical_report"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        
    csv_file_path = os.path.join(output_dir, f"{file_name}_mutsumm.csv")
    headers = ['gene', 'mutation', 'roles', 'variant_summary']
    
    with open(csv_file_path, mode='w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=headers)
        writer.writeheader()
        for key, value in gene_dictionary.items():
            writer.writerow(value)
    print(f"CSV file created at: {csv_file_path}\n")
    output_file = file_name.replace(".csv", "_with_cancers.csv")
    df.to_csv(output_file, index=False)
    return df

# Example usage
file_name = "your_input_file.csv"
df_with_cancers = ner_cancer(file_name)
print(df_with_cancers)
    

#### 6. NER cancer types 

In [None]:
def ner_cancer(file_name):
    df = pd.read_csv(file_name)
    if 'variant_summary' not in df.columns:
        raise ValueError("The input file must contain a 'variant_summary' column.")
    def extract_cancers(text):
        doc = nlp(text)
        cancers = [ent.text for ent in doc.ents if ent.label_ == 'DISEASE']
        return cancers

    df['cancers'] = df['variant'].apply(lambda x: extract_cancers(x) if pd.notnull(x) else [])

    
    
    
    

In [67]:
print("DONE!!!")

DONE!!!


In [66]:
output_dir = 'Searchlight Info/SearchlightReports'
file_list = process_pdfs_in_directory(output_dir)

start_marker = "Mutation Summaries"
end_marker = "Clinical Trials Summary"

for i in file_list: 
    f_path = i[0]
    f_name = i[1]
    print('file name: ', f_name, " file_path: ", f_path, " \n")
    pdf_text_between_markers = extract_text_from_pdf(f_path, start_marker, end_marker)
    mutation_chunks = text_chunking(pdf_text_between_markers)
    temp_gene_dictionary = {}
    gene_counter = 0
    if mutation_chunks is None:
        print(f"File {f_name} at {f_path} skipped (lack of information).")
    else:
        temp_gene_dictionary = {}
        gene_counter = 0
        for i, chunk in enumerate(mutation_chunks):
            #print(f"Chunk {i + 1}:-\n{chunk}\n")
            #print("\n---------------------------------------\n")
            dictionary_gene = extract_gene_info(chunk)
            if dictionary_gene is not None: 
                temp_gene_dictionary[gene_counter] = dictionary_gene
                gene_counter += 1
        mutation_csv(temp_gene_dictionary, f_name)

Total num of pdfsS:  1012
file name:  SL23-000862_1_371  file_path:  Searchlight Info/SearchlightReports/SL23-000862_1_371.pdf  

CSV file created at: SL23-000862_1_371_clinical_report/SL23-000862_1_371_mutsumm.csv

file name:  SL23-000579_1_371  file_path:  Searchlight Info/SearchlightReports/SL23-000579_1_371.pdf  

CSV file created at: SL23-000579_1_371_clinical_report/SL23-000579_1_371_mutsumm.csv

file name:  SL23-000609_1_371  file_path:  Searchlight Info/SearchlightReports/SL23-000609_1_371.pdf  

CSV file created at: SL23-000609_1_371_clinical_report/SL23-000609_1_371_mutsumm.csv

file name:  SL23-000924_1_371  file_path:  Searchlight Info/SearchlightReports/SL23-000924_1_371.pdf  

CSV file created at: SL23-000924_1_371_clinical_report/SL23-000924_1_371_mutsumm.csv

file name:  SL24-000182_1_371  file_path:  Searchlight Info/SearchlightReports/SL24-000182_1_371.pdf  

CSV file created at: SL24-000182_1_371_clinical_report/SL24-000182_1_371_mutsumm.csv

file name:  SL23-001055_