### Preprocessing of BioPAX files

Need : 
- concatenation of independant pathway files into a single OWL file 
- correction of invalid URIs : 
    - that contain spaces inside
    - that contain brackets
    - that contain spaces or tabs at the end
- writing of a correction version of file 

In [3]:
import os
import glob
from requests.utils import requote_uri
from urllib.parse import quote
import re

In [31]:
def concatenate_owl_files(input_files, output_file):
    """
    Concatenates multiple OWL files into a single OWL file with the specified header.
    Ensures valid XML structure and adds line breaks after each file's content.

    Parameters:
    input_files (list of str): List of paths to the OWL files.
    output_file (str): Path to the output concatenated OWL file.
    """
    with open(output_file, 'w') as outfile:
        # Write the XML declaration and the initial RDF element with namespaces
        outfile.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        outfile.write('<rdf:RDF\n')
        outfile.write(' xmlns="https://pantherdb.org/pathways/biopax/P04396#"\n')
        outfile.write(' xmlns:xsd="http://www.w3.org/2001/XMLSchema#"\n')
        outfile.write(' xmlns:owl="http://www.w3.org/2002/07/owl#"\n')
        outfile.write(' xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"\n')
        outfile.write(' xmlns:bp="http://www.biopax.org/release/biopax-level3.owl#"\n')
        outfile.write(' xml:base="https://pantherdb.org/pathways/biopax/P04396#">\n')
        outfile.write('<owl:Ontology rdf:about="">\n')
        outfile.write(' <owl:imports rdf:resource="http://www.biopax.org/release/biopax-level3.owl#" />\n')
        outfile.write('</owl:Ontology>\n')

        for file in input_files:
            with open(file, 'r') as infile:
                lines = infile.readlines()

                for line in lines:
                    stripped_line = line.strip()

                    # Skip XML declarations and specific header lines from individual files
                    if stripped_line.startswith('<?xml') or stripped_line.startswith('<rdf:RDF'):
                        continue
                    if stripped_line.startswith('<owl:Ontology') or stripped_line.startswith('<owl:imports'):
                        continue

                    # Write valid content to the output file
                    outfile.write(line)

                # Add a line break after each file's content
                outfile.write('\n')

        # Closing the single <rdf:RDF> element
        outfile.write('</rdf:RDF>\n')

In [32]:
current_directory = os.getcwd()

PantherBioPAX_file_path = os.path.join(current_directory, 'BioPAX_Data/PantherBioPAX/BioPAX')
Panther_owl_files = glob.glob(os.path.join(PantherBioPAX_file_path, '*.owl'))

concatenate_owl_files(Panther_owl_files, "BioPAX_Data/PantherBioPAX/BioPAX/Panther_standalone_biopax_concatenated.owl")

In [33]:
def correct_invalid_uris(owl_file, output_file):
    """
    Reads the concatenated OWL file, identifies invalid URIs, corrects them conditionally, and writes a new file.

    - If a URI contains whitespace or tabulation, it is corrected using requote_uri.
    - If a URI contains brackets ([]), it is corrected using urllib.parse.quote.

    Parameters:
    owl_file (str): Path to the OWL file to check for invalid URIs.
    output_file (str): Path to write the corrected OWL file.
    """
    # Regular expression to find strings between double quotes
    quote_pattern = re.compile(r'"([^"]*)"')
    
    # Define different sets of invalid characters
    whitespace_or_tab = r'[\s]'  # matches spaces or tabs
    brackets = r'[\[\]]'  # matches square brackets
    
    with open(owl_file, 'r') as infile:
        lines = infile.readlines()

    corrected_lines = []
    invalid_uris = []

    # Iterate through each line of the file to find strings between quotes
    for line_num, line in enumerate(lines, 1):
        # Find all strings between quotes
        quoted_strings = quote_pattern.findall(line)
        corrected_line = line

        for uri_value in quoted_strings:
            corrected_uri = uri_value  # Initially assume the URI is valid

            # Check if the URI contains whitespace or tab characters
            if re.search(whitespace_or_tab, uri_value):
                invalid_uris.append((line_num, uri_value, 'whitespace or tab'))
                # Correct the URI using requote_uri (for spaces/tabs)
                corrected_uri = requote_uri(uri_value)
            
            # Check if the URI contains square brackets
            elif re.search(brackets, uri_value):
                invalid_uris.append((line_num, uri_value, 'brackets'))
                # Correct the URI using quote (for brackets)
                corrected_uri = quote(uri_value, safe=":/#")

            # If the URI was corrected, replace it in the line
            if corrected_uri != uri_value:
                corrected_line = corrected_line.replace(f'"{uri_value}"', f'"{corrected_uri}"')

        # Add the corrected line to the list
        corrected_lines.append(corrected_line)
    
    # Write the corrected lines to a new output OWL file
    with open(output_file, 'w') as outfile:
        outfile.writelines(corrected_lines)

    # Print the invalid URIs and their corrections
    if invalid_uris:
        print("Invalid URIs found and corrected:")
        for line_num, uri, issue in invalid_uris:
            corrected = requote_uri(uri) if issue == 'whitespace or tab' else quote(uri, safe=":/#")
            print(f"Line {line_num}: {uri} -> {corrected} (issue: {issue})")
    else:
        print("No invalid URIs found.")



# Example usage
correct_invalid_uris("BioPAX_Data/PantherBioPAX/BioPAX/Panther_standalone_biopax_concatenated.owl", "BioPAX_Data/PantherBioPAX/BioPAX/Panther_standalone_biopax_concatenated_corrected.owl")

Invalid URIs found and corrected:
Line 168237: _[GnRH_GnRHR]_gnas_s808_csa54_ -> _%5BGnRH_GnRHR%5D_gnas_s808_csa54_ (issue: brackets)
Line 461286: _[GnRH_GnRHR]_gnas_s808_csa54_ -> _%5BGnRH_GnRHR%5D_gnas_s808_csa54_ (issue: brackets)
