In [2]:
import requests
import os

# Define the base URL for the KEGG API
BASE_URL = "http://rest.kegg.jp/"

# Function to get the list of human pathways
def get_human_pathways():
    response = requests.get(f"{BASE_URL}list/pathway/hsa")
    if response.ok:
        pathway_ids = response.text
        # The pathway identifiers are in the first column, separated by a tab character
        return [line.split("\t")[0].replace('path:', '') for line in pathway_ids.strip().split("\n")]
    else:
        print("Failed to retrieve pathway list")
        return []

# Function to download KGML files for each pathway
def download_kgml_files(pathway_ids):
    # Create a directory named 'KGML' if it doesn't exist
    os.makedirs('KGML', exist_ok=True)

    for pathway_id in pathway_ids:
        response = requests.get(f"{BASE_URL}get/{pathway_id}/kgml")
        if response.ok:
            # Save the file to the 'KGML' directory
            filename = f"KGML/{pathway_id}.kgml"
            with open(filename, 'w') as file:
                file.write(response.text)
            print(f"Downloaded {filename}")
        else:
            print(f"Failed to download KGML for {pathway_id}")

# Main function to perform the download
def main():
    pathway_ids = get_human_pathways()
    download_kgml_files(pathway_ids)

# Run the main function
if __name__ == "__main__":
    main()


Downloaded KGML/hsa01100.kgml
Downloaded KGML/hsa01200.kgml
Downloaded KGML/hsa01210.kgml
Downloaded KGML/hsa01212.kgml
Downloaded KGML/hsa01230.kgml
Downloaded KGML/hsa01232.kgml
Downloaded KGML/hsa01250.kgml
Downloaded KGML/hsa01240.kgml
Downloaded KGML/hsa00010.kgml
Downloaded KGML/hsa00020.kgml
Downloaded KGML/hsa00030.kgml
Downloaded KGML/hsa00040.kgml
Downloaded KGML/hsa00051.kgml
Downloaded KGML/hsa00052.kgml
Downloaded KGML/hsa00053.kgml
Downloaded KGML/hsa00500.kgml
Downloaded KGML/hsa00520.kgml
Downloaded KGML/hsa00620.kgml
Downloaded KGML/hsa00630.kgml
Downloaded KGML/hsa00640.kgml
Downloaded KGML/hsa00650.kgml
Downloaded KGML/hsa00562.kgml
Downloaded KGML/hsa00190.kgml
Downloaded KGML/hsa00910.kgml
Downloaded KGML/hsa00920.kgml
Downloaded KGML/hsa00061.kgml
Downloaded KGML/hsa00062.kgml
Downloaded KGML/hsa00071.kgml
Downloaded KGML/hsa00100.kgml
Downloaded KGML/hsa00120.kgml
Downloaded KGML/hsa00140.kgml
Downloaded KGML/hsa00561.kgml
Downloaded KGML/hsa00564.kgml
Downloaded

In [3]:
import csv
import os
import xml.etree.ElementTree as ET
from collections import defaultdict
from itertools import product

# Function to parse a single KGML file and extract the relevant information
def parse_kgml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    pathway_name = root.get('name').split(":")[-1]  # Assuming the pathway name is like "path:hsa04810"
    pathway_info = {'pathway': pathway_name, 'relations': defaultdict(int)}

    entries = {}
    for entry in root.findall('entry'):
        entry_id = entry.get('id')
        entities = entry.get('name').split()
        entries[entry_id] = entities

    for relation in root.findall('relation'):
        entry1 = relation.get('entry1')
        entry2 = relation.get('entry2')
        interaction_types = [subtype.get('name') for subtype in relation.findall('subtype')]

        # Generate all possible combinations of relations
        for starter, receiver in product(entries.get(entry1, []), entries.get(entry2, [])):
            pathway_info['relations'][(starter, receiver, tuple(interaction_types))] += 1

        # If the relation is reversible, add the reverse relations as well
        if 'reversible' in interaction_types:
            for starter, receiver in product(entries.get(entry2, []), entries.get(entry1, [])):
                pathway_info['relations'][(starter, receiver, tuple(interaction_types))] += 1

    return pathway_info

# Function to process all KGML files and write the CSV
def process_kgml_files(kgml_directory, output_csv, log_file):
    all_pathway_info = defaultdict(lambda: defaultdict(int))
    failed_files = []

    # Process each KGML file and aggregate the information
    for filename in os.listdir(kgml_directory):
        if filename.endswith('.kgml'):
            try:
                pathway_info = parse_kgml(os.path.join(kgml_directory, filename))
                for relation, count in pathway_info['relations'].items():
                    all_pathway_info[relation][pathway_info['pathway']] += count
            except ET.ParseError as e:
                # Log the error with the filename
                failed_files.append((filename, str(e)))

    # Write the failed files and errors to a log file
    with open(log_file, 'w') as logf:
        for file, error in failed_files:
            logf.write(f"{file}: {error}\n")
    
    # Write the information to a CSV file
    with open(output_csv, 'w', newline='') as csvfile:
        fieldnames = ['starter_ID', 'receiver_ID', 'interaction_types', 'pathways', 'credibility']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        
        for (starter, receiver, interaction_types), pathways_info in all_pathway_info.items():
            pathway_counts = "; ".join(f"{path} ({count})" for path, count in pathways_info.items())
            total_count = sum(pathways_info.values())
            writer.writerow({
                'starter_ID': starter,
                'receiver_ID': receiver,
                'interaction_types': ", ".join(interaction_types),
                'pathways': pathway_counts,
                'credibility': total_count
            })

# Main function to start the process
def main():
    kgml_directory = 'KGML'
    output_csv = 'pathway_relations.csv'
    log_file = 'failed_kgml_files.txt'
    process_kgml_files(kgml_directory, output_csv, log_file)

if __name__ == "__main__":
    main()


Now it seems that the code parsed all the KGML file without giving out single one failed cased and stored the results in pathway_relations.csv. However, by checking the pathway map closer, I found that there are two relations, one is named relation, and the other is named reaction, only reaction have reversible and irrevesible type, so we don't need to deal with the reversible and irreversible relations when parsing the KGML only to extract the pathway relations. Now, I want you to revise the code to extract all the relations again, but also record the relation type in the finalized csv file. Moreover, when recording the source of the relation, part from using the name of the pathway map like path:hsa00250, please also include the title of the map: path:hsa00250 Alanine, aspartate and glutamate metabolism. The name and title of the map is stored in KGML in the following format: """<pathway name="path:hsa00250" org="hsa" number="00250"
         title="Alanine, aspartate and glutamate metabolism"
         image="https://www.kegg.jp/kegg/pathway/hsa/hsa00250.png"
         link="https://www.kegg.jp/kegg-bin/show_pathway?hsa00250">"""

In [4]:
import csv
import os
import xml.etree.ElementTree as ET
from collections import defaultdict
from itertools import product

def parse_kgml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    pathway_id = root.get('name')
    pathway_title = root.get('title')
    pathway_source = f"{pathway_id} {pathway_title}"

    entries = {}
    for entry in root.findall('entry'):
        entry_id = entry.get('id')
        entities = entry.get('name').split()
        entries[entry_id] = entities

    relations = defaultdict(lambda: {'count': 0, 'pathways': set()})
    for relation in root.findall('relation'):
        entry1 = relation.get('entry1')
        entry2 = relation.get('entry2')
        relation_type = relation.get('type')
        subtypes = [subtype.get('name') for subtype in relation.findall('subtype') if subtype.get('name') not in ["missing interaction", "indirect effect"]]

        if not subtypes:
            continue

        for starter, receiver in product(entries.get(entry1, []), entries.get(entry2, [])):
            for subtype_name in subtypes:
                # Create a unique key for each relation
                key = (starter, receiver, relation_type, subtype_name)
                relations[key]['count'] += 1
                relations[key]['pathways'].add(pathway_source)

    # Transform relations into a list of dictionaries for easier CSV writing
    relations_info = [
        {
            'starter_ID': key[0],
            'receiver_ID': key[1],
            'relation_type': key[2],
            'subtype_name': key[3],
            'pathway_source': ', '.join(rel['pathways']),
            'credibility': rel['count']
        }
        for key, rel in relations.items()
    ]

    reactions = []
    for reaction in root.findall('reaction'):
        reaction_id = reaction.get('id')
        reaction_type = reaction.get('type')
        substrates = [substrate.get('name') for substrate in reaction.findall('substrate')]
        products = [product.get('name') for product in reaction.findall('product')]

        reaction_entry = {
            'id': reaction_id,
            'type': reaction_type,
            'pathway': pathway_source,
            'substrates': substrates,
            'products': products
        }
        reactions.append(reaction_entry)

        # If the reaction is reversible, add a reversed entry
        if reaction_type == 'reversible':
            reversed_entry = {
                'id': reaction_id,
                'type': 'reversible_reverse',
                'pathway': pathway_source,
                'substrates': products,
                'products': substrates
            }
            reactions.append(reversed_entry)

    return {'relations_info': relations_info, 'reactions': reactions}

def write_to_csv(relations_info, reactions, relations_csv, reactions_csv):
    # Write relations to CSV
    with open(relations_csv, 'w', newline='') as csvfile:
        fieldnames = ['starter_ID', 'receiver_ID', 'relation_type', 'subtype_name', 'pathway_source', 'credibility']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for relation in relations_info:
            writer.writerow(relation)

    # Write reactions to CSV, finding the maximum number of substrates and products
    max_subs = max((len(r['substrates']) for r in reactions), default=0)
    max_prods = max((len(r['products']) for r in reactions), default=0)
    reaction_fieldnames = ['id', 'type', 'pathway'] + \
        [f'substrate{i+1}' for i in range(max_subs)] + \
        [f'product{i+1}' for i in range(max_prods)]
    
    with open(reactions_csv, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=reaction_fieldnames)
        writer.writeheader()
        for reaction in reactions:
            row = {
                'id': reaction['id'],
                'type': reaction['type'],
                'pathway': reaction['pathway'],
            }
            row.update({f'substrate{i+1}': sub for i, sub in enumerate(reaction['substrates'])})
            row.update({f'product{i+1}': prod for i, prod in enumerate(reaction['products'])})
            writer.writerow(row)

# Main function to start the process
def main():
    kgml_directory = 'KGML'
    relations_csv = 'pathway_relations.csv'
    reactions_csv = 'pathway_reactions.csv'
    all_relations_info = []
    all_reactions = []

    # Process each KGML file and aggregate the information
    for filename in os.listdir(kgml_directory):
        if filename.endswith('.kgml'):
            file_path = os.path.join(kgml_directory, filename)
            parsed_data = parse_kgml(file_path)
            all_relations_info.extend(parsed_data['relations_info'])
            all_reactions.extend(parsed_data['reactions'])

    # Write the aggregated information to CSV files
    write_to_csv(all_relations_info, all_reactions, relations_csv, reactions_csv)

if __name__ == "__main__":
    main()

In [8]:
import csv

def extract_unique_ids(relations_csv, reactions_csv):
    unique_ids = set()

    # Extract from pathway_relations.csv
    with open(relations_csv, mode='r') as infile:
        reader = csv.DictReader(infile)
        for row in reader:
            unique_ids.add(row['starter_ID'])
            unique_ids.add(row['receiver_ID'])

    # Extract from pathway_reactions.csv
    with open(reactions_csv, mode='r') as infile:
        reader = csv.DictReader(infile)
        for row in reader:
            for key in row.keys():
                if 'substrate' in key or 'product' in key:
                    unique_ids.add(row[key])

    return unique_ids

# Use the function to extract IDs
relations_csv = 'pathway_relations.csv'  # Make sure to provide the correct path to your CSV file
reactions_csv = 'pathway_reactions.csv'  # Make sure to provide the correct path to your CSV file
entity_ids = extract_unique_ids(relations_csv, reactions_csv)

# Now you have all unique entity IDs in the `entity_ids` set
print(f"Extracted {len(entity_ids)} unique entity IDs.")

def extract_unique_id_prefixes(unique_ids):
    # Extract the first three characters of each ID and count their occurrences
    prefix_counts = {}
    for kegg_id in unique_ids:
        # Extract the first three characters
        prefix = kegg_id[:3]
        if prefix:
            prefix_counts[prefix] = prefix_counts.get(prefix, 0) + 1
    return prefix_counts

# Process the unique ID prefixes
prefix_counts = extract_unique_id_prefixes(entity_ids)

# Print the unique prefixes and their counts
for prefix, count in prefix_counts.items():
    print(f"Prefix: {prefix}, Count: {count}")

Extracted 8230 unique entity IDs.
Prefix: hsa, Count: 6281
Prefix: cpd, Count: 1602
Prefix: gl:, Count: 205
Prefix: pat, Count: 103
Prefix: dr:, Count: 37
Prefix: und, Count: 1


In [9]:
def remove_undefined_rows(input_csv, output_csv):
    # Read the existing CSV file
    with open(input_csv, mode='r') as infile:
        reader = csv.DictReader(infile)
        rows = [row for row in reader if row['starter_ID'] != 'undefined' and row['receiver_ID'] != 'undefined']

    # Write the cleaned data to a new CSV file
    with open(output_csv, mode='w', newline='') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=reader.fieldnames)
        writer.writeheader()
        writer.writerows(rows)

# Assuming 'pathway_relations.csv' is the name of your input file
input_csv = 'pathway_relations.csv'
output_csv = 'pathway_relations_cleaned.csv'

# Call the function to remove rows with 'undefined' IDs
remove_undefined_rows(input_csv, output_csv)

In [12]:
import csv
import random

def load_data(csv_file):
    with open(csv_file, mode='r') as infile:
        reader = csv.DictReader(infile)
        data = [row for row in reader]
    return data

def identify_unique_pathways(data):
    unique_pathways = set()
    for row in data:
        pathways = row['pathway_source'].split(', ')
        for pathway in pathways:
            unique_pathways.add(pathway)
    return list(unique_pathways)

def split_pathways(pathways, splits=(0.7, 0.15, 0.15)):
    random.shuffle(pathways)
    n = len(pathways)
    train_end = int(splits[0] * n)
    val_end = train_end + int(splits[1] * n)
    train, val, test = pathways[:train_end], pathways[train_end:val_end], pathways[val_end:]
    return train, val, test

def split_data(data, train_pathways, val_pathways, test_pathways):
    train, val, test = [], [], []
    for row in data:
        sources = set(row['pathway_source'].split(', '))
        # Priority: test > val > train
        if sources & set(test_pathways):
            test.append(row)
        elif sources & set(val_pathways):
            val.append(row)
        else:
            train.append(row)
    return train, val, test

def write_data(data, filename, fieldnames):
    with open(filename, mode='w', newline='') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

relations_data = load_data('pathway_relations_cleaned.csv')
reactions_data = load_data('pathway_reactions.csv')

unique_pathways = identify_unique_pathways(relations_data + reactions_data)
train_pathways, val_pathways, test_pathways = split_pathways(unique_pathways)

relations_train, relations_val, relations_test = split_data(relations_data, train_pathways, val_pathways, test_pathways)
reactions_train, reactions_val, reactions_test = split_data(reactions_data, train_pathways, val_pathways, test_pathways)

write_data(relations_train, 'relations_train.csv', relations_data[0].keys())
write_data(relations_val, 'relations_val.csv', relations_data[0].keys())
write_data(relations_test, 'relations_test.csv', relations_data[0].keys())

write_data(reactions_train, 'reactions_train.csv', reactions_data[0].keys())
write_data(reactions_val, 'reactions_val.csv', reactions_data[0].keys())
write_data(reactions_test, 'reactions_test.csv', reactions_data[0].keys())

print(f"Total number of relations - Train: {len(relations_train)}, Val: {len(relations_val)}, Test: {len(relations_test)}")
print(f"Total number of reactions - Train: {len(reactions_train)}, Val: {len(reactions_val)}, Test: {len(reactions_test)}")



Total number of relations - Train: 69339, Val: 14385, Test: 20312
Total number of reactions - Train: 4762, Val: 411, Test: 721


In [13]:
def load_ids_from_csv(csv_file):
    unique_ids = set()
    with open(csv_file, mode='r') as infile:
        reader = csv.DictReader(infile)
        for row in reader:
            unique_ids.add(row['starter_ID'])
            unique_ids.add(row['receiver_ID'])
    return unique_ids

def check_id_coverage(train_csv, val_csv, test_csv):
    train_ids = load_ids_from_csv(train_csv)
    val_ids = load_ids_from_csv(val_csv)
    test_ids = load_ids_from_csv(test_csv)

    val_not_in_train = val_ids - train_ids
    test_not_in_train = test_ids - train_ids

    print(f"Validation IDs not in Train: {len(val_not_in_train)}")
    print(f"Test IDs not in Train: {len(test_not_in_train)}")

    return val_not_in_train, test_not_in_train

# Assuming CSV files are named as follows:
train_csv = 'relations_train.csv'
val_csv = 'relations_val.csv'
test_csv = 'relations_test.csv'

# Check the ID coverage
val_not_covered, test_not_covered = check_id_coverage(train_csv, val_csv, test_csv)

# If you also want to see the specific IDs not covered, you can print val_not_covered and test_not_covered
print("Validation IDs not in Train:", val_not_covered)
print("Test IDs not in Train:", test_not_covered)

Validation IDs not in Train: 403
Test IDs not in Train: 1354
Validation IDs not in Train: {'hsa:25970', 'hsa:123688', 'hsa:3145', 'hsa:8876', 'hsa:3770', 'hsa:5689', 'hsa:3795', 'hsa:8554', 'hsa:10963', 'hsa:9825', 'hsa:1768', 'hsa:23237', 'hsa:85358', 'hsa:4091', 'hsa:8473', 'hsa:51005', 'hsa:3972', 'hsa:6675', 'hsa:25981', 'hsa:170712', 'hsa:645', 'cpd:C20793', 'hsa:9739', 'hsa:3077', 'hsa:6839', 'hsa:6662', 'hsa:164668', 'hsa:9377', 'hsa:2592', 'gl:G00369', 'cpd:C00017', 'hsa:60436', 'hsa:1347', 'hsa:56681', 'hsa:1262', 'hsa:79709', 'hsa:387893', 'cpd:C05776', 'hsa:79823', 'hsa:60489', 'hsa:2488', 'hsa:23645', 'hsa:100526767', 'hsa:58508', 'hsa:3163', 'hsa:10961', 'hsa:4905', 'hsa:269', 'hsa:433', 'hsa:9623', 'hsa:27018', 'hsa:10130', 'hsa:4512', 'hsa:337876', 'hsa:10113', 'cpd:C01089', 'hsa:54187', 'hsa:4741', 'hsa:9757', 'hsa:1174', 'hsa:1356', 'hsa:2584', 'hsa:6419', 'hsa:165324', 'hsa:7067', 'hsa:1340', 'hsa:25956', 'hsa:9765', 'hsa:1329', 'hsa:53354', 'hsa:4513', 'hsa:60496', '

In [14]:
# Adjusted script to work with the provided reactions set structure

def load_ids_from_csv(csv_file, id_fields):
    unique_ids = set()
    with open(csv_file, mode='r') as infile:
        reader = csv.DictReader(infile)
        for row in reader:
            for id_field in id_fields:
                if row[id_field] != '':  # Ensure the field is not empty
                    unique_ids.add(row[id_field])
    return unique_ids

def remove_unseen_entities(data, train_ids, id_fields):
    cleaned_data = []
    for row in data:
        if all(row[id_field] in train_ids or row[id_field] == '' for id_field in id_fields):
            cleaned_data.append(row)
    return cleaned_data

def check_id_coverage_and_clean(train_csv, val_csv, test_csv, id_fields):
    train_ids = load_ids_from_csv(train_csv, id_fields)
    val_ids = load_ids_from_csv(val_csv, id_fields)
    test_ids = load_ids_from_csv(test_csv, id_fields)

    val_not_in_train = val_ids - train_ids
    test_not_in_train = test_ids - train_ids

    print(f"Validation IDs not in Train: {len(val_not_in_train)}")
    print(f"Test IDs not in Train: {len(test_not_in_train)}")

    # Load the actual data
    val_data = load_data(val_csv)
    test_data = load_data(test_csv)

    # Clean the validation and test data
    cleaned_val_data = remove_unseen_entities(val_data, train_ids, id_fields)
    cleaned_test_data = remove_unseen_entities(test_data, train_ids, id_fields)

    # Write the cleaned data to new CSV files
    write_data(cleaned_val_data, 'cleaned_' + val_csv, val_data[0].keys())
    write_data(cleaned_test_data, 'cleaned_' + test_csv, test_data[0].keys())

    return val_not_in_train, test_not_in_train

# Define the ID fields for reactions
reaction_id_fields = [
    'substrate1', 'substrate2', 'substrate3', 'substrate4', 'substrate5',
    'product1', 'product2', 'product3', 'product4', 'product5'
]

# Assuming CSV files are named as follows:
train_csv = 'reactions_train.csv'
val_csv = 'reactions_val.csv'
test_csv = 'reactions_test.csv'

# Check the ID coverage and clean the datasets for reactions
reactions_val_not_covered, reactions_test_not_covered = check_id_coverage_and_clean(
    train_csv,
    val_csv,
    test_csv,
    reaction_id_fields
)

# Output the number of entities not covered
print("Validation Reaction IDs not in Train:", reactions_val_not_covered)
print("Test Reaction IDs not in Train:", reactions_test_not_covered)


Validation IDs not in Train: 29
Test IDs not in Train: 66
Validation Reaction IDs not in Train: {'cpd:C16221', 'cpd:C16173', 'cpd:C16220', 'cpd:C00999', 'cpd:C04088', 'cpd:C03691', 'cpd:C06508', 'cpd:C05399', 'cpd:C16217', 'cpd:C16300', 'cpd:C16216', 'cpd:C05774', 'cpd:C05400', 'cpd:C16218', 'cpd:C14818', 'cpd:C16375', 'cpd:C05401', 'cpd:C16389', 'cpd:C04079', 'cpd:C16388', 'cpd:C03410', 'cpd:C16374', 'cpd:C03688', 'cpd:C00054', 'cpd:C16376', 'cpd:C14819', 'cpd:C00996', 'cpd:C01235', 'cpd:C16387'}
Test Reaction IDs not in Train: {'cpd:C16550', 'cpd:C15976', 'gl:G10794', 'cpd:C11583', 'cpd:C07644', 'cpd:C16549', 'cpd:C16608', 'dr:D04716 cpd:C08012', 'dr:D07704 cpd:C07572', 'cpd:C07645', 'gl:G13153', 'cpd:C16586', 'cpd:C15974', 'dr:D00399 cpd:C07185', 'cpd:C00027', 'dr:D08233 cpd:C01516', 'dr:D00358 cpd:C07073', 'cpd:C16546', 'cpd:C16551', 'cpd:C07643', 'cpd:C16578', 'dr:D00195 cpd:C06174', 'cpd:C00349', 'cpd:C11038', 'cpd:C06049', 'cpd:C16662', 'cpd:C16587', 'cpd:C16596', 'cpd:C16609', 

In [7]:
# Adjusted script to work with the provided relations set structure
import csv

def load_data(csv_file):
    with open(csv_file, mode='r') as infile:
        reader = csv.DictReader(infile)
        data = [row for row in reader]
    return data

def load_ids_from_csv(csv_file, id_fields):
    unique_ids = set()
    with open(csv_file, mode='r') as infile:
        reader = csv.DictReader(infile)
        for row in reader:
            # Check if the relation_type is not 'no_relation'
            if row.get('relation_type', '') != 'no_relation':
                for id_field in id_fields:
                    if row[id_field] != '':  # Ensure the field is not empty
                        unique_ids.add(row[id_field])
    return unique_ids

def remove_unseen_entities(data, train_ids, id_fields):
    cleaned_data = []
    for row in data:
        if all(row[id_field] in train_ids or row[id_field] == '' for id_field in id_fields):
            cleaned_data.append(row)
    return cleaned_data

def remove_unseen_entities_relations(data, train_ids):
    cleaned_data = []
    for row in data:
        # Check if both starter and receiver IDs are in the training set
        if row['starter_ID'] in train_ids and row['receiver_ID'] in train_ids:
            cleaned_data.append(row)
    return cleaned_data

def write_data(data, filename, fieldnames):
    with open(filename, mode='w', newline='') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

def check_id_coverage_and_clean_relations(train_csv, val_csv, test_csv):
    # Load the unique IDs from the training set
    train_ids = load_ids_from_csv(train_csv, ['starter_ID', 'receiver_ID'])
    
    # Load the validation and test sets
    val_data = load_data(val_csv)
    test_data = load_data(test_csv)
    
    # Clean the validation and test data by removing unseen entities
    cleaned_val_data = remove_unseen_entities_relations(val_data, train_ids)
    cleaned_test_data = remove_unseen_entities_relations(test_data, train_ids)

    # Write the cleaned data to new CSV files
    write_data(cleaned_val_data, 'cleaned_' + val_csv, val_data[0].keys())
    write_data(cleaned_test_data, 'cleaned_' + test_csv, test_data[0].keys())

    # Return the counts of the original and cleaned datasets for comparison
    return len(val_data), len(cleaned_val_data), len(test_data), len(cleaned_test_data)

# File paths for the relations datasets
train_csv = 'relations_train_final.csv'
val_csv = 'relations_val_final.csv'
test_csv = 'relations_test_final.csv'

# Check the ID coverage and clean the datasets for relations
original_val_count, cleaned_val_count, original_test_count, cleaned_test_count = check_id_coverage_and_clean_relations(
    train_csv, val_csv, test_csv
)

# Output the number of entities before and after cleaning
print(f"Original Validation Set Count: {original_val_count}, Cleaned: {cleaned_val_count}")
print(f"Original Test Set Count: {original_test_count}, Cleaned: {cleaned_test_count}")


Original Validation Set Count: 18133, Cleaned: 18128
Original Test Set Count: 14220, Cleaned: 14173


In [17]:
import csv
from collections import Counter

def count_unique_subtypes_proportions(csv_file):
    subtypes = Counter()
    total_count = 0
    with open(csv_file, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            subtypes[row['subtype_name']] += 1
            total_count += 1
    
    proportions = {subtype: count / total_count for subtype, count in subtypes.items()}
    return subtypes, proportions, total_count

# Paths to the CSV files
csv_files = ['relations_train.csv', 'cleaned_relations_val.csv', 'cleaned_relations_test.csv']

# Count and print the number and proportion of each unique subtype_name in each file
for csv_file in csv_files:
    subtypes_count, proportions, total_count = count_unique_subtypes_proportions(csv_file)
    print(f"File: {csv_file} (Total Relations: {total_count})")
    for subtype, count in subtypes_count.items():
        print(f"  Subtype: {subtype}, Count: {count}, Proportion: {proportions[subtype]:.2f}")
    print("\n")


File: relations_train.csv (Total Relations: 69339)
  Subtype: compound, Count: 19259, Proportion: 0.28
  Subtype: activation, Count: 27615, Proportion: 0.40
  Subtype: phosphorylation, Count: 5023, Proportion: 0.07
  Subtype: inhibition, Count: 6283, Proportion: 0.09
  Subtype: expression, Count: 3163, Proportion: 0.05
  Subtype: binding/association, Count: 5770, Proportion: 0.08
  Subtype: dephosphorylation, Count: 1439, Proportion: 0.02
  Subtype: dissociation, Count: 211, Proportion: 0.00
  Subtype: ubiquitination, Count: 171, Proportion: 0.00
  Subtype: methylation, Count: 8, Proportion: 0.00
  Subtype: repression, Count: 102, Proportion: 0.00
  Subtype: state change, Count: 249, Proportion: 0.00
  Subtype: indirect, Count: 46, Proportion: 0.00


File: cleaned_relations_val.csv (Total Relations: 12292)
  Subtype: compound, Count: 708, Proportion: 0.06
  Subtype: activation, Count: 7285, Proportion: 0.59
  Subtype: phosphorylation, Count: 1042, Proportion: 0.08
  Subtype: inhibition

In [5]:
import pandas as pd
import random

def load_csv(file_name):
    return pd.read_csv(file_name)

def save_csv(df, file_name):
    df.to_csv(file_name, index=False)

def generate_no_relation_entries(relations_df, all_relations):
    no_relation_count = int(len(relations_df) * 0.5)
    no_relation_entries = []

    while len(no_relation_entries) < no_relation_count:
        starter = random.choice(relations_df['starter_ID'])
        receiver = random.choice(relations_df['receiver_ID'])

        if (starter, receiver) not in all_relations and (receiver, starter) not in all_relations:
            no_relation_entries.append([starter, receiver, 'no_relation', 'no_relation', 'no_relation', 'no_relation'])
            all_relations.add((starter, receiver))
            all_relations.add((receiver, starter))

    no_relation_df = pd.DataFrame(no_relation_entries, columns=relations_df.columns)
    return pd.concat([relations_df, no_relation_df]).sample(frac=1).reset_index(drop=True)

# Load existing relations
train_relations = load_csv('relations_train.csv')
val_relations = load_csv('cleaned_relations_val.csv')
test_relations = load_csv('cleaned_relations_test.csv')

# Combine all relations to ensure no duplicates
all_relations = set()
for df in [train_relations, val_relations, test_relations]:
    for _, row in df.iterrows():
        all_relations.add((row['starter_ID'], row['receiver_ID']))
        all_relations.add((row['receiver_ID'], row['starter_ID']))

# Generate 'no_relation' entries
train_final = generate_no_relation_entries(train_relations, all_relations)
val_final = generate_no_relation_entries(val_relations, all_relations)
test_final = generate_no_relation_entries(test_relations, all_relations)

# Save the final datasets
save_csv(train_final, 'relations_train_final.csv')
save_csv(val_final, 'relations_val_final.csv')
save_csv(test_final, 'relations_test_final.csv')

In [6]:
import csv
from collections import Counter

def count_unique_subtypes_proportions(csv_file):
    subtypes = Counter()
    total_count = 0
    with open(csv_file, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            subtypes[row['subtype_name']] += 1
            total_count += 1
    
    proportions = {subtype: count / total_count for subtype, count in subtypes.items()}
    return subtypes, proportions, total_count

# Paths to the CSV files
csv_files = ['relations_train_final.csv', 'cleaned_relations_val_final.csv', 'cleaned_relations_test_final.csv']

# Count and print the number and proportion of each unique subtype_name in each file
for csv_file in csv_files:
    subtypes_count, proportions, total_count = count_unique_subtypes_proportions(csv_file)
    print(f"File: {csv_file} (Total Relations: {total_count})")
    for subtype, count in subtypes_count.items():
        print(f"  Subtype: {subtype}, Count: {count}, Proportion: {proportions[subtype]:.2f}")
    print("\n")


File: relations_train_final.csv (Total Relations: 101786)
  Subtype: compound, Count: 19259, Proportion: 0.19
  Subtype: binding/association, Count: 5770, Proportion: 0.06
  Subtype: activation, Count: 27615, Proportion: 0.27
  Subtype: phosphorylation, Count: 5023, Proportion: 0.05
  Subtype: no_relation, Count: 34669, Proportion: 0.34
  Subtype: inhibition, Count: 6283, Proportion: 0.06
  Subtype: expression, Count: 3163, Proportion: 0.03
  Subtype: state change, Count: 4, Proportion: 0.00


File: cleaned_relations_val_final.csv (Total Relations: 18129)
  Subtype: no_relation, Count: 6143, Proportion: 0.34
  Subtype: expression, Count: 729, Proportion: 0.04
  Subtype: activation, Count: 7284, Proportion: 0.40
  Subtype: phosphorylation, Count: 1042, Proportion: 0.06
  Subtype: binding/association, Count: 443, Proportion: 0.02
  Subtype: inhibition, Count: 1780, Proportion: 0.10
  Subtype: compound, Count: 708, Proportion: 0.04


File: cleaned_relations_test_final.csv (Total Relations

In [10]:
import pandas as pd

# Define the paths to the CSV files
train_csv_path = 'relations_train_final.csv'
val_csv_path = 'cleaned_relations_val_final.csv'
test_csv_path = 'cleaned_relations_test_final.csv'

# Define the weight mapping for each interaction type
weight_mapping = {
    'no_relation': 0,
    'activation': 0.8,
    'inhibition': 0.8,
    'compound': 0.5,
    'binding/association': 0.5,
    'phosphorylation': 0.8,
    'expression': 0.5
}

# Function to add weights to a CSV file based on the interaction type
def add_weights_to_csv(csv_path):
    df = pd.read_csv(csv_path)
    df['weight'] = df['subtype_name'].map(weight_mapping)
    df.to_csv(csv_path, index=False)

# Add weights to each of the CSV files
add_weights_to_csv(train_csv_path)
add_weights_to_csv(val_csv_path)
add_weights_to_csv(test_csv_path)

In [7]:
import pandas as pd

# File paths
train_csv = 'relations_train_final.csv'
val_csv = 'relations_val_final.csv'
test_csv = 'relations_test_final.csv'

# Load the data
train_df = pd.read_csv(train_csv)
val_df = pd.read_csv(val_csv)
test_df = pd.read_csv(test_csv)

# Count the occurrences of each relation subtype in each set
train_subtype_counts = train_df['subtype_name'].value_counts()
val_subtype_counts = val_df['subtype_name'].value_counts()
test_subtype_counts = test_df['subtype_name'].value_counts()

# Determine the subtypes with less than 100 occurrences in each set
subtypes_to_remove = set()

for subtype, count in train_subtype_counts.items():
    if count < 100:
        subtypes_to_remove.add(subtype)

for subtype, count in val_subtype_counts.items():
    if count < 100:
        subtypes_to_remove.add(subtype)

for subtype, count in test_subtype_counts.items():
    if count < 100:
        subtypes_to_remove.add(subtype)

# Remove the identified subtypes from each dataframe
train_df = train_df[~train_df['subtype_name'].isin(subtypes_to_remove)]
val_df = val_df[~val_df['subtype_name'].isin(subtypes_to_remove)]
test_df = test_df[~test_df['subtype_name'].isin(subtypes_to_remove)]

# Save the cleaned dataframes
train_df.to_csv(train_csv, index=False)
val_df.to_csv(val_csv, index=False)
test_df.to_csv(test_csv, index=False)

# Report the subtypes removed and the count removed from each set
removed_counts = {
    "train_removed": {subtype: train_subtype_counts.get(subtype, 0) for subtype in subtypes_to_remove},
    "val_removed": {subtype: val_subtype_counts.get(subtype, 0) for subtype in subtypes_to_remove},
    "test_removed": {subtype: test_subtype_counts.get(subtype, 0) for subtype in subtypes_to_remove}
}

removed_counts

{'train_removed': {'indirect': 46,
  'dissociation': 211,
  'methylation': 8,
  'repression': 102,
  'state change': 249,
  'dephosphorylation': 1439,
  'ubiquitination': 171},
 'val_removed': {'indirect': 0,
  'dissociation': 40,
  'methylation': 0,
  'repression': 8,
  'state change': 36,
  'dephosphorylation': 215,
  'ubiquitination': 6},
 'test_removed': {'indirect': 0,
  'dissociation': 40,
  'methylation': 0,
  'repression': 28,
  'state change': 86,
  'dephosphorylation': 89,
  'ubiquitination': 4}}

In [10]:
import csv

def extract_unique_ids(relations_csv):
    unique_ids = set()

    # Extract from pathway_relations.csv
    with open(relations_csv, mode='r') as infile:
        reader = csv.DictReader(infile)
        for row in reader:
            unique_ids.add(row['starter_ID'])
            unique_ids.add(row['receiver_ID'])

    return unique_ids

# Use the function to extract IDs
relations_csv = 'cleaned_relations_test_final.csv'
entity_ids = extract_unique_ids(relations_csv)

# Now you have all unique entity IDs in the `entity_ids` set
print(f"Extracted {len(entity_ids)} unique entity IDs.")

def extract_unique_id_prefixes(unique_ids):
    # Extract the first three characters of each ID and count their occurrences
    prefix_counts = {}
    for kegg_id in unique_ids:
        # Extract the first three characters
        prefix = kegg_id[:3]
        if prefix:
            prefix_counts[prefix] = prefix_counts.get(prefix, 0) + 1
    return prefix_counts

# Process the unique ID prefixes
prefix_counts = extract_unique_id_prefixes(entity_ids)

# Print the unique prefixes and their counts
for prefix, count in prefix_counts.items():
    print(f"Prefix: {prefix}, Count: {count}")

Extracted 1525 unique entity IDs.
Prefix: pat, Count: 38
Prefix: hsa, Count: 1434
Prefix: cpd, Count: 53
