#### Replace the zzz#### node IDs with the ID stored in each node's Description attribute

In [2]:
import xml.etree.ElementTree as ET

# Prefix -> URI map used by every ElementTree lookup below
namespaces = {'xgmml': 'http://www.cs.rpi.edu/XGMML'}

# Load the network file
tree = ET.parse('../data/ssn/xgmml_file/128912_128865_PF05147_hits_200_500aa_latentspace_AS52_full_ssn.xgmml')
root = tree.getroot()

# Serialize XGMML elements without a namespace prefix when the tree is written
ET.register_namespace('', namespaces['xgmml'])

# original node id -> id read from the nested Description attribute
id_mapping = {}

# Path to the inner Description <att> nested inside the outer Description <att>
description_path = ".//xgmml:att[@name='Description']/xgmml:att[@name='Description']"

# Rename every node that carries a Description value
for node in root.findall('xgmml:node', namespaces=namespaces):
    description = node.find(description_path, namespaces=namespaces)
    if description is None:
        # Nodes without a Description keep their current id and label
        continue
    renamed = description.get('value')
    id_mapping[node.get('id')] = renamed
    for attribute in ('id', 'label'):
        node.set(attribute, renamed)

# Point every edge at the renamed nodes and rebuild its id/label
for edge in root.findall('xgmml:edge', namespaces=namespaces):
    endpoints = []
    for end in ('source', 'target'):
        old_end = edge.get(end)
        # Fall back to the original id when the node was not renamed
        new_end = id_mapping.get(old_end, old_end)
        edge.set(end, new_end)
        endpoints.append(new_end)

    edge_name = f"{endpoints[0]},{endpoints[1]}"
    for attribute in ('id', 'label'):
        edge.set(attribute, edge_name)

# Write the modified network to a new file
tree.write('../data/ssn/xgmml_file/128912_128865_PF05147_hits_200_500aa_latentspace_AS52_full_ssn_id.xgmml', encoding='utf-8', xml_declaration=True)


#### Process SSN data

In [12]:
import pandas as pd

# Name of the column holding each node's cluster number
cluster_column = "Node Count Cluster Number"

# Read the node table exported as CSV
df = pd.read_csv("../data/ssn/csv_file/class_I-precursors_peptide_c_u_AS14 Full Network colorized default node.csv")

# Order the rows by ascending cluster number
sorted_df = df.sort_values(by=cluster_column)

# Temporary key: the first 14 characters of each 'name' value
sorted_df['name_sliced'] = sorted_df['name'].str[:14]

# Keep only the first row encountered for each 14-character key
is_first_for_key = ~sorted_df.duplicated(subset='name_sliced', keep='first')
sorted_df_unique = sorted_df[is_first_for_key]

# The temporary key is not wanted in the output
sorted_df_unique = sorted_df_unique.drop(columns=['name_sliced'])

# Number of remaining rows per cluster number
cluster_counts = sorted_df_unique[cluster_column].value_counts()

# Keep only the rows whose cluster still has n or more members
n = 10
large_clusters = cluster_counts.index[cluster_counts >= n]
sorted_df_unique_filtered = sorted_df_unique[sorted_df_unique[cluster_column].isin(large_clusters)]

# Save the sorted, de-duplicated and filtered table as an Excel file
sorted_df_unique_filtered.to_excel("../data/ssn/csv_file/class_I-precursors_peptide_c_u_AS14 Full Network colorized default node_sorted_unique.xlsx", index=False)


#### Color SSNs based on different classes

In [11]:
import xml.etree.ElementTree as ET
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Read the summarized file of the cyclases in the latent space and return a dictionary
def seq2class(file_path):
    """Map each 'Query' value of a CSV file to its 'Classification' value.

    The file is loaded with ``pandas.read_csv``. When a 'Query' value occurs
    on several rows, only the first of those rows is used.

    Args:
        file_path: Path (or any other input accepted by ``pandas.read_csv``)
            of the CSV file to read.

    Returns:
        A dict whose keys are the 'Query' values and whose values are the
        matching 'Classification' values.

    Raises:
        ValueError: If the 'Query' or 'Classification' column is missing.
    """
    table = pd.read_csv(file_path)

    # Both columns are needed to build the mapping
    required_columns = ('Query', 'Classification')
    if not all(column in table.columns for column in required_columns):
        raise ValueError("The input file must contain 'Query' and 'Classification' columns.")

    # One row per query: the first occurrence wins
    first_rows = table.drop_duplicates(subset='Query')

    return dict(zip(first_rows['Query'], first_rows['Classification']))

file_path = "/home/cdchiang/vae/RiPPs/data/RODEO_lanthipeptides/main_co_occur_all_classification_file.csv"
id2class = seq2class(file_path)

# Higher level clustering: translate each classification label into a cluster
# name. Any label that is not listed here becomes 'unclassified'.
class_to_cluster = {
    'class_I': 'cluster_1',
    'class_II': 'cluster_2',
    'class_III': 'cluster_3',
    'class_IV': 'cluster_4',
}

# The comprehension keeps its variables local, so the builtin id() is no
# longer shadowed by a leftover module-level loop variable.
id2bigclass = {
    seq_id: class_to_cluster.get(label, 'unclassified')
    for seq_id, label in id2class.items()
}

# Sequences already included in the previous paper
# Read fasta file
def parse_fasta(file_path):
    """Return one 'WP_'-prefixed id for every record of a FASTA file.

    Each returned id is 'WP_' followed by the second '_'-separated field of
    the record id, so a record id without any '_' raises IndexError.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        A list of id strings, in file order.
    """
    ids = []
    with open(file_path, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            second_field = record.id.split('_')[1]
            ids.append('WP_' + second_field)
    return ids

# Ids read from the per-class precursor peptide FASTA files
class_I_ids = parse_fasta('/home/cdchiang/vae/RiPPs/data/lanthipeptide/class_I-precursors_peptide.fasta')
class_II_ids = parse_fasta('/home/cdchiang/vae/RiPPs/data/lanthipeptide/class_II-precursors_peptide.fasta')
class_III_ids = parse_fasta('/home/cdchiang/vae/RiPPs/data/lanthipeptide/class_III-precursors_peptide.fasta')
class_IV_ids = parse_fasta('/home/cdchiang/vae/RiPPs/data/lanthipeptide/class_IV-precursors_peptide.fasta')

# Assign every id from these files to its cluster, replacing any existing
# entry. The files are applied in class I -> IV order, so an id present in
# several files ends up in the cluster of the last one.
for member_ids, cluster_name in (
    (class_I_ids, 'cluster_1'),
    (class_II_ids, 'cluster_2'),
    (class_III_ids, 'cluster_3'),
    (class_IV_ids, 'cluster_4'),
):
    id2bigclass.update(dict.fromkeys(member_ids, cluster_name))

# Maps a cluster name (the values stored in id2bigclass) to the hex fill
# color used for its nodes.
cluster_colors = {
    'cluster_1': '#482677',  # Example color mapping
    'cluster_2': '#2D708E',
    'cluster_3': '#29AF7F',
    'cluster_4': '#B8DE29',
    # 'unclassified' is the label id2bigclass assigns to anything outside
    # classes I-IV; listing it makes the gray color explicit instead of
    # relying on a lookup fallback.
    'unclassified': '#D3D3D3',
    'Unknown': '#D3D3D3',  # kept for backward compatibility
    # Add more clusters and their corresponding colors
}

# Look up the fill color of a node from the cluster it belongs to
def get_color_for_node(node_id):
    """Return the hex fill color for *node_id*.

    The id is looked up in ``id2bigclass`` and the resulting cluster name in
    ``cluster_colors``. Gray ('#D3D3D3') is returned when the id has no
    cluster or the cluster has no color.
    """
    gray = '#D3D3D3'  # Default color for unclassified enzymes
    try:
        cluster = id2bigclass[node_id]
    except KeyError:
        return gray
    return cluster_colors.get(cluster, gray)

# Prefix -> URI map used by every ElementTree lookup below
namespaces = {'xgmml': 'http://www.cs.rpi.edu/XGMML'}

# Load the network file
tree = ET.parse('/home/cdchiang/vae/RiPPs/data/ssn/xgmml_file/128911_128865_PF05147_hits_200_500aa_latentspace_AS46_full_ssn.xgmml')
root = tree.getroot()

# Serialize XGMML elements without a namespace prefix when the tree is written
ET.register_namespace('', namespaces['xgmml'])

# original node id -> id read from the nested Description attribute
id_mapping = {}

# Path to the inner Description <att> nested inside the outer Description <att>
description_path = ".//xgmml:att[@name='Description']/xgmml:att[@name='Description']"

# Fully qualified tag of the <att> elements created for missing color attributes
att_tag = '{http://www.cs.rpi.edu/XGMML}att'

# Rename and color every node that carries a Description value
for node in root.findall('xgmml:node', namespaces=namespaces):
    description = node.find(description_path, namespaces=namespaces)
    if description is None:
        # Nodes without a Description are left untouched
        continue

    renamed = description.get('value')
    id_mapping[node.get('id')] = renamed
    for attribute in ('id', 'label'):
        node.set(attribute, renamed)

    # The color lookup is keyed on the first 14 characters of the new id
    color = get_color_for_node(renamed[:14])

    # Set both color attributes, creating the <att> element when it is absent
    for att_name in ('node.fillColor', 'Node Count Fill Color'):
        color_att = node.find(f".//xgmml:att[@name='{att_name}']", namespaces=namespaces)
        if color_att is None:
            color_att = ET.SubElement(node, att_tag, {'name': att_name, 'type': 'string'})
        color_att.set('value', color)

# Point every edge at the renamed nodes and rebuild its id/label
for edge in root.findall('xgmml:edge', namespaces=namespaces):
    endpoints = []
    for end in ('source', 'target'):
        old_end = edge.get(end)
        # Fall back to the original id when the node was not renamed
        new_end = id_mapping.get(old_end, old_end)
        edge.set(end, new_end)
        endpoints.append(new_end)

    edge_name = f"{endpoints[0]},{endpoints[1]}"
    for attribute in ('id', 'label'):
        edge.set(attribute, edge_name)

# Write the modified network to a new file
tree.write('/home/cdchiang/vae/RiPPs/data/ssn/xgmml_file/128911_128865_PF05147_hits_200_500aa_latentspace_AS46_full_ssn_id_color.xgmml', encoding='utf-8', xml_declaration=True)
