In [48]:
import pandas as pd

# 读取CSV文件
uniprot_to_shortname_df = pd.read_csv('CARSONELLA LIST.csv', header=None, names=['Uniprot_ID', 'Short_Name', 'Full_Name', 'Family','Abbr'])

# 显示DataFrame内容
print(uniprot_to_shortname_df)


    Uniprot_ID Short_Name                 Full_Name           Family  Abbr
0       Q05FT9       CH60         Chaperonin GroEL         Chaperone  CHAP
1       Q05FT8     CHAP10        10 kDa chaperonin         Chaperone  CHAP
2       Q05FS9   ChapGRPE   Chaperone protein GrpE         Chaperone  CHAP
3       Q05FS8       DNAK   Chaperone protein DnaK         Chaperone  CHAP
4       Q05FF9        CSP       Cold shock protein        Cold shock   CSH
..         ...        ...                       ...              ...   ...
177     Q05FW6       Unch  Uncharacterized protein   Uncharacterized   UNC
178     Q05FX4       Unch  Uncharacterized protein   Uncharacterized   UNC
179     Q05FX6       Unch  Uncharacterized protein   Uncharacterized   UNC
180     Q05FY0       Unch  Uncharacterized protein   Uncharacterized   UNC
181     Q05FY4       Unch  Uncharacterized protein   Uncharacterized   UNC

[182 rows x 5 columns]


In [3]:
import pandas as pd

def read_uniprot_mapping(csv_file):
    return pd.read_csv(csv_file, header=None, names=['Uniprot_ID', 'Short_Name','Full_name','Family', 'Family_Abbr'])
         


def replace_names_in_tree(tree_file, mapping_df):

    with open(tree_file, 'r') as file:
        tree_content = file.read()
    
    mapping_dict = {}
    for index, row in mapping_df.iterrows():
        if row['Short_Name'] == 'Unch':

            mapping_dict[row['Uniprot_ID']] = row['Uniprot_ID']
        else:

            mapping_dict[row['Uniprot_ID']] = f"{row['Family_Abbr']}|{row['Short_Name']}"
    print(mapping_dict)
    

    for uniprot_id, name in mapping_dict.items():
        tree_content = tree_content.replace(uniprot_id, name)
    
    return tree_content

def save_tree(new_tree_content, output_file):
    with open(output_file, 'w') as file:
        file.write(new_tree_content)

def main():
    mapping_df = read_uniprot_mapping('CARSONELLA LIST.csv')
    print(mapping_df)
    new_tree_content = replace_names_in_tree('outtree', mapping_df)
    save_tree(new_tree_content, 'CR_FATCAT_consensus_updated_tree.txt')

if __name__ == "__main__":
    main()


    Uniprot_ID Short_Name                 Full_name           Family  \
0       Q05FT9       CH60         Chaperonin GroEL         Chaperone   
1       Q05FT8     CHAP10        10 kDa chaperonin         Chaperone   
2       Q05FS9   ChapGRPE   Chaperone protein GrpE         Chaperone   
3       Q05FS8       DNAK   Chaperone protein DnaK         Chaperone   
4       Q05FF9        CSP       Cold shock protein        Cold shock   
..         ...        ...                       ...              ...   
177     Q05FW6       Unch  Uncharacterized protein   Uncharacterized   
178     Q05FX4       Unch  Uncharacterized protein   Uncharacterized   
179     Q05FX6       Unch  Uncharacterized protein   Uncharacterized   
180     Q05FY0       Unch  Uncharacterized protein   Uncharacterized   
181     Q05FY4       Unch  Uncharacterized protein   Uncharacterized   

    Family_Abbr  
0          CHAP  
1          CHAP  
2          CHAP  
3          CHAP  
4           CSH  
..          ...  
177      

In [33]:
import re
import numpy as np
import pandas as pd



def parse_data(file_path):
    results = []
    with open(file_path, 'r') as file:
        current_result = {}
        for line in file:
            if "Align" in line:
                match = re.search(r"Align (\S+)\.pdb \d+ with (\S+)\.pdb \d+", line)
                if match:
                    current_result['name1'] = match.group(1)
                    current_result['name2'] = match.group(2)
            elif "Twists" in line:
                match = re.search(r"Score (\d+\.\d+)", line)
                if match:
                    current_result['score'] = float(match.group(1))
            elif "P-value" in line:
                match = re.search(r"P-value (\S+)", line)
                if match:
                    current_result['p-value'] = float(match.group(1))
                    results.append(current_result)
                    current_result = {}
    return pd.DataFrame(results)


def replace_uniprot_with_shortname(df, uniprot_df):
    # Merge the data with the uniprot to short name DataFrame to get corresponding names and abbreviations
    uniprot_df['Combined_Name'] = uniprot_df['Short_Name']  + '|'+ uniprot_df['Abbreviation']
    mapping = pd.Series(uniprot_df['Combined_Name'].values, index=uniprot_df['Uniprot_ID']).to_dict()
    df['name1'] = df['name1'].map(mapping).fillna(df['name1'])
    df['name2'] = df['name2'].map(mapping).fillna(df['name2'])
    return df


def initialize_similarity_matrix(proteins):
    size = len(proteins)
    sim_matrix = np.zeros((size, size))
    return sim_matrix

def fill_similarity_matrix_score(sim_matrix, proteins, df):
    protein_index = {protein: idx for idx, protein in enumerate(proteins)}
    for _, row in df.iterrows():
        if row['name1'] in protein_index and row['name2'] in protein_index:
            idx1 = protein_index[row['name1']]
            idx2 = protein_index[row['name2']]
            transformed_score = 1 - (row['score'] / 1000) #the formula that make tree more seperate
            sim_matrix[idx1, idx2] = transformed_score
            sim_matrix[idx2, idx1] = transformed_score
            # sim_matrix[idx1, idx2] = row['score']
            # sim_matrix[idx2, idx1] = row['score']
    #np.fill_diagonal(sim_matrix, 0)  # Diagonal elements are 0 (self-similarity)
    np.fill_diagonal(sim_matrix, 0)
    #return sim_matrix
    return sim_matrix

def fill_similarity_matrix_p_value(sim_matrix, proteins, df):
    protein_index = {protein: idx for idx, protein in enumerate(proteins)}
    for _, row in df.iterrows():
        if row['name1'] in protein_index and row['name2'] in protein_index:
            idx1 = protein_index[row['name1']]
            idx2 = protein_index[row['name2']]
            #transformed_score = np.log10(row['p-value'] - 16 ) / 16
            #sim_matrix[idx1, idx2] = transformed_score
            #sim_matrix[idx2, idx1] = transformed_score
            sim_matrix[idx1, idx2] = row['p-value']
            sim_matrix[idx2, idx1] = row['p-value']
    np.fill_diagonal(sim_matrix, 0)  # Diagonal elements are 0 (self-similarity)
    return sim_matrix


def format_and_save_matrix(sim_matrix, proteins, file_path):
    """Formats the similarity matrix and saves it to a file."""
    num_proteins = len(proteins)
    with open(file_path, 'w') as file:
        # Write the number of proteins at the top of the file
        file.write(f"{num_proteins}\n")
        for i, protein in enumerate(proteins):
            # Format the protein name to be left-aligned with a fixed width for alignment
            print(protein)
            formatted_name = f"{protein:<5}"
            # Format the similarity scores, ensuring they have a fixed number of decimal places
            # and are evenly spaced for readability
            values = '  '.join(f"{value:.12f}" for value in sim_matrix[i])
            # Write each line as 'protein_name scores...'
            file.write(f"{formatted_name} {values}\n")


def main():
    # Example of calling the function
    file_path = "allpair3.aln"
    df = parse_data(file_path)
    uniprot_to_shortname_df = pd.read_csv('CARSONELLA LIST.csv', header=None, names=['Uniprot_ID', 'Short_Name', 'Full_Name', 'Family','Abbreviation'])
    # Add abbreviation column
    uniprot_to_shortname_df['Abbreviation'] = uniprot_to_shortname_df['Family'].map(family_to_abbreviation)
    #df = replace_uniprot_with_shortname(df, uniprot_to_shortname_df)
    print(df)
    # Get a list of unique protein names
    proteins = pd.concat([df['name1'], df['name2']]).unique()
    print(proteins)
    sim_matrix = initialize_similarity_matrix(proteins)
    sim_matrix_score = fill_similarity_matrix_score(sim_matrix, proteins, df)
    sim_matrix = initialize_similarity_matrix(proteins)
    sim_matrix_p_value = fill_similarity_matrix_p_value(sim_matrix, proteins, df)

    # Example of saving the matrix
    format_and_save_matrix(sim_matrix_score, proteins, "long_similarity_matrix_score.txt")
    format_and_save_matrix(sim_matrix_p_value, proteins, "long_similarity_matrix_p_value.txt")

if __name__ == "__main__":
    main()



        name1   name2    score   p-value
0      Q05FT9  Q05FT9  1584.00  0.000000
1      Q05FT9  Q05FH9   389.15  0.338000
2      Q05FT9  Q05FK5   169.34  0.484000
3      Q05FT9  Q05FL5   262.64  0.141000
4      Q05FT9  Q05FN2   273.61  0.116000
...       ...     ...      ...       ...
33099  Q05FY7  Q05FY0    22.07  0.987000
33100  Q05FY7  Q05FY2   337.03  0.000238
33101  Q05FY7  Q05FY4   155.39  0.000048
33102  Q05FY7  Q05FY5   316.00  0.000710
33103  Q05FY7  Q05FY7   672.00  0.000000

[33104 rows x 4 columns]
['Q05FT9' 'Q05FH9' 'Q05FK5' 'Q05FL5' 'Q05FN2' 'Q05FN6' 'Q05FQ7' 'Q05FR3'
 'Q05FR8' 'Q05FX8' 'Q05FY1' 'Q05FY3' 'Q05FY9' 'Q05FG1' 'Q05FG5' 'Q05FH1'
 'Q05FH2' 'Q05FH4' 'Q05FH8' 'Q05FI0' 'Q05FI1' 'Q05FI2' 'Q05FI3' 'Q05FI6'
 'Q05FI7' 'Q05FI8' 'Q05FI9' 'Q05FJ0' 'Q05FJ1' 'Q05FJ3' 'Q05FJ5' 'Q05FJ6'
 'Q05FJ9' 'Q05FK2' 'Q05FK3' 'Q05FK4' 'Q05FK8' 'Q05FL1' 'Q05FL2' 'Q05FL3'
 'Q05FL9' 'Q05FM0' 'Q05FM2' 'Q05FM5' 'Q05FM6' 'Q05FM7' 'Q05FM8' 'Q05FM9'
 'Q05FN0' 'Q05FN1' 'Q05FN3' 'Q05FP0' 'Q05FP2

In [35]:
import re
import numpy as np
import pandas as pd



def parse_data(file_path):
    results = []
    with open(file_path, 'r') as file:
        current_result = {}
        for line in file:
            if "Align" in line:
                match = re.search(r"Align (\S+)\.pdb \d+ with (\S+)\.pdb \d+", line)
                if match:
                    current_result['name1'] = match.group(1)
                    current_result['name2'] = match.group(2)
            elif "Twists" in line:
                match = re.search(r"Score (\d+\.\d+)", line)
                if match:
                    current_result['score'] = float(match.group(1))
            elif "P-value" in line:
                match = re.search(r"P-value (\S+)", line)
                if match:
                    current_result['p-value'] = float(match.group(1))
                    results.append(current_result)
                    current_result = {}
    return pd.DataFrame(results)


def replace_uniprot_with_shortname(proteins, uniprot_df):
    """Maps UniProt IDs to combined names using a DataFrame."""
    uniprot_df['Combined_Name'] = uniprot_df['Short_Name'] + '|' + uniprot_df['Abbreviation']
    mapping = pd.Series(uniprot_df['Combined_Name'].values, index=uniprot_df['Uniprot_ID']).to_dict()
    return [mapping.get(protein, protein) for protein in proteins]


def initialize_similarity_matrix(proteins):
    size = len(proteins)
    sim_matrix = np.zeros((size, size))
    return sim_matrix

def fill_similarity_matrix_score(sim_matrix, proteins, df):
    protein_index = {protein: idx for idx, protein in enumerate(proteins)}
    for _, row in df.iterrows():
        if row['name1'] in protein_index and row['name2'] in protein_index:
            idx1 = protein_index[row['name1']]
            idx2 = protein_index[row['name2']]
            transformed_score = 1 - (row['score'] / 1000) #the formula that make tree more seperate
            sim_matrix[idx1, idx2] = transformed_score
            sim_matrix[idx2, idx1] = transformed_score
            # sim_matrix[idx1, idx2] = row['score']
            # sim_matrix[idx2, idx1] = row['score']
    #np.fill_diagonal(sim_matrix, 0)  # Diagonal elements are 0 (self-similarity)
    np.fill_diagonal(sim_matrix, 0)
    #return sim_matrix
    return sim_matrix

def fill_similarity_matrix_p_value(sim_matrix, proteins, df):
    protein_index = {protein: idx for idx, protein in enumerate(proteins)}
    for _, row in df.iterrows():
        if row['name1'] in protein_index and row['name2'] in protein_index:
            idx1 = protein_index[row['name1']]
            idx2 = protein_index[row['name2']]
            #transformed_score = np.log10(row['p-value'] - 16 ) / 16
            #sim_matrix[idx1, idx2] = transformed_score
            #sim_matrix[idx2, idx1] = transformed_score
            sim_matrix[idx1, idx2] = row['p-value']
            sim_matrix[idx2, idx1] = row['p-value']
    np.fill_diagonal(sim_matrix, 0)  # Diagonal elements are 0 (self-similarity)
    return sim_matrix


def format_and_save_matrix(sim_matrix, proteins, file_path):
    """Formats the similarity matrix and saves it to a file."""
    num_proteins = len(proteins)
    with open(file_path, 'w') as file:
        # Write the number of proteins at the top of the file
        file.write(f"{num_proteins}\n")
        for i, protein in enumerate(proteins):
            # Format the protein name to be left-aligned with a fixed width for alignment
            print(protein)
            formatted_name = f"{protein:<5}"
            # Format the similarity scores, ensuring they have a fixed number of decimal places
            # and are evenly spaced for readability
            values = '  '.join(f"{value:.12f}" for value in sim_matrix[i])
            # Write each line as 'protein_name scores...'
            file.write(f"{formatted_name} {values}\n")


def main():
    # Example of calling the function
    file_path = "allpair3.aln"
    df = parse_data(file_path)
    uniprot_to_shortname_df = pd.read_csv('CARSONELLA LIST.csv', header=None, names=['Uniprot_ID', 'Short_Name', 'Full_Name', 'Family','Abbreviation'])
    # Add abbreviation column
    uniprot_to_shortname_df['Abbreviation'] = uniprot_to_shortname_df['Family'].map(family_to_abbreviation)
    #df = replace_uniprot_with_shortname(df, uniprot_to_shortname_df)
    print(df)
    # Get a list of unique protein names
    proteins = pd.concat([df['name1'], df['name2']]).unique()
    # Replace UniProt IDs with combined names
    proteins = replace_uniprot_with_shortname(proteins, uniprot_to_shortname_df)
    print(proteins)
    sim_matrix = initialize_similarity_matrix(proteins)
    sim_matrix_score = fill_similarity_matrix_score(sim_matrix, proteins, df)
    sim_matrix = initialize_similarity_matrix(proteins)
    sim_matrix_p_value = fill_similarity_matrix_p_value(sim_matrix, proteins, df)

    # Example of saving the matrix
    format_and_save_matrix(sim_matrix_score, proteins, "long_similarity_matrix_score.txt")
    format_and_save_matrix(sim_matrix_p_value, proteins, "long_similarity_matrix_p_value.txt")

if __name__ == "__main__":
    main()



        name1   name2    score   p-value
0      Q05FT9  Q05FT9  1584.00  0.000000
1      Q05FT9  Q05FH9   389.15  0.338000
2      Q05FT9  Q05FK5   169.34  0.484000
3      Q05FT9  Q05FL5   262.64  0.141000
4      Q05FT9  Q05FN2   273.61  0.116000
...       ...     ...      ...       ...
33099  Q05FY7  Q05FY0    22.07  0.987000
33100  Q05FY7  Q05FY2   337.03  0.000238
33101  Q05FY7  Q05FY4   155.39  0.000048
33102  Q05FY7  Q05FY5   316.00  0.000710
33103  Q05FY7  Q05FY7   672.00  0.000000

[33104 rows x 4 columns]
['CH60|CHAP', 'RPOC|RNAS', 'RPOA|RNAS', 'KAREDUCT|METAB', 'ASPKINASE|METAB', 'ASSY|METAB', 'ASPSAdehydrog|METAB', 'PUTA|METAB', 'CLPP|METAB', 'DHQdeydrat|METAB', 'ATPsynthB|METAB', 'ATPsynthA|METAB', 'MNME|tRNAM', 'GLYtRNAlig|tRNAL', 'PEPdeform|METAB', 'CYTOoxI|METAB', 'CYTCox|METAB', nan, 'RPOB|RNAS', nan, nan, 'EFG|TRANSL', 'EFTU|TRANSL', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 'ILEtRNAlig|tRNAL', 'PSUsynthD|RIB', 'ACETOL|METAB', 'HomoSerdehyd|ME

In [39]:
import re
import numpy as np
import pandas as pd

def parse_data(file_path):
    results = []
    with open(file_path, 'r') as file:
        current_result = {}
        for line in file:
            if "Align" in line:
                match = re.search(r"Align (\S+)\.pdb \d+ with (\S+)\.pdb \d+", line)
                if match:
                    current_result['name1'] = match.group(1)
                    current_result['name2'] = match.group(2)
            elif "Twists" in line:
                match = re.search(r"Score (\d+\.\d+)", line)
                if match:
                    current_result['score'] = float(match.group(1))
            elif "P-value" in line:
                match = re.search(r"P-value (\S+)", line)
                if match:
                    current_result['p-value'] = float(match.group(1))
                    results.append(current_result)
                    current_result = {}
    return pd.DataFrame(results)



def initialize_similarity_matrix(proteins):
    size = len(proteins)
    sim_matrix = np.zeros((size, size))
    return sim_matrix

def fill_similarity_matrix_score(sim_matrix, proteins, df):
    protein_index = {protein: idx for idx, protein in enumerate(proteins)}
    for _, row in df.iterrows():
        if row['name1'] in protein_index and row['name2'] in protein_index:
            idx1 = protein_index[row['name1']]
            idx2 = protein_index[row['name2']]
            #transformed_score = 1 - (row['score'] / 1000) #the formula that make tree more seperate
            #sim_matrix[idx1, idx2] = transformed_score
            #sim_matrix[idx2, idx1] = transformed_score
            sim_matrix[idx1, idx2] = row['score']
            sim_matrix[idx2, idx1] = row['score']
    np.fill_diagonal(sim_matrix, 0)  # Diagonal elements are 0 (self-similarity)
    return sim_matrix

def fill_similarity_matrix_p_value(sim_matrix, proteins, df):
    protein_index = {protein: idx for idx, protein in enumerate(proteins)}
    for _, row in df.iterrows():
        if row['name1'] in protein_index and row['name2'] in protein_index:
            idx1 = protein_index[row['name1']]
            idx2 = protein_index[row['name2']]
            #transformed_score = (np.log10(row['p-value'])  - 16 )/ 16
            #sim_matrix[idx1, idx2] = transformed_score
            #sim_matrix[idx2, idx1] = transformed_score
            sim_matrix[idx1, idx2] = row['p-value']
            sim_matrix[idx2, idx1] = row['p-value']
    np.fill_diagonal(sim_matrix, 0)  # Diagonal elements are 0 (self-similarity)
    return sim_matrix


def format_and_save_matrix(sim_matrix, proteins, file_path):
    """Formats the similarity matrix and saves it to a file."""
    num_proteins = len(proteins)
    with open(file_path, 'w') as file:
        # Write the number of proteins at the top of the file
        file.write(f"{num_proteins}\n")
        for i, protein in enumerate(proteins):
            # Format the protein name to be left-aligned with a fixed width for alignment
            formatted_name = f"{protein:<11}"
            # Format the similarity scores, ensuring they have a fixed number of decimal places
            # and are evenly spaced for readability
            values = '  '.join(f"{value:.12f}" for value in sim_matrix[i])
            # Write each line as 'protein_name scores...'
            file.write(f"{formatted_name} {values}\n")


def main():
    # Example of calling the function
    file_path = "allpair3.aln"
    df = parse_data(file_path)
    # Get a list of unique protein names
    proteins = pd.concat([df['name1'], df['name2']]).unique()
    sim_matrix = initialize_similarity_matrix(proteins)
    sim_matrix_score = fill_similarity_matrix_score(sim_matrix, proteins, df)
    sim_matrix = initialize_similarity_matrix(proteins)
    sim_matrix_p_value = fill_similarity_matrix_p_value(sim_matrix, proteins, df)

    # Example of saving the matrix
    format_and_save_matrix(sim_matrix_score, proteins, "similarity_matrix_score.txt")
    format_and_save_matrix(sim_matrix_p_value, proteins, "similarity_matrix_p_value.txt")

if __name__ == "__main__":
    main()



    Uniprot_ID Short_Name                 Full_name           Family  \
0       Q05FT9       CH60         Chaperonin GroEL         Chaperone   
1       Q05FT8     CHAP10        10 kDa chaperonin         Chaperone   
2       Q05FS9   ChapGRPE   Chaperone protein GrpE         Chaperone   
3       Q05FS8       DNAK   Chaperone protein DnaK         Chaperone   
4       Q05FF9        CSP       Cold shock protein        Cold shock   
..         ...        ...                       ...              ...   
177     Q05FW6       Unch  Uncharacterized protein   Uncharacterized   
178     Q05FX4       Unch  Uncharacterized protein   Uncharacterized   
179     Q05FX6       Unch  Uncharacterized protein   Uncharacterized   
180     Q05FY0       Unch  Uncharacterized protein   Uncharacterized   
181     Q05FY4       Unch  Uncharacterized protein   Uncharacterized   

    Family_Abbr  
0          CHAP  
1          CHAP  
2          CHAP  
3          CHAP  
4           CSH  
..          ...  
177      