In [1]:
from Bio import SeqIO
import pandas as pd

def parse_fasta(file_path, threshold=100):
    """Split the records of a FASTA file into short and long sequences.

    Parameters
    ----------
    file_path : str or path-like
        FASTA file handed to ``SeqIO.parse``.
    threshold : int, optional
        Records with fewer than ``threshold`` residues are classified as
        short; all others (including exactly ``threshold``) as long.
        Defaults to 100.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(short, long)`` frames, each with a single ``"Sequence ID"``
        column, listing the IDs in the order they appear in the file.
    """
    short_seqs = []
    long_seqs = []

    for record in SeqIO.parse(file_path, "fasta"):
        # Pipe-delimited IDs (e.g. UniProt-style "db|ACCESSION|NAME") carry
        # the accession in the second field. Headers without a pipe would
        # make [1] raise IndexError, so fall back to the full record id.
        id_fields = record.id.split('|')
        sequence_id = id_fields[1] if len(id_fields) > 1 else record.id

        bucket = short_seqs if len(record.seq) < threshold else long_seqs
        bucket.append(sequence_id)

    return (
        pd.DataFrame(short_seqs, columns=["Sequence ID"]),
        pd.DataFrame(long_seqs, columns=["Sequence ID"]),
    )

# Split the proteome into the IDs of short (< 100 residues) and long sequences.
df_short, df_long = parse_fasta("ruddii_proteome.fasta")




In [2]:
# Show the IDs of the sequences classified as short.
df_short

Unnamed: 0,Sequence ID
0,Q05FI8
1,Q05FJ5
2,Q05FK8
3,Q05FM5
4,Q05FT8
5,Q05FW4
6,Q05FY6
7,Q05FF9
8,Q05FH3
9,Q05FI4


In [3]:

# Persist both ID tables to disk.
# NOTE(review): to_csv writes CSV text (a "Sequence ID" header row followed by
# one ID per line), so these files are not FASTA despite the .fasta extension.
# Confirm that whatever reads them expects this.
for id_table, out_path in (
    (df_short, "short_sequences.fasta"),
    (df_long, "long_sequences.fasta"),
):
    id_table.to_csv(out_path, index=False)

In [4]:
pip install bio

Collecting bio
  Downloading bio-1.6.2-py3-none-any.whl (278 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.6/278.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting gprofiler-official
  Downloading gprofiler_official-1.0.0-py3-none-any.whl (9.3 kB)
Collecting mygene
  Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)
Collecting biopython>=1.80
  Downloading biopython-1.84-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting biothings-client>=0.2.6
  Downloading biothings_client-0.3.1-py2.py3-none-any.whl (29 kB)
Installing collected packages: biopython, gprofiler-official, biothings-client, mygene, bio
Successfully installed bio-1.6.2 biopython-1.84 biothings-client-0.3.1 gprofiler-official-1.0.0 mygene-3.2.2
Note: you may need to restart the kernel to use updated pac

In [28]:
# Convert the similarity-score matrix into a distance-like matrix using
#     distance = 1 - score / SCORE_SCALE
# force the diagonal (each protein against itself) to 0, and write the result
# next to the input with a "_formatted" suffix.
#
# Input layout, as read below: the first line is a header (kept verbatim as
# `protein_count`), then one whitespace-separated row per protein consisting
# of the protein name followed by its scores.
import pandas as pd

filepath = 'short_similarity_matrix_score.txt'

# NOTE(review): dividing by 1000 assumes raw scores are on a 0-1000 scale;
# confirm against the tool that produced the matrix.
SCORE_SCALE = 1000

# The header line is not part of the matrix; keep it for the output file.
with open(filepath, 'r') as file:
    protein_count = file.readline().strip()

# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
df = pd.read_csv(filepath, sep=r'\s+', header=None, skiprows=1)
df.columns = ['Protein'] + [f'Score_{i}' for i in range(1, len(df.columns))]

# Convert every score column in one vectorised step. Building a new float
# frame (instead of assigning column-by-column in place) also works when the
# raw scores were parsed as integers.
score_columns = df.columns[1:]
distances = 1 - df[score_columns].astype(float) / SCORE_SCALE

# Row k holds protein k and column Score_{k+1} is that same protein, so
# (k, k) in the score block is the self-comparison. Bounded by the smaller
# dimension so a non-square input does not raise IndexError.
for k in range(min(len(distances), len(score_columns))):
    distances.iat[k, k] = 0.0

df[score_columns] = distances

formatted_filepath = filepath.replace('.txt', '_formatted.txt')
with open(formatted_filepath, 'w') as f:
    f.write(f"{protein_count}\n")

    # One line per protein: name left-justified to 13 characters, then the
    # converted values with 11 decimal places, separated by single spaces.
    for protein, *scores in df.itertuples(index=False, name=None):
        formatted_scores = ' '.join(f"{score:.11f}" for score in scores)
        f.write(f"{protein:<13} {formatted_scores}\n")




In [29]:
# Spot-check one diagonal cell (row 1, column Score_2), which the conversion sets to 0.
df.iloc[1,2]

0.0

In [30]:
# Display the converted score table.
df

Unnamed: 0,Protein,Score_1,Score_2,Score_3,Score_4,Score_5,Score_6,Score_7,Score_8,Score_9,...,Score_24,Score_25,Score_26,Score_27,Score_28,Score_29,Score_30,Score_31,Score_32,Score_33
0,Q05FI8,0.0,0.95715,0.94917,0.94624,0.95981,0.95254,0.97136,0.95717,0.96965,...,0.95514,0.96208,0.976,0.94812,0.95523,0.95646,0.95416,0.97601,0.95468,0.97601
1,Q05FJ5,0.95715,0.0,0.97704,0.97605,0.97642,0.95067,0.90023,0.97484,0.8961,...,0.96856,0.9194,0.91815,0.93009,0.95305,0.94266,0.97213,0.91674,0.977,0.91497
2,Q05FK8,0.94917,0.97704,0.0,0.95565,0.94146,0.95389,0.98276,0.95871,0.97854,...,0.95751,0.97975,0.98818,0.95472,0.9594,0.9554,0.95427,1.0,0.94522,1.0
3,Q05FM5,0.94624,0.97605,0.95565,0.0,0.94798,0.94508,0.98207,0.94499,0.97975,...,0.94901,0.98008,0.98915,0.95632,0.94601,0.96092,0.95374,0.98818,0.94441,0.98901
4,Q05FT8,0.95981,0.97642,0.94146,0.94798,0.0,0.96053,0.97971,0.93806,0.97734,...,0.95623,0.97825,0.98364,0.96095,0.95705,0.96341,0.92303,0.98688,0.93277,0.98743
5,Q05FW4,0.95254,0.95067,0.95389,0.94508,0.96053,0.0,0.95255,0.95658,0.95068,...,0.95585,0.95266,0.94916,0.94109,0.94774,0.95373,0.95406,0.95378,0.94913,0.9539
6,Q05FY6,0.97136,0.90023,0.98276,0.98207,0.97971,0.95255,0.0,0.98208,0.83014,...,0.96535,0.85866,0.88751,0.92329,0.95296,0.94856,0.97602,0.88731,0.983,0.85621
7,Q05FF9,0.95717,0.97484,0.95871,0.94499,0.93806,0.95658,0.98208,0.0,0.97772,...,0.9566,0.97984,0.98388,0.95642,0.95543,0.96103,0.90169,0.98898,0.95383,0.98942
8,Q05FH3,0.96965,0.8961,0.97854,0.97975,0.97734,0.95068,0.83014,0.97772,0.0,...,0.96785,0.8794,0.90406,0.91693,0.95278,0.93609,0.97602,0.90406,0.97907,0.8622
9,Q05FI4,0.95764,0.94355,0.94497,0.95823,0.94868,0.9532,0.94325,0.95783,0.93596,...,0.94041,0.94554,0.95219,0.90319,0.91888,0.92237,0.95673,0.95215,0.95265,0.95218
