In [1]:
# Importing the libraries
from functions import *

## 1. HMM: Data preparation
We use the XML file with the BLAST results as an input. These files contain the information necessary to reproduce the MSA BLAST alignment: the query and subject IDs and sequences, the sequences lengths and positions (start-end).

Here the work is performed on the one fasta file: `curated_uniprot.fasta_75`.

In [743]:
# Open XML file as a dataframe
input_file = '{}/databases/uniprot/curated_uniprot.fasta_75'.format(directory) # change the file name if necessary
df = blast_parser(input_file)
print(f"The number of instances: {len(df)}")
df.head(5)

The number of instances: 8656


Unnamed: 0,query_id,subject_id,query_len,hsp_len,query_seq,match_seq,subject_seq,query_start,query_end,subject_start,subject_end,identity,positive,gaps,eval,bit_score,count
0,Q16620,A0A2R9BM51,822,822,MSSWIRWHGPAMARLWGFCWLVVGFWRAAFACPTSCKCSASRIWCS...,MSSWIRWHGPAMARLWGFCWLVVGFWRAAFACPTSCKCSASRIWCS...,MSSWIRWHGPAMARLWGFCWLVVGFWRAAFACPTSCKCSASRIWCS...,1,822,1,822,822,822,0,0.0,4458.0,200
1,Q16620,A0A4X2LP40,822,824,MSSWIRWHGPAMARLWGFCWLVVGFWRAAFACPTSCKCSASRIWCS...,M SW + HGP MARL GFCWLV+ FWR + ACPTSC CS +RIWCS...,MLSWKKCHGPGMARLLGFCWLVLIFWRGSQACPTSCTCSTTRIWCS...,1,822,1,824,689,745,2,0.0,3749.0,200
2,Q16620,A0A4X2LDU8,822,822,MSSWIRWHGPAMARLWGFCWLVVGFWRAAFACPTSCKCSASRIWCS...,M SW + HGP MARL GFCWLV+ FWR + ACPTSC CS +RIWCS...,MLSWKKCHGPGMARLLGFCWLVLIFWRGSQACPTSCTCSTTRIWCS...,1,822,1,821,689,745,1,0.0,3756.0,200
3,Q16620,A0A6P5IKH1,822,824,MSSWIRWHGPAMARLWGFCWLVVGFWRAAFACPTSCKCSASRIWCS...,M SW + HGP MARL GFCWLV+ FWR + ACPTSC CS +RIWCS...,MLSWKKCHGPGMARLLGFCWLVLIFWRGSQACPTSCTCSTTRIWCS...,1,822,1,824,691,745,2,0.0,3764.0,200
4,Q16620,A0A7J7UQA2,822,741,TSCKCSASRIWCSDPSPGIVAFPRLEPNSVDPENITEIFIANQKRL...,+SCKCSASRIWCSDP PGI+AFPRLEPN++DPENITEI+IANQKRL...,SSCKCSASRIWCSDPIPGIMAFPRLEPNTIDPENITEIYIANQKRL...,34,774,2,739,710,724,3,0.0,3776.0,200


To extract the information regarding the disordered regions we use the `curated.mjson` file which contains the infromation from the DisProt database. As well as in the `curated_uniprot.fasta`, there are the list of Uniprot IDs. There are also start and end position of disordered regions. We will use this database to filter out only the positions containing the disordered regions.

In [744]:
# Create a dataframe with Disprot instances from curated.mjson database
data = list()

with open('curated.mjson', 'r') as file:
    for line in file:
        obj = json.loads(line)
        rows = json_parser(obj)
        data.extend(rows)

curated_disprot = pd.DataFrame(data)

# Calculate the length of disordered regions
curated_disprot['length'] = curated_disprot['end'] - curated_disprot['start'] + 1
curated_disprot = curated_disprot[(curated_disprot['feature'] == 'disorder') & (curated_disprot['source'] == 'disprot')]
curated_disprot['region'] = curated_disprot.apply(lambda row: f"{row['acc']}_{row['start']}-{row['end']}", axis=1)
curated_disprot.to_csv("curated_disprot.csv", index=False)

print(f"The number of instances in the Curated Disprot database: {len(curated_disprot)}")
curated_disprot.head()

The number of instances in the Curated Disprot database: 3151


Unnamed: 0,acc,evidence,feature,source,start,end,length,region
6927,P03265,curated,disorder,disprot,294,334,41,P03265_294-334
6928,P03265,curated,disorder,disprot,454,464,11,P03265_454-464
6929,P49913,curated,disorder,disprot,134,170,37,P49913_134-170
6930,P03045,curated,disorder,disprot,1,107,107,P03045_1-107
6931,P00004,curated,disorder,disprot,1,105,105,P00004_1-105


In [745]:
# Keep only disordered regions in the initial dataframe filtering with curated_disordered dataframe
"""
Using `acc` column from `curated_disprot`, 
select only the instances with the disordered regions 
from the `curated_uniprot.fasta`
"""
disordered = df[df['query_id'].isin(curated_disprot['acc'])]
disordered.to_csv("disordered_df.csv", index=False)

print(f"The number of rows with the disordered regions: {len(disordered)}")
disordered.head()

The number of rows with the disordered regions: 2887


Unnamed: 0,query_id,subject_id,query_len,hsp_len,query_seq,match_seq,subject_seq,query_start,query_end,subject_start,subject_end,identity,positive,gaps,eval,bit_score,count
200,Q9H832,A0A6J2FM24,354,356,MAESPTEEAATA--GAGAAGPGASSVAGVVGVSGSGGGFGPPFLPD...,MAESPTEEAATA GAGAAGPGAS V GVVGVSGSG FGPPFLPD...,MAESPTEEAATATAGAGAAGPGASGVTGVVGVSGSG--FGPPFLPD...,1,354,1,354,350,350,4,0.0,1851.0,200
201,Q9H832,A0A3Q7W6Y2,354,356,MAESPTEEAATA--GAGAAGPGASSVAGVVGVSGSGGGFGPPFLPD...,MAESPTEEAATA GAGA GPGAS VAGVVGVSGSG FGPPFLPD...,MAESPTEEAATATAGAGATGPGASGVAGVVGVSGSG--FGPPFLPD...,1,354,1,354,350,350,4,0.0,1851.0,200
202,Q9H832,A0A2U3VK69,354,356,MAESPTEEAATA--GAGAAGPGASSVAGVVGVSGSGGGFGPPFLPD...,MAESPTEEAATA GAGAAGPGAS V GVVGVSGSG FGPPFLPD...,MAESPTEEAATATAGAGAAGPGASGVTGVVGVSGSG--FGPPFLPD...,1,354,1,354,350,350,4,0.0,1851.0,200
203,Q9H832,A0A2Y9JVH5,354,358,MAESPTEEAATA----GAGAAGPGASSVAGVVGVSGSGGGFGPPFL...,MAESPTEEAATA GAGAAGPGAS VAGVVGVSGSG FGPPFL...,MAESPTEEAATATATAGAGAAGPGASGVAGVVGVSGSG--FGPPFL...,1,354,1,356,351,351,6,0.0,1854.0,200
204,Q9H832,A0A8C7ALE4,354,358,MAESPTEEAATA----GAGAAGPGASSVAGVVGVSGSGGGFGPPFL...,MAESPTEEAATA GAGAAGPGAS VAGVVGVSGSG FGPPFL...,MAESPTEEAATATATAGAGAAGPGASGVAGVVGVSGSG--FGPPFL...,1,354,1,356,351,351,6,0.0,1854.0,200


In [760]:
dis_regs = set()

for i, row in disordered.iterrows():
    dis_id = row[0]
    matching_row = curated_disprot[curated_disprot['acc'] == dis_id]
    if not matching_row.empty:
        region = matching_row['region']
        dis_regs.update(region)

print(dis_regs, len(dis_regs))

{'Q9H832_327-354', 'Q9CXY6_29-44', 'Q93KQ4_51-81', 'Q8WUG5_1-105', 'Q99967_220-269', 'Q5T4W7_108-120', 'Q9H0E2_1-53', 'Q9BYF1_769-805', 'Q9BYI3_149-253', 'Q9CXY6_347-390', 'Q86FP8_20-80', 'Q8K4J6_111-142', 'Q8IW19_399-420', 'Q84852_17-299', 'Q8R464_25-120', 'Q9H832_1-99', 'Q8K4J6_67-98', 'Q8K4J6_155-186', 'Q5VZK9_1005-1020', 'Q8IU57_270-299', 'Q8IW19_450-511'} 21


In [1847]:
# Define an array of disordered regions ids
disprot_ids = disordered["query_id"].unique()
# disprot_ids = dis_regs

# Dropdown list of Uniprot query IDs for disordered regions
output = widgets.Select(options=disprot_ids,
    rows=10,
    description='Uniprot ID: ',
    layout={'width': 'max-content'},
    disabled=False)

display(output)

Select(description='Uniprot ID: ', layout=Layout(width='max-content'), options=('Q9H832', 'Q8IW19', 'Q99967', …

In [2230]:
# Check if there are disordered regions for a particular query ID
id_dis = output.value
i = 1 # change to the necessary region
# store the data
%store id_dis i
selected_dis = disordered[disordered["query_id"].isin([id_dis])]
# selected_dis.head()

Stored 'id_dis' (str)
Stored 'i' (int)


In [2231]:
# Filter the curated_disprot dataframe based on selected ID
curated_query = curated_disprot[curated_disprot['acc'] == id_dis]
# for id_d in disprot_ids:
curated_query

Unnamed: 0,acc,evidence,feature,source,start,end,length,region
7821,Q86FP8,curated,disorder,disprot,20,80,61,Q86FP8_20-80


In [2232]:
# for id_d in disprot_ids:
#     curated_query = curated_disprot[curated_disprot['region'] == id_d]

## 2. hmmbuild
We build an HMM of each disordered region, using trimmed MSA as an input.

In [2233]:
# Set the paths to HMM and MSA files
hmm_file = f'{directory}/results/hmms/hmmbuild/{id_dis}_{i}.hmm'
align_file = f'{directory}/results/alignments/output_files/disordered/{id_dis}_{i}.fasta'

In [2234]:
# for id_dis in disprot_ids:
#     hmm_file = f'{directory}/results/hmms/hmmbuild/{id_dis}.hmm'
#     align_file = f'{directory}/results/alignments/output_files/disordered/{id_dis}_{i}.fasta'
#     print(hmm_file, align_file)

In [2235]:
# # Build HMM
# !hmmbuild {hmm_file} {align_file}

## 3. hmmsearch

After building the model, our objective is to assess if overlaps with the profiles in Reference Proteome 15% exist and to enrich the model by utilizing this database. We generate dataframes containing the most significant sequences, with a default E-value threshold of 0.01.

In [2236]:
# # Copy the HMM file to remote computer to perform HMM search on a cluster
# !scp {directory}/results/hmms/hmmbuild/{id_dis}_{i}.hmm {name}@{server}:~/{id_dis}_{i}.hmm # one region

In [2237]:
# # hmmsearch against Reference Proteome 15%
# !ssh {name}@{server} "/software/packages/hmmer/hmmer-3.3.2/usr/bin/hmmsearch {id_dis}_{i}.hmm /db/rp/rp-seqs-15.fasta.gz > hmmsearch_rp_15_{id_dis}_{i}.txt"

In [2238]:
# # hmmsearch against Reference Proteome 75%
# !ssh {name}@{server} "/software/packages/hmmer/hmmer-3.3.2/usr/bin/hmmsearch {id_dis}_{i}.hmm /db/rp/rp-seqs-75.fasta.gz > hmmsearch_rp_75_{id_dis}_{i}.txt"

In [2239]:
# # Copy results (RP 15%) to the local folder
# !scp {name}@{server}:~/hmmsearch_rp_15_{id_dis}_{i}.txt {directory}/results/hmms/hmmsearch/

In [2240]:
# # Copy results (RP 75%) to the local folder
# !scp {name}@{server}:~/hmmsearch_rp_75_{id_dis}_{i}.txt {directory}/results/hmms/hmmsearch/

In [2241]:
# # Remove the files from the remote folder
# !ssh {name}@{server} "rm ~/{id_dis}_{i}.hmm"
# !ssh {name}@{server} "rm ~/hmmsearch_rp_15_{id_dis}_{i}.txt"
# !ssh {name}@{server} "rm ~/hmmsearch_rp_75_{id_dis}_{i}.txt"

### 3.1 hmmsearch results preprocessing
### Reference Proteome 15%

In [2242]:
# 1. Make a dataframe with the statistics - the first table of the hmmsearch output
stats_rp_15 = process_hmmsearch_file(f"{directory}/results/hmms/hmmsearch/hmmsearch_rp_15_{id_dis}_{i}.txt")
# stats_rp_15.head()

In [2243]:
# 2. Create a dataframe with the extracted regions from HMM - from the rest of the file
hmm_rp_15 = f"{directory}/results/hmms/hmmsearch/hmmsearch_rp_15_{id_dis}_{i}.txt"
hmmsearch_regions_rp_15 = extract_table_from_output(hmm_rp_15)
# hmmsearch_regions_rp_15.head()

In [2244]:
# 3. Combine the results into a dataframe and save the file
hmmsearch_results_rp_15 = pd.merge(stats_rp_15, hmmsearch_regions_rp_15, left_on="Sequence", right_on="id", how="inner")
hmmsearch_results_rp_15 = hmmsearch_results_rp_15.drop(columns=["Description", "id"])
hmmsearch_results_rp_15.to_csv(f"{directory}/results/hmms/hmmsearch/stats/stats_rp_15_{id_dis}_{i}.csv", index=False)
print(f"Number of hmmsearch hits for the {i} disordered region of the {id_dis} protein (RP 15%):", len(hmmsearch_results_rp_15))
%store hmmsearch_results_rp_15
hmmsearch_results_rp_15.head()

Number of hmmsearch hits for the 1 disordered region of the Q86FP8 protein (RP 15%): 295
Stored 'hmmsearch_results_rp_15' (DataFrame)


Unnamed: 0,E-value,score,bias,E-value.1,score.1,bias.1,exp,N,Sequence,ali_from,ali_to,ali_length
0,2.4e-20,81.8,0.0,5.2e-20,80.7,0.0,1.6,1,A9TSC0,310,360,51
1,1.9e-18,75.8,0.0,3.9e-18,74.8,0.0,1.5,1,A0A384L621,284,337,54
2,1.9e-18,75.8,0.0,3.9e-18,74.8,0.0,1.5,1,Q9LIP9,284,337,54
3,2.1e-17,72.4,0.0,5.2e-17,71.1,0.0,1.6,1,A0A834T8N5,340,389,50
4,2.5e-17,72.1,0.0,5.7e-17,71.0,0.0,1.6,1,A0A6A6KBW9,271,323,53


In [2245]:
subject_ids_rp_15 = hmmsearch_results_rp_15.iloc[:, 8].unique()
print(f"The number of unique IDs for the {id_dis} MSA:", len(subject_ids_rp_15))

The number of unique IDs for the Q86FP8 MSA: 295


### Reference Proteome 75%

In [2246]:
# 1. Make a dataframe with the statistics - the first table of the hmmsearch output
stats_rp_75 = process_hmmsearch_file(f"{directory}/results/hmms/hmmsearch/hmmsearch_rp_75_{id_dis}_{i}.txt")
# stats_rp_75.head()

In [2247]:
# 2. Create a dataframe with the extracted regions from HMM - from the rest of the file
hmm_rp_75 = f"{directory}/results/hmms/hmmsearch/hmmsearch_rp_75_{id_dis}_{i}.txt"
hmmsearch_regions_rp_75 = extract_table_from_output(hmm_rp_75)
# hmmsearch_regions_rp_75.head()

In [2248]:
# 3. Combine the results into a dataframe and save the file
hmmsearch_results_rp_75 = pd.merge(stats_rp_75, hmmsearch_regions_rp_75, left_on="Sequence", right_on="id", how="inner")
hmmsearch_results_rp_75 = hmmsearch_results_rp_75.drop(columns=["Description", "id"])
hmmsearch_results_rp_75.to_csv(f"{directory}/results/hmms/hmmsearch/stats/stats_rp_75_{id_dis}_{i}.csv", index=False)
print(f"Number of hmmsearch hits for the {i} disordered region of the {id_dis} protein (RP 75%):", len(hmmsearch_results_rp_75))
%store hmmsearch_results_rp_75
hmmsearch_results_rp_75.head()

Number of hmmsearch hits for the 1 disordered region of the Q86FP8 protein (RP 75%): 1339
Stored 'hmmsearch_results_rp_75' (DataFrame)


Unnamed: 0,E-value,score,bias,E-value.1,score.1,bias.1,exp,N,Sequence,ali_from,ali_to,ali_length
0,7e-33,125.0,0.2,1.6e-13,63.0,0.0,2.5,2,A0A5J9WTQ4,286,340,55
1,2.7e-20,84.6,0.0,5.3e-20,83.7,0.0,1.5,1,A0A0D3D564,283,339,57
2,5.8e-20,83.6,0.0,1.2e-19,82.6,0.0,1.5,1,A0A397YI80,277,333,57
3,5.8e-20,83.6,0.0,1.2e-19,82.6,0.0,1.5,1,M4DEU0,277,333,57
4,7.7e-20,83.2,0.0,1.7e-19,82.1,0.0,1.5,1,M4CCJ6,283,339,57


In [2249]:
subject_ids_rp_75 = hmmsearch_results_rp_75.iloc[:, 8].unique()
print(f"The number of unique IDs for the {id_dis} MSA:", len(subject_ids_rp_75))

The number of unique IDs for the Q86FP8 MSA: 1339


## 4. Pfam
### 4.1 Pfam results preprocessing

In [2250]:
# # Copy the files with the statistics to the remote computer
# !scp {directory}/results/hmms/hmmsearch/stats/stats_rp_15_{id_dis}_{i}.csv {name}@{server}:~/stats/stats_rp_15/stats_rp_15_{id_dis}_{i}.csv
# !scp {directory}/results/hmms/hmmsearch/stats/stats_rp_75_{id_dis}_{i}.csv {name}@{server}:~/stats/stats_rp_75/stats_rp_75_{id_dis}_{i}.csv

In [2251]:
# # Check the overlaps with Interpro domains (RP 15%)
# !ssh {name}@{server} "/home/alina/protein2ipr.py /home/alina/stats/stats_rp_15 /home/alina/filtered.tsv.gz protein2ipr_rp_15.tsv"

In [2252]:
# # Check the overlaps with Interpro domains (RP 75%)
# !ssh {name}@{server} "/home/alina/protein2ipr.py /home/alina/stats/stats_rp_75 /home/alina/filtered.tsv.gz protein2ipr_rp_75.tsv"

In [2253]:
# # Copy the files to the local folder
# !scp {name}@{server}:~/protein2ipr_rp_15.tsv {directory}/results/pfam
# !scp {name}@{server}:~/protein2ipr_rp_75.tsv {directory}/results/pfam

In [2254]:
# Filter only entries with Pfam ID and intercepting regions with the curated_disprot instances (RP 15%)
filename = f'{directory}/results/pfam/protein2ipr_rp_15.tsv'
pfam_rp_15 = pfam_processing(filename)

pfam_rp_15 = pfam_rp_15[pfam_rp_15['uniprot_id'].isin(subject_ids_rp_15)]
print(f"The number of retrieved Pfam instances for the {id_dis}_{i} protein (RP 15%): {len(pfam_rp_15)}")
# print("The number of Uniprot instances not covered by Pfam (RP 15%):", len(hmmsearch_results_rp_15) - len(pfam_rp_15))
pfam_rp_15.head()

The number of retrieved Pfam instances for the Q86FP8_1 protein (RP 15%): 556


Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam
2,A0A016UWB3,PF01380,IPR001347,173,300,128
3,A0A016UWB3,PF01380,IPR001347,343,432,90
4,A0A016UWX5,PF01380,IPR001347,202,329,128
5,A0A016UWX5,PF01380,IPR001347,372,461,90
6,A0A059LFS6,PF01380,IPR001347,155,283,129


In [2255]:
# Filter only entries with Pfam ID and intercepting regions with the curated_disprot instances (RP 75%)
filename = f'{directory}/results/pfam/protein2ipr_rp_75.tsv'
pfam_rp_75 = pfam_processing(filename)

pfam_rp_75 = pfam_rp_75[pfam_rp_75['uniprot_id'].isin(subject_ids_rp_75)]
print(f"The number of retrieved Pfam instances for the {id_dis}_{i} protein (RP 75%): {len(pfam_rp_75)}")
# print("The number of Uniprot instances not covered by Pfam (RP 75%):", len(hmmsearch_results_rp_75) - len(pfam_rp_75))
pfam_rp_75.head()

The number of retrieved Pfam instances for the Q86FP8_1 protein (RP 75%): 2565


Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam
0,A0A010QN55,PF01380,IPR001347,381,508,128
1,A0A010QN55,PF01380,IPR001347,553,682,130
2,A0A016UWB3,PF01380,IPR001347,173,300,128
3,A0A016UWB3,PF01380,IPR001347,343,432,90
4,A0A016UWX5,PF01380,IPR001347,202,329,128


In [2256]:
# Merge with hmmsearch result - for one MSA (RP 15%)
pfam_rp_15 = pd.merge(pfam_rp_15, hmmsearch_results_rp_15[['Sequence', 'ali_from', 'ali_to', 'ali_length']], left_on='uniprot_id', right_on='Sequence', how='left')
pfam_rp_15 = pfam_rp_15.dropna(axis=0)
pfam_rp_15['ali_from'] = pfam_rp_15['ali_from'].astype(int)
pfam_rp_15['ali_to'] = pfam_rp_15['ali_to'].astype(int)
pfam_rp_15['ali_length'] = pfam_rp_15['ali_length'].astype(int)
pfam_rp_15 = pfam_rp_15.drop(columns='Sequence')
# pfam_rp_15.to_csv(f'results/pfam/pfam_overlap/pfam_overlap_{id_dis}.csv', index=False)
# print(f"The number of overlaps for {id_dis}_{i} (RP 15%):", len(pfam_rp_15))
pfam_rp_15.head()

Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,ali_from,ali_to,ali_length
0,A0A016UWB3,PF01380,IPR001347,173,300,128,104,155,52
1,A0A016UWB3,PF01380,IPR001347,343,432,90,104,155,52
2,A0A016UWX5,PF01380,IPR001347,202,329,128,133,184,52
3,A0A016UWX5,PF01380,IPR001347,372,461,90,133,184,52
4,A0A059LFS6,PF01380,IPR001347,155,283,129,92,140,49


In [2257]:
# Merge with hmmsearch result - for one MSA (RP 75%)
pfam_rp_75 = pd.merge(pfam_rp_75, hmmsearch_results_rp_75[['Sequence', 'ali_from', 'ali_to', 'ali_length']], left_on='uniprot_id', right_on='Sequence', how='left')
pfam_rp_75 = pfam_rp_75.dropna(axis=0)
pfam_rp_75['ali_from'] = pfam_rp_75['ali_from'].astype(int)
pfam_rp_75['ali_to'] = pfam_rp_75['ali_to'].astype(int)
pfam_rp_75['ali_length'] = pfam_rp_75['ali_length'].astype(int)
pfam_rp_75 = pfam_rp_75.drop(columns='Sequence')
# pfam_rp_75.to_csv(f'results/pfam/pfam_overlap/pfam_overlap_{id_dis}.csv', index=False)
# print(f"The number of overlaps for {id_dis}_{i} (RP 75%):", len(pfam_rp_75))
pfam_rp_75.head()

Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,ali_from,ali_to,ali_length
0,A0A010QN55,PF01380,IPR001347,381,508,128,330,362,33
1,A0A010QN55,PF01380,IPR001347,553,682,130,330,362,33
2,A0A016UWB3,PF01380,IPR001347,173,300,128,104,155,52
3,A0A016UWB3,PF01380,IPR001347,343,432,90,104,155,52
4,A0A016UWX5,PF01380,IPR001347,202,329,128,133,184,52


In [2258]:
# Calculate the overlaps percentage
def pfam_hmm_overlap(row_pfam):
    start_pfam = row_pfam['start_pfam']
    end_pfam = row_pfam['end_pfam']
    start_hmm = row_pfam['ali_from']
    end_hmm = row_pfam['ali_to']
    len_pfam = row_pfam['length_pfam']
    len_hmm = row_pfam['ali_length']

    overlap_len = min(end_pfam, end_hmm) - max(start_pfam, start_hmm) + 1
    overlap_pfam = overlap_len/len_pfam * 100
    overlap_hmm = overlap_len/len_hmm * 100
    max_length = len_pfam + len_hmm - overlap_len
    non_overlap_len = max_length - overlap_len + 1

    if overlap_len > 0:
        overlap_perc = (overlap_len / max_length) * 100
    else:
        overlap_len = 0
        overlap_pfam = 0
        overlap_hmm = 0
        overlap_perc = 0

    return overlap_len, overlap_pfam, overlap_hmm, non_overlap_len, overlap_perc

In [2259]:
# Add the overlaps to the dataframe (RP 15%)
# overlap_percentages = []
overlap_pfam_hmm = []

# for index_pfam, row_pfam in pfam_rp_15.iterrows():
#     row_disprot = curated_query.iloc[i-1]
#     overl_perc_pfam, overl_perc_hmm, overl_len_pfam, overl_len_hmm = calculate_overlap(row_pfam, row_disprot)
#     overl_perc_pfam = round(overl_perc_pfam, 2)
#     overl_perc_hmm = round(overl_perc_hmm, 2)
#     overlap_percentages.append((overl_perc_pfam, overl_perc_hmm, overl_len_pfam, overl_len_hmm))
    
for index_pfam, row_pfam in pfam_rp_15.iterrows():
    overl_len, overl_pfam, overl_hmm, non_overl_len, overl_perc = pfam_hmm_overlap(row_pfam)
    overl_pfam = round(overl_pfam, 2)
    overl_hmm = round(overl_hmm, 2)
    overl_perc = round(overl_perc, 2)
    overlap_pfam_hmm.append((overl_len, overl_pfam, overl_hmm, non_overl_len, overl_perc))

# Extract overlap_pfam and overlap_hmm from the list of tuples
# pfam_rp_15['overlap_pfam_%'], pfam_rp_15['overlap_hmm_%'], pfam_rp_15['overlap_pfam_len'], pfam_rp_15['overlap_hmm_len'] = zip(*overlap_percentages)
pfam_rp_15['overl_len'], pfam_rp_15['overl_pfam'], pfam_rp_15['overl_hmm'], pfam_rp_15['non_overl_len'], pfam_rp_15['overl_perc'] = zip(*overlap_pfam_hmm)
# print("DisProt start:", curated_query['start'].iloc[i-1], "\n"
#       "DisProt end:", curated_query['end'].iloc[i-1], "\n"
#       "DisProt length:", curated_query['length'].iloc[i-1])
pfam_rp_15.to_csv(f'results/pfam/pfam_overlap/rp_15/pfam_rp_15_{id_dis}_{i}.csv', index=False)
pfam_rp_15.head()

Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,ali_from,ali_to,ali_length,overl_len,overl_pfam,overl_hmm,non_overl_len,overl_perc
0,A0A016UWB3,PF01380,IPR001347,173,300,128,104,155,52,0,0.0,0.0,215,0.0
1,A0A016UWB3,PF01380,IPR001347,343,432,90,104,155,52,0,0.0,0.0,517,0.0
2,A0A016UWX5,PF01380,IPR001347,202,329,128,133,184,52,0,0.0,0.0,215,0.0
3,A0A016UWX5,PF01380,IPR001347,372,461,90,133,184,52,0,0.0,0.0,517,0.0
4,A0A059LFS6,PF01380,IPR001347,155,283,129,92,140,49,0,0.0,0.0,207,0.0


In [2260]:
# Add the overlaps to the dataframe (RP 75%)
# overlap_percentages = []
overlap_pfam_hmm = []

# for index_pfam, row_pfam in pfam_rp_75.iterrows():
#     row_disprot = curated_query.iloc[i-1]
#     overl_perc_pfam, overl_perc_hmm, overl_len_pfam, overl_len_hmm = calculate_overlap(row_pfam, row_disprot)
#     overl_perc_pfam = round(overl_perc_pfam, 2)
#     overl_perc_hmm = round(overl_perc_hmm, 2)
#     overlap_percentages.append((overl_perc_pfam, overl_perc_hmm, overl_len_pfam, overl_len_hmm))

for index_pfam, row_pfam in pfam_rp_75.iterrows():
    overl_len, overl_pfam, overl_hmm, non_overl_len, overl_perc = pfam_hmm_overlap(row_pfam)
    overl_pfam = round(overl_pfam, 2)
    overl_hmm = round(overl_hmm, 2)
    overl_perc = round(overl_perc, 2)
    overlap_pfam_hmm.append((overl_len, overl_pfam, overl_hmm, non_overl_len, overl_perc))

# Extract overlap_pfam and overlap_hmm from the list of tuples
# pfam_rp_75['overlap_pfam_%'], pfam_rp_75['overlap_hmm_%'], pfam_rp_75['overlap_pfam_len'], pfam_rp_75['overlap_hmm_len'] = zip(*overlap_percentages)
pfam_rp_75['overl_len'], pfam_rp_75['overl_pfam'], pfam_rp_75['overl_hmm'], pfam_rp_75['non_overl_len'], pfam_rp_75['overl_perc'] = zip(*overlap_pfam_hmm)
# print("DisProt start:", curated_query['start'].iloc[i-1], "\n"
#       "DisProt end:", curated_query['end'].iloc[i-1], "\n"
#       "DisProt length:", curated_query['length'].iloc[i-1])
pfam_rp_75.to_csv(f'results/pfam/pfam_overlap/rp_75/pfam_rp_75_{id_dis}_{i}.csv', index=False)
pfam_rp_75.head()

Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,ali_from,ali_to,ali_length,overl_len,overl_pfam,overl_hmm,non_overl_len,overl_perc
0,A0A010QN55,PF01380,IPR001347,381,508,128,330,362,33,0,0.0,0.0,198,0.0
1,A0A010QN55,PF01380,IPR001347,553,682,130,330,362,33,0,0.0,0.0,544,0.0
2,A0A016UWB3,PF01380,IPR001347,173,300,128,104,155,52,0,0.0,0.0,215,0.0
3,A0A016UWB3,PF01380,IPR001347,343,432,90,104,155,52,0,0.0,0.0,517,0.0
4,A0A016UWX5,PF01380,IPR001347,202,329,128,133,184,52,0,0.0,0.0,215,0.0


In [2261]:
pfam_rp_15[pfam_rp_15['overl_len'] != 0]

Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,ali_from,ali_to,ali_length,overl_len,overl_pfam,overl_hmm,non_overl_len,overl_perc
335,A0A6P4D783,PF01380,IPR001347,181,309,129,305,350,46,5,3.88,10.87,166,2.94
547,T1HE95,PF01380,IPR001347,364,492,129,342,376,35,13,10.08,37.14,139,8.61


In [2262]:
pfam_rp_75[pfam_rp_75['overl_len'] != 0]

Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,ali_from,ali_to,ali_length,overl_len,overl_pfam,overl_hmm,non_overl_len,overl_perc
107,A0A0C4EII3,PF01380,IPR001347,367,472,106,326,367,42,1,0.94,2.38,147,0.68
794,A0A2T0WH78,PF00725,IPR006108,479,570,92,532,580,49,39,42.39,79.59,64,38.24
1573,A0A6P4D783,PF01380,IPR001347,181,309,129,305,350,46,5,3.88,10.87,166,2.94
1917,A0A8C4HUS4,PF01380,IPR001347,352,479,128,322,362,41,11,8.59,26.83,148,6.96
1921,A0A8C4HYL4,PF01380,IPR001347,301,428,128,328,368,41,41,32.03,100.0,88,32.03


### 4.2 Pfam results preprocessing (overall)

In [2265]:
# Combine all pfam results (RP 15%)
pfam_path = f'{directory}/results/pfam/pfam_overlap/rp_15'
dataframes = []

for filename in os.listdir(pfam_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(pfam_path, filename)
        df = pd.read_csv(file_path)
        dataframes.append(df)
        
pfam_hmm_rp_15_1 = pd.concat(dataframes, ignore_index=True)
pfam_hmm_rp_15_1.to_csv('results/pfam/pfam_hmm_rp_15_1.csv', index=False)
print(len(pfam_hmm_rp_15_1))
pfam_hmm_rp_15_1.head()

1651


Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,ali_from,ali_to,ali_length,overl_len,overl_pfam,overl_hmm,non_overl_len,overl_perc
0,A0A2K5X283,PF16959,IPR031588,617,770,154,769,805,37,2,1.3,5.41,188,1.06
1,A0A2K6SBD4,PF16959,IPR031588,618,770,153,769,805,37,2,1.31,5.41,187,1.06
2,A0A6J7ZVR2,PF00754,IPR000421,16,86,71,173,189,17,0,0.0,0.0,261,0.0
3,A0A6J7ZVR2,PF00754,IPR000421,149,255,107,173,189,17,17,15.89,100.0,91,15.89
4,A0A6J7ZVR2,PF00754,IPR000421,294,400,107,173,189,17,0,0.0,0.0,333,0.0


In [2266]:
# Combine all pfam results (RP 75%)
pfam_path = f'{directory}/results/pfam/pfam_overlap/rp_75'
dataframes = []

for filename in os.listdir(pfam_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(pfam_path, filename)
        df = pd.read_csv(file_path)
        dataframes.append(df)
        
pfam_hmm_rp_75_1 = pd.concat(dataframes, ignore_index=True)
pfam_hmm_rp_75_1.to_csv('results/pfam/pfam_hmm_rp_75_1.csv', index=False)
print(len(pfam_hmm_rp_75_1))
pfam_hmm_rp_75_1.head()

23010


Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,ali_from,ali_to,ali_length,overl_len,overl_pfam,overl_hmm,non_overl_len,overl_perc
0,A0A060VYC8,PF07686,IPR013106,21,101,81,15,110,96,81,100.0,84.38,16,84.38
1,A0A060VYC8,PF08205,IPR013162,119,197,79,15,110,96,0,0.0,0.0,192,0.0
2,A0A060W1A4,PF07686,IPR013106,23,99,77,18,110,93,77,100.0,82.8,17,82.8
3,A0A060W1A4,PF08205,IPR013162,118,195,78,18,110,93,0,0.0,0.0,186,0.0
4,A0A060WNT4,PF07686,IPR013106,27,108,82,21,116,96,82,100.0,85.42,15,85.42
