In [108]:
# Importing the libraries
from functions import *

## 1. HMM: Data preparation
We use an XML file with the BLAST results as input. Such files contain the information necessary to reproduce the MSA BLAST alignment: the query and subject IDs and sequences, the sequence lengths, and the alignment positions (start-end).

Here the work is performed on a single FASTA file: `curated_uniprot.fasta_75`.

In [109]:
# Parse the BLAST XML results into a dataframe.
# Point `input_file` at a different file if necessary.
input_file = f'{directory}/databases/uniprot/curated_uniprot.fasta_75'
df = blast_parser(input_file)
print("The number of instances: {}".format(len(df)))
df.head()

The number of instances: 8656


Unnamed: 0,query_id,subject_id,query_len,hsp_len,query_seq,match_seq,subject_seq,query_start,query_end,subject_start,subject_end,identity,positive,gaps,eval,bit_score,count
0,Q16620,A0A2R9BM51,822,822,MSSWIRWHGPAMARLWGFCWLVVGFWRAAFACPTSCKCSASRIWCS...,MSSWIRWHGPAMARLWGFCWLVVGFWRAAFACPTSCKCSASRIWCS...,MSSWIRWHGPAMARLWGFCWLVVGFWRAAFACPTSCKCSASRIWCS...,1,822,1,822,822,822,0,0.0,4458.0,200
1,Q16620,A0A4X2LP40,822,824,MSSWIRWHGPAMARLWGFCWLVVGFWRAAFACPTSCKCSASRIWCS...,M SW + HGP MARL GFCWLV+ FWR + ACPTSC CS +RIWCS...,MLSWKKCHGPGMARLLGFCWLVLIFWRGSQACPTSCTCSTTRIWCS...,1,822,1,824,689,745,2,0.0,3749.0,200
2,Q16620,A0A4X2LDU8,822,822,MSSWIRWHGPAMARLWGFCWLVVGFWRAAFACPTSCKCSASRIWCS...,M SW + HGP MARL GFCWLV+ FWR + ACPTSC CS +RIWCS...,MLSWKKCHGPGMARLLGFCWLVLIFWRGSQACPTSCTCSTTRIWCS...,1,822,1,821,689,745,1,0.0,3756.0,200
3,Q16620,A0A6P5IKH1,822,824,MSSWIRWHGPAMARLWGFCWLVVGFWRAAFACPTSCKCSASRIWCS...,M SW + HGP MARL GFCWLV+ FWR + ACPTSC CS +RIWCS...,MLSWKKCHGPGMARLLGFCWLVLIFWRGSQACPTSCTCSTTRIWCS...,1,822,1,824,691,745,2,0.0,3764.0,200
4,Q16620,A0A7J7UQA2,822,741,TSCKCSASRIWCSDPSPGIVAFPRLEPNSVDPENITEIFIANQKRL...,+SCKCSASRIWCSDP PGI+AFPRLEPN++DPENITEI+IANQKRL...,SSCKCSASRIWCSDPIPGIMAFPRLEPNTIDPENITEIYIANQKRL...,34,774,2,739,710,724,3,0.0,3776.0,200


To extract the information regarding the disordered regions we use the `curated.mjson` file, which contains the information from the DisProt database. Like `curated_uniprot.fasta`, it contains a list of Uniprot IDs, together with the start and end positions of the disordered regions. We will use this database to keep only the positions that belong to disordered regions.

In [110]:
# Create a dataframe with Disprot instances from the curated.mjson database.
# The file is read line by line; each line is decoded as one JSON object and
# `json_parser` returns the rows collected for that object.
data = []

with open('curated.mjson', 'r', encoding='utf-8') as file:
    for line in file:
        # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads
        if not line.strip():
            continue
        data.extend(json_parser(json.loads(line)))

curated_disprot = pd.DataFrame(data)

# Keep only the disordered regions whose source is DisProt. The explicit .copy() makes
# the result an independent dataframe, so the column assignments below do not go
# through pandas' chained-assignment (SettingWithCopyWarning) path.
curated_disprot = curated_disprot[
    (curated_disprot['feature'] == 'disorder') & (curated_disprot['source'] == 'disprot')
].copy()

# Length of each disordered region; the + 1 counts both the start and the end position
curated_disprot['length'] = curated_disprot['end'] - curated_disprot['start'] + 1

# Region label of the form <acc>_<start>-<end>, built column-wise so it also works
# when the filter above leaves no rows
curated_disprot['region'] = (
    curated_disprot['acc'].astype(str) + '_'
    + curated_disprot['start'].astype(str) + '-'
    + curated_disprot['end'].astype(str)
)
curated_disprot.to_csv("curated_disprot.csv", index=False)

print(f"The number of instances in the Curated Disprot database: {len(curated_disprot)}")
curated_disprot.head()

The number of instances in the Curated Disprot database: 3151


Unnamed: 0,acc,evidence,feature,source,start,end,length,region
6927,P03265,curated,disorder,disprot,294,334,41,P03265_294-334
6928,P03265,curated,disorder,disprot,454,464,11,P03265_454-464
6929,P49913,curated,disorder,disprot,134,170,37,P49913_134-170
6930,P03045,curated,disorder,disprot,1,107,107,P03045_1-107
6931,P00004,curated,disorder,disprot,1,105,105,P00004_1-105


In [112]:
# Keep only the rows of the initial dataframe (the BLAST results loaded from
# `curated_uniprot.fasta_75`) whose `query_id` appears in the `acc` column of
# `curated_disprot`, i.e. the queries that have annotated disordered regions.
disordered = df.loc[df['query_id'].isin(curated_disprot['acc'])]

disordered.to_csv("disordered_df.csv", index=False)

print("The number of rows with the disordered regions: {}".format(len(disordered)))
disordered.head()

The number of rows with the disordered regions: 2887


Unnamed: 0,query_id,subject_id,query_len,hsp_len,query_seq,match_seq,subject_seq,query_start,query_end,subject_start,subject_end,identity,positive,gaps,eval,bit_score,count
200,Q9H832,A0A6J2FM24,354,356,MAESPTEEAATA--GAGAAGPGASSVAGVVGVSGSGGGFGPPFLPD...,MAESPTEEAATA GAGAAGPGAS V GVVGVSGSG FGPPFLPD...,MAESPTEEAATATAGAGAAGPGASGVTGVVGVSGSG--FGPPFLPD...,1,354,1,354,350,350,4,0.0,1851.0,200
201,Q9H832,A0A3Q7W6Y2,354,356,MAESPTEEAATA--GAGAAGPGASSVAGVVGVSGSGGGFGPPFLPD...,MAESPTEEAATA GAGA GPGAS VAGVVGVSGSG FGPPFLPD...,MAESPTEEAATATAGAGATGPGASGVAGVVGVSGSG--FGPPFLPD...,1,354,1,354,350,350,4,0.0,1851.0,200
202,Q9H832,A0A2U3VK69,354,356,MAESPTEEAATA--GAGAAGPGASSVAGVVGVSGSGGGFGPPFLPD...,MAESPTEEAATA GAGAAGPGAS V GVVGVSGSG FGPPFLPD...,MAESPTEEAATATAGAGAAGPGASGVTGVVGVSGSG--FGPPFLPD...,1,354,1,354,350,350,4,0.0,1851.0,200
203,Q9H832,A0A2Y9JVH5,354,358,MAESPTEEAATA----GAGAAGPGASSVAGVVGVSGSGGGFGPPFL...,MAESPTEEAATA GAGAAGPGAS VAGVVGVSGSG FGPPFL...,MAESPTEEAATATATAGAGAAGPGASGVAGVVGVSGSG--FGPPFL...,1,354,1,356,351,351,6,0.0,1854.0,200
204,Q9H832,A0A8C7ALE4,354,358,MAESPTEEAATA----GAGAAGPGASSVAGVVGVSGSGGGFGPPFL...,MAESPTEEAATA GAGAAGPGAS VAGVVGVSGSG FGPPFL...,MAESPTEEAATATATAGAGAAGPGASGVAGVVGVSGSG--FGPPFL...,1,354,1,356,351,351,6,0.0,1854.0,200


In [113]:
# Unique query IDs present in the `disordered` dataframe
disprot_ids = disordered["query_id"].unique()

# Selection list (10 visible rows) for picking the Uniprot query ID to work on
output = widgets.Select(
    description='Uniprot ID: ',
    options=disprot_ids,
    rows=10,
    disabled=False,
    layout={'width': 'max-content'},
)

display(output)

Select(description='Uniprot ID: ', layout=Layout(width='max-content'), options=('Q9H832', 'Q8IW19', 'Q99967', …

In [2121]:
# Check if there are disordered regions for a particular query ID
id_dis = output.value
i = 1 # change to the necessary region
# store the data
%store id_dis i
selected_dis = disordered[disordered["query_id"].isin([id_dis])]
# selected_dis.head()

Stored 'id_dis' (str)
Stored 'i' (int)


In [2122]:
# Rows of curated_disprot that belong to the selected ID
curated_query = curated_disprot.loc[curated_disprot['acc'] == id_dis]
curated_query

Unnamed: 0,acc,evidence,feature,source,start,end,length,region
7799,Q9H832,curated,disorder,disprot,1,99,99,Q9H832_1-99
7800,Q9H832,curated,disorder,disprot,327,354,28,Q9H832_327-354


## 2. hmmbuild
We build an HMM for each disordered region, using the trimmed MSA as input.

In [2123]:
# Locations of the HMM file and of the MSA file for the selected ID and region
hmm_file = '{}/results/hmms/hmmbuild/{}_{}.hmm'.format(directory, id_dis, i)
align_file = '{}/results/alignments/output_files/disordered/{}_{}.fasta'.format(directory, id_dis, i)

In [2124]:
# for id_dis in disprot_ids:
#     hmm_file = f'{directory}/results/hmms/hmmbuild/{id_dis}.hmm'
#     align_file = f'{directory}/results/alignments/output_files/disordered/{id_dis}_{i}.fasta'
#     print(hmm_file, align_file)

In [2125]:
# Build HMM
!hmmbuild {hmm_file} {align_file}

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/results/alignments/output_files/disordered/Q9H832_1.fasta
# output HMM file:                  /Users/alina/HMM/results/hmms/hmmbuild/Q9H832_1.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     Q9H832_1               201    99    99     2.96  0.590 

# CPU time: 0.05u 0.00s 00:00:00.05 Elapsed: 00:00:00.04


## 3. hmmsearch

After building the model, our objective is to assess whether overlaps with the profiles in Reference Proteome 15% exist and to enrich the model using this database. We generate dataframes containing the most significant sequences, with a default E-value threshold of 0.01.

In [2126]:
# # Copy the HMM file to remote computer to perform HMM search on a cluster
# !scp {directory}/results/hmms/hmmbuild/{id_dis}_{i}.hmm {name}@{server}:~/{id_dis}_{i}.hmm # one region

In [2127]:
# # hmmsearch against Reference Proteome 15%
# !ssh {name}@{server} "/software/packages/hmmer/hmmer-3.3.2/usr/bin/hmmsearch {id_dis}_{i}.hmm /db/rp/rp-seqs-15.fasta.gz > hmmsearch_rp_15_{id_dis}_{i}.txt"

In [2128]:
# # hmmsearch against Reference Proteome 75%
# !ssh {name}@{server} "/software/packages/hmmer/hmmer-3.3.2/usr/bin/hmmsearch {id_dis}_{i}.hmm /db/rp/rp-seqs-75.fasta.gz > hmmsearch_rp_75_{id_dis}_{i}.txt"

In [2129]:
# # Copy results (RP 15%) to the local folder
# !scp {name}@{server}:~/hmmsearch_rp_15_{id_dis}_{i}.txt {directory}/results/hmms/hmmsearch/

In [2130]:
# # Copy results (RP 75%) to the local folder
# !scp {name}@{server}:~/hmmsearch_rp_75_{id_dis}_{i}.txt {directory}/results/hmms/hmmsearch/

In [2131]:
# # Remove the files from the remote folder
# !ssh {name}@{server} "rm ~/{id_dis}_{i}.hmm"
# !ssh {name}@{server} "rm ~/hmmsearch_rp_15_{id_dis}_{i}.txt"
# !ssh {name}@{server} "rm ~/hmmsearch_rp_75_{id_dis}_{i}.txt"

### 3.1 hmmsearch results preprocessing
### Reference Proteome 15%

In [2132]:
# 1. Parse the first table of the hmmsearch output (the statistics) into a dataframe
stats_rp_15 = process_hmmsearch_file("{}/results/hmms/hmmsearch/hmmsearch_rp_15_{}_{}.txt".format(directory, id_dis, i))
# stats_rp_15.head()

In [2133]:
# 2. Parse the rest of the same hmmsearch output into a dataframe of the extracted regions
hmm_rp_15 = "{}/results/hmms/hmmsearch/hmmsearch_rp_15_{}_{}.txt".format(directory, id_dis, i)
hmmsearch_regions_rp_15 = extract_table_from_output(hmm_rp_15)
# hmmsearch_regions_rp_15.head()

In [2134]:
# 3. Combine the results into a dataframe and save the file
hmmsearch_results_rp_15 = pd.merge(stats_rp_15, hmmsearch_regions_rp_15, left_on="Sequence", right_on="id", how="inner")
hmmsearch_results_rp_15 = hmmsearch_results_rp_15.drop(columns=["Description", "id"])
hmmsearch_results_rp_15.to_csv(f"{directory}/results/hmms/hmmsearch/stats/stats_rp_15_{id_dis}_{i}.csv", index=False)
print(f"Number of hmmsearch hits for the {i} disordered region of the {id_dis} protein (RP 15%):", len(hmmsearch_results_rp_15))
%store hmmsearch_results_rp_15
hmmsearch_results_rp_15.head()

Number of hmmsearch hits for the 1 disordered region of the Q9H832 protein (RP 15%): 11
Stored 'hmmsearch_results_rp_15' (DataFrame)


Unnamed: 0,E-value,score,bias,E-value.1,score.1,bias.1,exp,N,Sequence,ali_from,ali_to,ali_length
0,1.6e-47,169.6,18.6,2.5999999999999997e-47,168.9,18.6,1.4,1,Q9H832,1,99,99
1,3.9e-47,168.3,18.6,6.4e-47,167.6,18.6,1.4,1,A0A7N9CX17,1,99,99
2,2.5e-45,162.5,18.8,4.1e-45,161.8,18.8,1.4,1,Q3UE37,1,101,101
3,2.6e-45,162.4,19.1,4.3e-45,161.7,19.1,1.4,1,Q3B7D1,1,101,101
4,5.1e-34,126.2,22.4,9.4e-33,122.2,22.4,2.3,1,A0A2K5VU74,1,86,86


In [2135]:
# Unique sequence IDs among the hmmsearch hits (RP 15%). The column is selected by
# name instead of by position (it was iloc[:, 8]) so that a change in the column
# order cannot silently pick a different column.
subject_ids_rp_15 = hmmsearch_results_rp_15["Sequence"].unique()
print(f"The number of unique IDs for the {id_dis} MSA:", len(subject_ids_rp_15))

The number of unique IDs for the Q9H832 MSA: 11


### Reference Proteome 75%

In [2136]:
# 1. Parse the first table of the hmmsearch output (the statistics) into a dataframe
stats_rp_75 = process_hmmsearch_file("{}/results/hmms/hmmsearch/hmmsearch_rp_75_{}_{}.txt".format(directory, id_dis, i))
# stats_rp_75.head()

In [2137]:
# 2. Parse the rest of the same hmmsearch output into a dataframe of the extracted regions
hmm_rp_75 = "{}/results/hmms/hmmsearch/hmmsearch_rp_75_{}_{}.txt".format(directory, id_dis, i)
hmmsearch_regions_rp_75 = extract_table_from_output(hmm_rp_75)
# hmmsearch_regions_rp_75.head()

In [2138]:
# 3. Combine the results into a dataframe and save the file
hmmsearch_results_rp_75 = pd.merge(stats_rp_75, hmmsearch_regions_rp_75, left_on="Sequence", right_on="id", how="inner")
hmmsearch_results_rp_75 = hmmsearch_results_rp_75.drop(columns=["Description", "id"])
hmmsearch_results_rp_75.to_csv(f"{directory}/results/hmms/hmmsearch/stats/stats_rp_75_{id_dis}_{i}.csv", index=False)
print(f"Number of hmmsearch hits for the {i} disordered region of the {id_dis} protein (RP 75%):", len(hmmsearch_results_rp_75))
%store hmmsearch_results_rp_75
hmmsearch_results_rp_75.head()

Number of hmmsearch hits for the 1 disordered region of the Q9H832 protein (RP 75%): 288
Stored 'hmmsearch_results_rp_75' (DataFrame)


Unnamed: 0,E-value,score,bias,E-value.1,score.1,bias.1,exp,N,Sequence,ali_from,ali_to,ali_length
0,1.4e-46,169.6,18.6,2.2e-46,168.9,18.6,1.4,1,K7BAZ7,1,99,99
1,1.4e-46,169.6,18.6,2.2e-46,168.9,18.6,1.4,1,Q9H832,1,99,99
2,1.5e-46,169.4,18.5,2.3999999999999997e-46,168.7,18.5,1.4,1,A0A2K5RV48,1,99,99
3,1.5e-46,169.4,18.5,2.3999999999999997e-46,168.7,18.5,1.4,1,A0A6J3I1K6,1,99,99
4,1.5e-46,169.4,18.5,2.3999999999999997e-46,168.7,18.5,1.4,1,A0A8I5UEF8,1,99,99


In [2139]:
# Unique sequence IDs among the hmmsearch hits (RP 75%). The column is selected by
# name instead of by position (it was iloc[:, 8]) so that a change in the column
# order cannot silently pick a different column.
subject_ids_rp_75 = hmmsearch_results_rp_75["Sequence"].unique()
print(f"The number of unique IDs for the {id_dis} MSA:", len(subject_ids_rp_75))

The number of unique IDs for the Q9H832 MSA: 288


## 4. Pfam
### 4.1 Pfam results preprocessing

In [2140]:
# # Copy the files with the statistics to the remote computer
# !scp {directory}/results/hmms/hmmsearch/stats/stats_rp_15_{id_dis}_{i}.csv {name}@{server}:~/stats/stats_rp_15/stats_rp_15_{id_dis}_{i}.csv
# !scp {directory}/results/hmms/hmmsearch/stats/stats_rp_75_{id_dis}_{i}.csv {name}@{server}:~/stats/stats_rp_75/stats_rp_75_{id_dis}_{i}.csv

In [2141]:
# # Check the overlaps with Interpro domains (RP 15%)
# !ssh {name}@{server} "/home/alina/protein2ipr.py /home/alina/stats/stats_rp_15 /home/alina/filtered.tsv.gz protein2ipr_rp_15.tsv"

In [2142]:
# # Check the overlaps with Interpro domains (RP 75%)
# !ssh {name}@{server} "/home/alina/protein2ipr.py /home/alina/stats/stats_rp_75 /home/alina/filtered.tsv.gz protein2ipr_rp_75.tsv"

In [2143]:
# # Copy the files to the local folder
# !scp {name}@{server}:~/protein2ipr_rp_15.tsv {directory}/results/pfam
# !scp {name}@{server}:~/protein2ipr_rp_75.tsv {directory}/results/pfam

In [2144]:
# Load the processed protein2ipr table (RP 15%) and keep only the entries whose
# Uniprot ID is among the hmmsearch hits
filename = '{}/results/pfam/protein2ipr_rp_15.tsv'.format(directory)
pfam_rp_15 = pfam_processing(filename)

pfam_rp_15 = pfam_rp_15.loc[pfam_rp_15['uniprot_id'].isin(subject_ids_rp_15)]
print("The number of retrieved Pfam instances for the {}_{} protein (RP 15%): {}".format(id_dis, i, len(pfam_rp_15)))
# print("The number of Uniprot instances not covered by Pfam (RP 15%):", len(hmmsearch_results_rp_15) - len(pfam_rp_15))
pfam_rp_15.head()

The number of retrieved Pfam instances for the Q9H832_1 protein (RP 15%): 6


Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam
284,A0A2K5VU74,PF00179,IPR000608,90,213,124
578,A0A4Z2GIF1,PF00179,IPR000608,133,259,127
804,A0A7N9CX17,PF00179,IPR000608,103,226,124
1300,Q3B7D1,PF00179,IPR000608,105,228,124
1301,Q3UE37,PF00179,IPR000608,105,228,124


In [2145]:
# Load the processed protein2ipr table (RP 75%) and keep only the entries whose
# Uniprot ID is among the hmmsearch hits
filename = '{}/results/pfam/protein2ipr_rp_75.tsv'.format(directory)
pfam_rp_75 = pfam_processing(filename)

pfam_rp_75 = pfam_rp_75.loc[pfam_rp_75['uniprot_id'].isin(subject_ids_rp_75)]
print("The number of retrieved Pfam instances for the {}_{} protein (RP 75%): {}".format(id_dis, i, len(pfam_rp_75)))
# print("The number of Uniprot instances not covered by Pfam (RP 75%):", len(hmmsearch_results_rp_75) - len(pfam_rp_75))
pfam_rp_75.head()

The number of retrieved Pfam instances for the Q9H832_1 protein (RP 75%): 205


Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam
37,A0A060WJZ8,PF00179,IPR000608,143,266,124
424,A0A096P442,PF00179,IPR000608,103,226,124
531,A0A0D3HLA7,PF13962,IPR026961,441,546,106
532,A0A0D3HLA7,PF13962,IPR026961,980,1085,106
533,A0A0D3HLA7,PF13962,IPR026961,1591,1696,106


In [2146]:
# Merge with hmmsearch result - for one MSA (RP 15%): attach the alignment
# coordinates of each hit to the Pfam entries of the same Uniprot ID.
ali_columns = ['ali_from', 'ali_to', 'ali_length']

pfam_rp_15 = pd.merge(
    # Drop coordinates left over from a previous run of this cell. Without this, a
    # re-run would merge onto a frame that already has these columns, pandas would
    # suffix them (ali_from_x / ali_from_y) and the conversion below would fail.
    pfam_rp_15.drop(columns=ali_columns, errors='ignore'),
    hmmsearch_results_rp_15[['Sequence'] + ali_columns],
    left_on='uniprot_id', right_on='Sequence', how='left',
)
# Rows containing a missing value in any column are removed, so the coordinates
# can then be converted to integers
pfam_rp_15 = pfam_rp_15.dropna(axis=0)
pfam_rp_15 = pfam_rp_15.astype({column: int for column in ali_columns})
pfam_rp_15 = pfam_rp_15.drop(columns='Sequence')
# pfam_rp_15.to_csv(f'results/pfam/pfam_overlap/pfam_overlap_{id_dis}.csv', index=False)
# print(f"The number of overlaps for {id_dis}_{i} (RP 15%):", len(pfam_rp_15))
pfam_rp_15.head()

Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,ali_from,ali_to,ali_length
0,A0A2K5VU74,PF00179,IPR000608,90,213,124,1,86,86
1,A0A4Z2GIF1,PF00179,IPR000608,133,259,127,41,129,89
2,A0A7N9CX17,PF00179,IPR000608,103,226,124,1,99,99
3,Q3B7D1,PF00179,IPR000608,105,228,124,1,101,101
4,Q3UE37,PF00179,IPR000608,105,228,124,1,101,101


In [2147]:
# Merge with hmmsearch result - for one MSA (RP 75%): attach the alignment
# coordinates of each hit to the Pfam entries of the same Uniprot ID.
ali_columns = ['ali_from', 'ali_to', 'ali_length']

pfam_rp_75 = pd.merge(
    # Drop coordinates left over from a previous run of this cell. Without this, a
    # re-run would merge onto a frame that already has these columns, pandas would
    # suffix them (ali_from_x / ali_from_y) and the conversion below would fail.
    pfam_rp_75.drop(columns=ali_columns, errors='ignore'),
    hmmsearch_results_rp_75[['Sequence'] + ali_columns],
    left_on='uniprot_id', right_on='Sequence', how='left',
)
# Rows containing a missing value in any column are removed, so the coordinates
# can then be converted to integers
pfam_rp_75 = pfam_rp_75.dropna(axis=0)
pfam_rp_75 = pfam_rp_75.astype({column: int for column in ali_columns})
pfam_rp_75 = pfam_rp_75.drop(columns='Sequence')
# pfam_rp_75.to_csv(f'results/pfam/pfam_overlap/pfam_overlap_{id_dis}.csv', index=False)
# print(f"The number of overlaps for {id_dis}_{i} (RP 75%):", len(pfam_rp_75))
pfam_rp_75.head()

Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,ali_from,ali_to,ali_length
0,A0A060WJZ8,PF00179,IPR000608,143,266,124,45,139,95
1,A0A096P442,PF00179,IPR000608,103,226,124,1,99,99
2,A0A0D3HLA7,PF13962,IPR026961,441,546,106,183,257,75
3,A0A0D3HLA7,PF13962,IPR026961,980,1085,106,183,257,75
4,A0A0D3HLA7,PF13962,IPR026961,1591,1696,106,183,257,75


In [2148]:
# Add the overlaps to the dataframe (RP 15%)
overlap_columns = ['overlap_pfam_%', 'overlap_hmm_%', 'overlap_pfam_len', 'overlap_hmm_len']

# `i` is the 1-based number of the DisProt region. Reject out-of-range values
# explicitly: i = 0 would otherwise make iloc[-1] silently select the last region.
if not 1 <= i <= len(curated_query):
    raise IndexError(f"Region number i={i} is out of range: {id_dis} has {len(curated_query)} disordered region(s)")
row_disprot = curated_query.iloc[i-1]  # the same DisProt region is used for every Pfam row

overlap_percentages = []
for _, row_pfam in pfam_rp_15.iterrows():
    overl_perc_pfam, overl_perc_hmm, overl_len_pfam, overl_len_hmm = calculate_overlap(row_pfam, row_disprot)
    # Percentages are rounded to two decimals; the lengths are stored as returned.
    # NOTE(review): the stored lengths can be negative (see the saved output) - confirm
    # that this is the intended result of calculate_overlap for non-intersecting regions.
    overlap_percentages.append((round(overl_perc_pfam, 2), round(overl_perc_hmm, 2), overl_len_pfam, overl_len_hmm))

# Turn the list of tuples into the four overlap columns. Going through a DataFrame
# (instead of unpacking zip(*...)) also works when there are no Pfam rows, in which
# case the columns are simply empty.
overlaps = pd.DataFrame(overlap_percentages, columns=overlap_columns)
for column in overlap_columns:
    # Positional assignment: the tuples are in the same order as the rows of pfam_rp_15
    pfam_rp_15[column] = overlaps[column].to_numpy()

print("DisProt start:", curated_query['start'].iloc[i-1], "\n"
      "DisProt end:", curated_query['end'].iloc[i-1], "\n"
      "DisProt length:", curated_query['length'].iloc[i-1])
# Written under `directory`, like the other result files of this notebook, so the
# location does not depend on the current working directory
pfam_rp_15.to_csv(f'{directory}/results/pfam/pfam_overlap/rp_15/pfam_rp_15_{id_dis}_{i}.csv', index=False)
pfam_rp_15.head()

DisProt start: 1 
DisProt end: 99 
DisProt length: 99


Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,ali_from,ali_to,ali_length,overlap_pfam_%,overlap_hmm_%,overlap_pfam_len,overlap_hmm_len
0,A0A2K5VU74,PF00179,IPR000608,90,213,124,1,86,86,10.1,86.87,10,86
1,A0A4Z2GIF1,PF00179,IPR000608,133,259,127,41,129,89,0.0,59.6,-33,59
2,A0A7N9CX17,PF00179,IPR000608,103,226,124,1,99,99,0.0,100.0,-3,99
3,Q3B7D1,PF00179,IPR000608,105,228,124,1,101,101,0.0,100.0,-5,99
4,Q3UE37,PF00179,IPR000608,105,228,124,1,101,101,0.0,100.0,-5,99


In [2149]:
# Add the overlaps to the dataframe (RP 75%)
overlap_columns = ['overlap_pfam_%', 'overlap_hmm_%', 'overlap_pfam_len', 'overlap_hmm_len']

# `i` is the 1-based number of the DisProt region. Reject out-of-range values
# explicitly: i = 0 would otherwise make iloc[-1] silently select the last region.
if not 1 <= i <= len(curated_query):
    raise IndexError(f"Region number i={i} is out of range: {id_dis} has {len(curated_query)} disordered region(s)")
row_disprot = curated_query.iloc[i-1]  # the same DisProt region is used for every Pfam row

overlap_percentages = []
for _, row_pfam in pfam_rp_75.iterrows():
    overl_perc_pfam, overl_perc_hmm, overl_len_pfam, overl_len_hmm = calculate_overlap(row_pfam, row_disprot)
    # Percentages are rounded to two decimals; the lengths are stored as returned.
    # NOTE(review): the stored lengths can be negative (see the saved output) - confirm
    # that this is the intended result of calculate_overlap for non-intersecting regions.
    overlap_percentages.append((round(overl_perc_pfam, 2), round(overl_perc_hmm, 2), overl_len_pfam, overl_len_hmm))

# Turn the list of tuples into the four overlap columns. Going through a DataFrame
# (instead of unpacking zip(*...)) also works when there are no Pfam rows, in which
# case the columns are simply empty.
overlaps = pd.DataFrame(overlap_percentages, columns=overlap_columns)
for column in overlap_columns:
    # Positional assignment: the tuples are in the same order as the rows of pfam_rp_75
    pfam_rp_75[column] = overlaps[column].to_numpy()

print("DisProt start:", curated_query['start'].iloc[i-1], "\n"
      "DisProt end:", curated_query['end'].iloc[i-1], "\n"
      "DisProt length:", curated_query['length'].iloc[i-1])
# Written under `directory`, like the other result files of this notebook, so the
# location does not depend on the current working directory
pfam_rp_75.to_csv(f'{directory}/results/pfam/pfam_overlap/rp_75/pfam_rp_75_{id_dis}_{i}.csv', index=False)
pfam_rp_75.head()

DisProt start: 1 
DisProt end: 99 
DisProt length: 99


Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,ali_from,ali_to,ali_length,overlap_pfam_%,overlap_hmm_%,overlap_pfam_len,overlap_hmm_len
0,A0A060WJZ8,PF00179,IPR000608,143,266,124,45,139,95,0.0,55.56,-43,55
1,A0A096P442,PF00179,IPR000608,103,226,124,1,99,99,0.0,100.0,-3,99
2,A0A0D3HLA7,PF13962,IPR026961,441,546,106,183,257,75,0.0,0.0,-341,-83
3,A0A0D3HLA7,PF13962,IPR026961,980,1085,106,183,257,75,0.0,0.0,-880,-83
4,A0A0D3HLA7,PF13962,IPR026961,1591,1696,106,183,257,75,0.0,0.0,-1491,-83


### 4.2 Pfam results preprocessing (overall)

In [2150]:
# # Combine all pfam results (RP 15%)
# pfam_path = f'{directory}/results/pfam/pfam_overlap/rp_15'
# dataframes = []

# for filename in os.listdir(pfam_path):
#     if filename.endswith('.csv'):
#         file_path = os.path.join(pfam_path, filename)
#         df = pd.read_csv(file_path)
#         dataframes.append(df)
        
# pfam_hmm_rp_15 = pd.concat(dataframes, ignore_index=True)
# pfam_hmm_rp_15.to_csv('results/pfam/pfam_hmm_rp_15.csv', index=False)
# print(len(pfam_hmm_rp_15))
# pfam_hmm_rp_15.head()

In [2151]:
# # Combine all pfam results (RP 75%)
# pfam_path = f'{directory}/results/pfam/pfam_overlap/rp_75'
# dataframes = []

# for filename in os.listdir(pfam_path):
#     if filename.endswith('.csv'):
#         file_path = os.path.join(pfam_path, filename)
#         df = pd.read_csv(file_path)
#         dataframes.append(df)
        
# pfam_hmm_rp_75 = pd.concat(dataframes, ignore_index=True)
# pfam_hmm_rp_75.to_csv('results/pfam/pfam_hmm_rp_75.csv', index=False)
# print(len(pfam_hmm_rp_75))
# pfam_hmm_rp_75.head()