In [1]:
# Importing the libraries
from functions import *

## HMM: Data preparation
In this part we generate HMM models based on MSA and retrieve the results from hmmsearch and then from Pfam.

In [2]:
# Set the maximum width of the columns
pd.set_option('display.max_colwidth', 20)

In [3]:
# Load the disordered dataframe
disordered = pd.read_csv('disordered_df.csv')
curated_disprot = pd.read_csv('curated_disprot.csv')

print(f'The number of rows with the disordered regions: {len(disordered)}')
# disordered.head()

The number of rows with the disordered regions: 7393


In [4]:
# Collect the disordered regions in a dictionary
dis_regs = set()

for i, row in disordered.iterrows():
    dis_id = row[0]
    matching_row = curated_disprot[curated_disprot['acc'] == dis_id]
    if not matching_row.empty:
        region = matching_row['region']
        dis_regs.update(region)
        dis_regs_list = list(dis_regs) # convert set to a list
        
# Define an array of disordered regions ids
disprot_ids = disordered['query_id'].unique()

# print('The disordered regions:', dis_regs_list)
print(f'The number of the proteins with the disordered regions: {len(disprot_ids)}')
print(f'The number of the disordered regions in the proteins: {len(dis_regs_list)}')

The number of the proteins with the disordered regions: 39
The number of the disordered regions in the proteins: 53


The results above show how many proteins with the disordered regions are in the provided data, as well as how many disordered regions do they contain.

## 1. hmmbuild
We build an HMM of each disordered region, using trimmed MSA as an input. We will build the models separately for BLAST and ClustalOmega to be able to compare the quality of the models based on the local and the global alignment.

In [5]:
# Set the paths to HMM and MSA files - BLAST
hmm_dir_blast = f'{directory}/results/hmms/hmmbuild/blast'
align_dir_blast = f'{directory}/results/alignments/output_files/disordered/blast'

# Set the paths to HMM and MSA files - ClustalOmega
hmm_dir_clustal = f'{directory}/results/hmms/hmmbuild/clustal'
align_dir_clustal = f'{directory}/results/alignments/output_files/disordered/clustal'

In [6]:
# Build HMM for BLAST
for filename in os.listdir(align_dir_blast):
    if filename.endswith('.fasta'):
        input_file = os.path.join(align_dir_blast, filename)
        output_file = os.path.join(hmm_dir_blast, os.path.splitext(filename)[0] + '.hmm')

        subprocess.run(['hmmbuild', output_file, input_file])
        print(f'hmmbuild completed for {filename}')

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/results/alignments/output_files/disordered/blast/S7W634_1-32.fasta
# output HMM file:                  /Users/alina/HMM/results/hmms/hmmbuild/blast/S7W634_1-32.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     S7W634_1-32            200    31    31    97.18  1.740 

# CPU time: 0.02u 0.00s 00:00:00.02 Elapsed: 00:00:00.03
hmmbuild completed for S7W634_1-32.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); h

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/results/alignments/output_files/disordered/blast/Q9H832_1-99.fasta
# output HMM file:                  /Users/alina/HMM/results/hmms/hmmbuild/blast/Q9H832_1-99.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     Q9H832_1-99            200    98    98     2.94  0.590 

# CPU time: 0.04u 0.00s 00:00:00.04 Elapsed: 00:00:00.04
hmmbuild completed for Q9H832_1-99.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); h

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/results/alignments/output_files/disordered/blast/Q8K4J6_155-186.fasta
# output HMM file:                  /Users/alina/HMM/results/hmms/hmmbuild/blast/Q8K4J6_155-186.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     Q8K4J6_155-186         200    31    31     4.06  1.741 

# CPU time: 0.01u 0.00s 00:00:00.01 Elapsed: 00:00:00.01
hmmbuild completed for Q8K4J6_155-186.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/results/alignments/output_files/disordered/blast/S6B291_240-465.fasta
# output HMM file:                  /Users/alina/HMM/results/hmms/hmmbuild/blast/S6B291_240-465.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     S6B291_240-465         200   225   225     1.19  0.589 

# CPU time: 0.08u 0.00s 00:00:00.08 Elapsed: 00:00:00.08
hmmbuild completed for S6B291_240-465.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov

hmmbuild completed for P07342_580-595.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/results/alignments/output_files/disordered/blast/Q9BYI3_149-253.fasta
# output HMM file:                  /Users/alina/HMM/results/hmms/hmmbuild/blast/Q9BYI3_149-253.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     Q9BYI3_149-253         200   104   104     0.41  0.589 

# CPU time: 0.04u 0.00s 00:00:00.04 Elapsed: 00:00:00.04
hmmbuild completed for Q9BYI3_149-253.fasta
# hmmbuild :: profile HMM construction from mul

In [7]:
# Build HMM for ClustalOmega
for filename in os.listdir(align_dir_clustal):
    if filename.endswith('.fasta'):
        input_file = os.path.join(align_dir_clustal, filename)
        output_file = os.path.join(hmm_dir_clustal, os.path.splitext(filename)[0] + '.hmm')

        subprocess.run(['hmmbuild', output_file, input_file])
        print(f'hmmbuild completed for {filename}')

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/results/alignments/output_files/disordered/clustal/S7W634_1-32.fasta
# output HMM file:                  /Users/alina/HMM/results/hmms/hmmbuild/clustal/S7W634_1-32.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     S7W634_1-32            196    31    30   196.00  1.348 

# CPU time: 0.01u 0.00s 00:00:00.01 Elapsed: 00:00:00.01
hmmbuild completed for S7W634_1-32.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/results/alignments/output_files/disordered/clustal/Q9H832_1-99.fasta
# output HMM file:                  /Users/alina/HMM/results/hmms/hmmbuild/clustal/Q9H832_1-99.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     Q9H832_1-99            190    98    91     7.24  0.627 

# CPU time: 0.04u 0.00s 00:00:00.04 Elapsed: 00:00:00.03
hmmbuild completed for Q9H832_1-99.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/results/alignments/output_files/disordered/clustal/Q8K4J6_155-186.fasta
# output HMM file:                  /Users/alina/HMM/results/hmms/hmmbuild/clustal/Q8K4J6_155-186.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     Q8K4J6_155-186         194    31    31     1.68  1.738 

# CPU time: 0.01u 0.00s 00:00:00.01 Elapsed: 00:00:00.01
hmmbuild completed for Q8K4J6_155-186.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/results/alignments/output_files/disordered/clustal/S6B291_240-465.fasta
# output HMM file:                  /Users/alina/HMM/results/hmms/hmmbuild/clustal/S6B291_240-465.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     S6B291_240-465         193   225   220     0.64  0.590 

# CPU time: 0.08u 0.00s 00:00:00.08 Elapsed: 00:00:00.08
hmmbuild completed for S6B291_240-465.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/results/alignments/output_files/disordered/clustal/P07342_580-595.fasta
# output HMM file:                  /Users/alina/HMM/results/hmms/hmmbuild/clustal/P07342_580-595.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     P07342_580-595         194    15    15     5.31  3.461 

# CPU time: 0.01u 0.00s 00:00:00.01 Elapsed: 00:00:00.01
hmmbuild completed for P07342_580-595.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 

## 2. hmmsearch

After building the models, our objective is to assess if overlaps with the profiles in the sequence database (Reference Proteome 75%) exist. We retrive the data containing the most significant sequences, with a default E-value threshold of 0.01.

The same as before, we do it for BLAST and ClustalOmega separately.

In [8]:
# Copy the HMMs from the BLAST folder
for filename in os.listdir(hmm_dir_blast):
    source_path = os.path.join(hmm_dir_blast, filename)
    destination_path = f'{name}@{server}:~/hmm/blast/hmmbuild/{filename}'
    subprocess.run(['scp', source_path, destination_path])
    print(f'File {filename} copied to {destination_path}')

File O43791_169-178.hmm copied to alina@ecate:~/hmm/blast/hmmbuild/O43791_169-178.hmm
File P04370-5_1-169.hmm copied to alina@ecate:~/hmm/blast/hmmbuild/P04370-5_1-169.hmm
File Q9H832_327-354.hmm copied to alina@ecate:~/hmm/blast/hmmbuild/Q9H832_327-354.hmm
File O35274_584-602.hmm copied to alina@ecate:~/hmm/blast/hmmbuild/O35274_584-602.hmm
File Q8IW19_399-420.hmm copied to alina@ecate:~/hmm/blast/hmmbuild/Q8IW19_399-420.hmm
File Q8R464_25-120.hmm copied to alina@ecate:~/hmm/blast/hmmbuild/Q8R464_25-120.hmm
File Q93KQ4_51-81.hmm copied to alina@ecate:~/hmm/blast/hmmbuild/Q93KQ4_51-81.hmm
File P01019_436-450.hmm copied to alina@ecate:~/hmm/blast/hmmbuild/P01019_436-450.hmm
File S7W634_1-32.hmm copied to alina@ecate:~/hmm/blast/hmmbuild/S7W634_1-32.hmm
File Q99967_220-269.hmm copied to alina@ecate:~/hmm/blast/hmmbuild/Q99967_220-269.hmm
File Q5T4W7_108-120.hmm copied to alina@ecate:~/hmm/blast/hmmbuild/Q5T4W7_108-120.hmm
File A4L7I2_1658-1856.hmm copied to alina@ecate:~/hmm/blast/hmmbui

In [9]:
# Copy the HMMs from the Clustal folder
for filename in os.listdir(hmm_dir_clustal):
    source_path = os.path.join(hmm_dir_clustal, filename)
    destination_path = f'{name}@{server}:~/hmm/clustal/hmmbuild/{filename}'
    subprocess.run(['scp', source_path, destination_path])
    print(f'File {filename} copied to {destination_path}')

File O43791_169-178.hmm copied to alina@ecate:~/hmm/clustal/hmmbuild/O43791_169-178.hmm
File P04370-5_1-169.hmm copied to alina@ecate:~/hmm/clustal/hmmbuild/P04370-5_1-169.hmm
File Q9H832_327-354.hmm copied to alina@ecate:~/hmm/clustal/hmmbuild/Q9H832_327-354.hmm
File O35274_584-602.hmm copied to alina@ecate:~/hmm/clustal/hmmbuild/O35274_584-602.hmm
File Q8IW19_399-420.hmm copied to alina@ecate:~/hmm/clustal/hmmbuild/Q8IW19_399-420.hmm
File Q8R464_25-120.hmm copied to alina@ecate:~/hmm/clustal/hmmbuild/Q8R464_25-120.hmm
File Q93KQ4_51-81.hmm copied to alina@ecate:~/hmm/clustal/hmmbuild/Q93KQ4_51-81.hmm
File P01019_436-450.hmm copied to alina@ecate:~/hmm/clustal/hmmbuild/P01019_436-450.hmm
File S7W634_1-32.hmm copied to alina@ecate:~/hmm/clustal/hmmbuild/S7W634_1-32.hmm
File Q99967_220-269.hmm copied to alina@ecate:~/hmm/clustal/hmmbuild/Q99967_220-269.hmm
File Q5T4W7_108-120.hmm copied to alina@ecate:~/hmm/clustal/hmmbuild/Q5T4W7_108-120.hmm
File A4L7I2_1658-1856.hmm copied to alina@ec

After copying the files, we run `hmmsearch` command. The following code should be uncommented if there are new files provided. However, it requires some time to retrieve the output.

In [11]:
# Construct the source and the target directories for BLAST
blast_hmmbuild_dir = f"/home/{name}/hmm/blast/hmmbuild"
blast_hmmsearch_dir = f"/home/{name}/hmm/blast/hmmsearch"

# List files in the source directory on the remote machine
ls_command = f'ssh {name}@{server} "ls {blast_hmmbuild_dir}"'
file_list = !{ls_command}

# # Iterate over each filename - RUNS FOR A LONG TIME
# for filename in file_list:
#     filename = filename.strip()
#     source_path = f'{blast_hmmbuild_dir}/{filename}'
#     dest_filename = f'{os.path.splitext(filename)[0]}'
#     destination_path = f'{blast_hmmsearch_dir}/hmmsearch_blast_{dest_filename}.txt'
    
#     # Run hmmsearch command using ssh without redirection
#     !ssh {name}@{server} "/software/packages/hmmer/hmmer-3.3.2/usr/bin/hmmsearch {source_path} /db/rp/rp-seqs-75.fasta.gz > {destination_path} 2>&1"
        
#     print(f"hmmsearch completed for {filename} - Results saved to {destination_path}")

In [12]:
# Copy BLAST results to the local folder
for filename in file_list:
    filename = filename.strip()
    source_path = f'{blast_hmmbuild_dir}/{filename}'
    dest_filename = f'{os.path.splitext(filename)[0]}'
    
    # Copy files
    !scp {name}@{server}:~/hmm/blast/hmmsearch/hmmsearch_blast_{dest_filename}.txt hmm_dir_blast        

hmmsearch_blast_A1L1Q4_1-71.txt               100%  130MB   6.4MB/s   00:20    
hmmsearch_blast_A4L7I2_1658-1856.txt          100%   43KB 571.8KB/s   00:00    
hmmsearch_blast_A8AZZ3_116-134.txt            100%   24KB 640.9KB/s   00:00    
hmmsearch_blast_A8AZZ3_146-195.txt            100%   13KB 220.4KB/s   00:00    
hmmsearch_blast_A8AZZ3_24-44.txt              100%   75KB 935.2KB/s   00:00    
hmmsearch_blast_J8TM36_236-249.txt            100% 1803    28.8KB/s   00:00    
hmmsearch_blast_O00308_662-702.txt            100% 7658KB   5.1MB/s   00:01    
hmmsearch_blast_O00585_71-111.txt             100% 3835KB   5.7MB/s   00:00    
hmmsearch_blast_O14727_1-104.txt              100% 2407KB   5.8MB/s   00:00    
hmmsearch_blast_O14958_371-399.txt            100% 2859    54.2KB/s   00:00    
hmmsearch_blast_O15922_1-89.txt               100%   86KB   1.7MB/s   00:00    
hmmsearch_blast_O15922_230-240.txt            100% 1805    18.1KB/s   00:00    
hmmsearch_blast_O15922_315-324.txt      

In [14]:
# Construct the source and the target directories for ClustalOmega
clustal_hmmbuild_dir = f"/home/{name}/hmm/clustal/hmmbuild"
clustal_hmmsearch_dir = f"/home/{name}/hmm/clustal/hmmsearch"

# List files in the source directory on the remote machine
ls_command = f'ssh {name}@{server} "ls {clustal_hmmbuild_dir}"'
file_list = !{ls_command}

# # Iterate over each filename
# for filename in file_list:
#     filename = filename.strip()
#     source_path = f'{clustal_hmmbuild_dir}/{filename}'
#     dest_filename = f'{os.path.splitext(filename)[0]}'
#     destination_path = f'{clustal_hmmsearch_dir}/hmmsearch_clustal_{dest_filename}.txt'
    
#     # Run hmmsearch command using ssh without redirection
#     !ssh {name}@{server} "/software/packages/hmmer/hmmer-3.3.2/usr/bin/hmmsearch {source_path} /db/rp/rp-seqs-75.fasta.gz > {destination_path} 2>&1"
        
#     print(f"hmmsearch completed for {filename} - Results saved to {destination_path}")

In [15]:
# Copy ClustalOmega results to the local folder
for filename in file_list:
    filename = filename.strip()
    source_path = f'{clustal_hmmbuild_dir}/{filename}'
    dest_filename = f'{os.path.splitext(filename)[0]}'
    
    # Copy files
    !scp {name}@{server}:~/hmm/clustal/hmmsearch/hmmsearch_clustal_{dest_filename}.txt hmm_dir_clustal

hmmsearch_clustal_A1L1Q4_1-71.txt             100%  134MB   6.1MB/s   00:22    


### 2.1 hmmsearch results preprocessing
Then we preprocess the data obtained from hmmsearch into the dataframes.

In [17]:
# Set the paths to hmmsearch results for BLAST and ClustalOmega
hmmsearch_dir_blast = f'{directory}/results/hmms/hmmsearch/blast'
hmmsearch_dir_clustal = f'{directory}/results/hmms/hmmsearch/clustal'

In [22]:
# BLAST hmmsearch dataframe
df_blast = []

for filename in os.listdir(hmmsearch_dir_blast):
    if filename.endswith('.txt'):
        file_path = os.path.join(hmmsearch_dir_blast, filename)
        hmmsearch_file = process_hmmsearch_file(file_path)
#         df = pd.read_csv(hmmsearch_file)
        df_blast.append(hmmsearch_file)
        
hmmsearch_blast_df = pd.concat(df_blast, ignore_index=True)
# hmmsearch_blast_df.to_csv('results/pfam/hmmsearch_blast_df.csv', index=False)
print(len(hmmsearch_blast_df))
hmmsearch_blast_df.head()

ValueError: '  ------ inclusion threshold ------\n' is not in list

In [820]:
# 1. Make a dataframe with the statistics - the first table of the hmmsearch output
stats_rp_75 = process_hmmsearch_file(f"{directory}/results/hmms/hmmsearch/hmmsearch_rp_75_{id_dis}_{i}.txt")
# stats_rp_75.head()

In [821]:
# 2. Create a dataframe with the extracted regions from HMM - from the rest of the file
hmm_rp_75 = f"{directory}/results/hmms/hmmsearch/hmmsearch_rp_75_{id_dis}_{i}.txt"
hmmsearch_regions_rp_75 = extract_table_from_output(hmm_rp_75)
# hmmsearch_regions_rp_75.head()

In [822]:
# 3. Combine the results into a dataframe and save the file
hmmsearch_results_rp_75 = pd.merge(stats_rp_75, hmmsearch_regions_rp_75, left_on="Sequence", right_on="id", how="inner")
hmmsearch_results_rp_75 = hmmsearch_results_rp_75.drop(columns=["Description", "id"])
hmmsearch_results_rp_75.to_csv(f"{directory}/results/hmms/hmmsearch/stats/stats_rp_75_{id_dis}_{i}.csv", index=False)
print(f"Number of hmmsearch hits for the {i} disordered region of the {id_dis} protein (RP 75%):", len(hmmsearch_results_rp_75))
%store hmmsearch_results_rp_75
hmmsearch_results_rp_75[:20]

Number of hmmsearch hits for the 1 disordered region of the Q86FP8 protein (RP 75%): 1339
Stored 'hmmsearch_results_rp_75' (DataFrame)


Unnamed: 0,E-value,score,bias,E-value.1,score.1,bias.1,exp,N,Sequence,ali_from,ali_to,ali_length
0,7e-33,125.0,0.2,1.6e-13,63.0,0.0,2.5,2,A0A5J9WTQ4,286,340,55
1,2.7e-20,84.6,0.0,5.3e-20,83.7,0.0,1.5,1,A0A0D3D564,283,339,57
2,5.8e-20,83.6,0.0,1.2e-19,82.6,0.0,1.5,1,A0A397YI80,277,333,57
3,5.8e-20,83.6,0.0,1.2e-19,82.6,0.0,1.5,1,M4DEU0,277,333,57
4,7.7e-20,83.2,0.0,1.7e-19,82.1,0.0,1.5,1,M4CCJ6,283,339,57
5,7.7e-20,83.2,0.0,1.7e-19,82.1,0.0,1.5,1,A0A398A8X2,283,339,57
6,7.8e-20,83.2,0.0,1.7e-19,82.1,0.0,1.5,1,A0A0D3BCZ4,284,340,57
7,2e-19,81.9,0.0,4.4e-19,80.8,0.0,1.6,1,A0A6J0NM38,277,333,57
8,2e-19,81.8,0.0,4.5e-19,80.7,0.0,1.6,1,A9TSC0,310,360,51
9,2.9e-19,81.3,0.1,6.5e-19,80.2,0.1,1.6,1,A0A087HC55,290,345,56


In [743]:
subject_ids_rp_75 = hmmsearch_results_rp_75.iloc[:, 8].unique()
print(f"The number of unique IDs for the {id_dis} MSA:", len(subject_ids_rp_75))

The number of unique IDs for the Q86FP8 MSA: 1339


## 3. Pfam
### 3.1 Pfam results preprocessing

In [744]:
# # Copy the files with the statistics to the remote computer
# !scp {directory}/results/hmms/hmmsearch/stats/stats_rp_75_{id_dis}_{i}.csv {name}@{server}:~/stats/stats_rp_75/stats_rp_75_{id_dis}_{i}.csv

In [746]:
# # Check the overlaps with Interpro domains (RP 75%)
# !ssh {name}@{server} "/home/alina/protein2ipr.py /home/alina/stats/stats_rp_75 /home/alina/filtered.tsv.gz protein2ipr_rp_75.tsv"

In [747]:
# # Copy the files to the local folder
# !scp {name}@{server}:~/protein2ipr_rp_75.tsv {directory}/results/pfam

In [823]:
# Filter only entries with Pfam ID and intercepting regions with the curated_disprot instances (RP 75%)
filename = f'{directory}/results/pfam/protein2ipr_rp_75.tsv'
pfam_rp_75 = pfam_processing(filename)

# pfam_rp_75 = pfam_rp_75[pfam_rp_75['uniprot_id'].isin(subject_ids_rp_75)]
# print(f"The number of retrieved Pfam instances for the {id_dis}_{i} protein (RP 75%): {len(pfam_rp_75)}")
# # print("The number of Uniprot instances not covered by Pfam (RP 75%):", len(hmmsearch_results_rp_75) - len(pfam_rp_75))
# pfam_rp_75.head()

In [825]:
pfam_rp_75[:20]

Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam
0,A0A010QN55,PF01380,IPR001347,381,508,128
1,A0A010QN55,PF01380,IPR001347,553,682,130
2,A0A016UWB3,PF01380,IPR001347,173,300,128
3,A0A016UWB3,PF01380,IPR001347,343,432,90
4,A0A016UWX5,PF01380,IPR001347,202,329,128
5,A0A016UWX5,PF01380,IPR001347,372,461,90
6,A0A017S8W7,PF01380,IPR001347,375,502,128
7,A0A017S8W7,PF01380,IPR001347,548,675,128
8,A0A022QGG4,PF01380,IPR001347,365,493,129
9,A0A022QGG4,PF01380,IPR001347,536,667,132


In [751]:
# Merge with hmmsearch result - for one MSA (RP 75%)
pfam_rp_75 = pd.merge(pfam_rp_75, hmmsearch_results_rp_75[['Sequence', 'ali_from', 'ali_to', 'ali_length']], left_on='uniprot_id', right_on='Sequence', how='left')
pfam_rp_75 = pfam_rp_75.dropna(axis=0)
pfam_rp_75['ali_from'] = pfam_rp_75['ali_from'].astype(int)
pfam_rp_75['ali_to'] = pfam_rp_75['ali_to'].astype(int)
pfam_rp_75['ali_length'] = pfam_rp_75['ali_length'].astype(int)
pfam_rp_75 = pfam_rp_75.drop(columns='Sequence')
# pfam_rp_75.to_csv(f'results/pfam/pfam_overlap/pfam_overlap_{id_dis}.csv', index=False)
# print(f"The number of overlaps for {id_dis}_{i} (RP 75%):", len(pfam_rp_75))
pfam_rp_75.head()

Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,ali_from,ali_to,ali_length
0,A0A010QN55,PF01380,IPR001347,381,508,128,330,362,33
1,A0A010QN55,PF01380,IPR001347,553,682,130,330,362,33
2,A0A016UWB3,PF01380,IPR001347,173,300,128,104,155,52
3,A0A016UWB3,PF01380,IPR001347,343,432,90,104,155,52
4,A0A016UWX5,PF01380,IPR001347,202,329,128,133,184,52


In [752]:
# Calculate the overlaps percentage
def pfam_hmm_overlap(row_pfam):
    start_pfam = row_pfam['start_pfam']
    end_pfam = row_pfam['end_pfam']
    start_hmm = row_pfam['ali_from']
    end_hmm = row_pfam['ali_to']
    len_pfam = row_pfam['length_pfam']
    len_hmm = row_pfam['ali_length']

    overlap_len = min(end_pfam, end_hmm) - max(start_pfam, start_hmm) + 1
    overlap_pfam = overlap_len/len_pfam * 100
    overlap_hmm = overlap_len/len_hmm * 100
    max_length = len_pfam + len_hmm - overlap_len
    non_overlap_len = max_length - overlap_len + 1

    if overlap_len > 0:
        overlap_perc = (overlap_len / max_length) * 100
#         overlap_perc = 2 * overlap_len / (len_hmm + len_pfam)
    else:
        overlap_len = 0
        overlap_pfam = 0
        overlap_hmm = 0
        overlap_perc = 0

    return overlap_len, overlap_pfam, overlap_hmm, non_overlap_len, overlap_perc

In [754]:
# Add the overlaps to the dataframe (RP 75%)
# overlap_percentages = []
overlap_pfam_hmm = []

# for index_pfam, row_pfam in pfam_rp_75.iterrows():
#     row_disprot = curated_query.iloc[i-1]
#     overl_perc_pfam, overl_perc_hmm, overl_len_pfam, overl_len_hmm = calculate_overlap(row_pfam, row_disprot)
#     overl_perc_pfam = round(overl_perc_pfam, 2)
#     overl_perc_hmm = round(overl_perc_hmm, 2)
#     overlap_percentages.append((overl_perc_pfam, overl_perc_hmm, overl_len_pfam, overl_len_hmm))

for index_pfam, row_pfam in pfam_rp_75.iterrows():
    overl_len, overl_pfam, overl_hmm, non_overl_len, overl_perc = pfam_hmm_overlap(row_pfam)
    overl_pfam = round(overl_pfam, 2)
    overl_hmm = round(overl_hmm, 2)
    overl_perc = round(overl_perc, 2)
    overlap_pfam_hmm.append((overl_len, overl_pfam, overl_hmm, non_overl_len, overl_perc))

# Extract overlap_pfam and overlap_hmm from the list of tuples
# pfam_rp_75['overlap_pfam_%'], pfam_rp_75['overlap_hmm_%'], pfam_rp_75['overlap_pfam_len'], pfam_rp_75['overlap_hmm_len'] = zip(*overlap_percentages)
pfam_rp_75['overl_len'], pfam_rp_75['overl_pfam'], pfam_rp_75['overl_hmm'], pfam_rp_75['non_overl_len'], pfam_rp_75['overl_perc'] = zip(*overlap_pfam_hmm)
# print("DisProt start:", curated_query['start'].iloc[i-1], "\n"
#       "DisProt end:", curated_query['end'].iloc[i-1], "\n"
#       "DisProt length:", curated_query['length'].iloc[i-1])
pfam_rp_75.to_csv(f'results/pfam/pfam_overlap/rp_75/pfam_rp_75_{id_dis}_{i}.csv', index=False)
pfam_rp_75.head()

Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,ali_from,ali_to,ali_length,overl_len,overl_pfam,overl_hmm,non_overl_len,overl_perc
0,A0A010QN55,PF01380,IPR001347,381,508,128,330,362,33,0,0.0,0.0,198,0.0
1,A0A010QN55,PF01380,IPR001347,553,682,130,330,362,33,0,0.0,0.0,544,0.0
2,A0A016UWB3,PF01380,IPR001347,173,300,128,104,155,52,0,0.0,0.0,215,0.0
3,A0A016UWB3,PF01380,IPR001347,343,432,90,104,155,52,0,0.0,0.0,517,0.0
4,A0A016UWX5,PF01380,IPR001347,202,329,128,133,184,52,0,0.0,0.0,215,0.0


In [756]:
pfam_rp_75[pfam_rp_75['overl_len'] != 0]

Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,ali_from,ali_to,ali_length,overl_len,overl_pfam,overl_hmm,non_overl_len,overl_perc
107,A0A0C4EII3,PF01380,IPR001347,367,472,106,326,367,42,1,0.94,2.38,147,0.68
794,A0A2T0WH78,PF00725,IPR006108,479,570,92,532,580,49,39,42.39,79.59,64,38.24
1573,A0A6P4D783,PF01380,IPR001347,181,309,129,305,350,46,5,3.88,10.87,166,2.94
1917,A0A8C4HUS4,PF01380,IPR001347,352,479,128,322,362,41,11,8.59,26.83,148,6.96
1921,A0A8C4HYL4,PF01380,IPR001347,301,428,128,328,368,41,41,32.03,100.0,88,32.03


### 3.2 Pfam results preprocessing (overall)

In [760]:
# Combine all pfam results (RP 75%)
pfam_path = f'{directory}/results/pfam/pfam_overlap/rp_75'
dataframes = []

for filename in os.listdir(pfam_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(pfam_path, filename)
        df = pd.read_csv(file_path)
        dataframes.append(df)
        
pfam_hmm_rp_75_1 = pd.concat(dataframes, ignore_index=True)
pfam_hmm_rp_75_1.to_csv('results/pfam/pfam_hmm_rp_75_1.csv', index=False)
print(len(pfam_hmm_rp_75_1))
pfam_hmm_rp_75_1.head()

23010


Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,ali_from,ali_to,ali_length,overl_len,overl_pfam,overl_hmm,non_overl_len,overl_perc
0,A0A060VYC8,PF07686,IPR013106,21,101,81,15,110,96,81,100.0,84.38,16,84.38
1,A0A060VYC8,PF08205,IPR013162,119,197,79,15,110,96,0,0.0,0.0,192,0.0
2,A0A060W1A4,PF07686,IPR013106,23,99,77,18,110,93,77,100.0,82.8,17,82.8
3,A0A060W1A4,PF08205,IPR013162,118,195,78,18,110,93,0,0.0,0.0,186,0.0
4,A0A060WNT4,PF07686,IPR013106,27,108,82,21,116,96,82,100.0,85.42,15,85.42
