In [84]:
# Set the path and input parameters
import os
directory = os.getcwd() # the main directory of the project

# The credentials for the remote cluster
name = 'alina'
server = 'ecate'

In [85]:
# Importing the libraries
from functions import *

## HMM: Data preparation
In this part we generate HMM models based on MSA and retrieve the results from hmmsearch and then from Pfam. We start with the loading of the `disordered` dataframe and analyse how many proteins and disordered regions we have.

In [86]:
# Set the maximum width of the columns
pd.set_option('display.max_colwidth', 20)

In [87]:
# Load the disordered and curated_disprot dataframes
disordered_df = pd.read_csv('{}/disordered_df.csv'.format(directory))
curated_disprot_df = pd.read_csv('{}/curated_disprot.csv'.format(directory))

print('The number of rows with the disordered regions: {}'.format(len(disordered_df)))

The number of rows with the disordered regions: 7393


In [88]:
# Collect the disordered regions in a dictionary
dis_regs = set()

for i, row in disordered_df.iterrows():
    dis_id = row[0]
    matching_row = curated_disprot_df[curated_disprot_df['acc'] == dis_id]
    if not matching_row.empty:
        region = matching_row['region']
        dis_regs.update(region)
        dis_regs_list = list(dis_regs) # convert set to a list
        
# Define an array of disordered regions ids
disprot_ids = disordered_df['query_id'].unique()

print('The number of proteins with disordered regions: {}'.format(len(disprot_ids)))
print('The number of disordered regions in the proteins: {}'.format(len(dis_regs_list)))

The number of proteins with disordered regions: 39
The number of disordered regions in the proteins: 53


The results above show how many proteins with the disordered regions are in the provided data, as well as how many disordered regions do they contain.

## 1. hmmbuild
We build an HMM of each disordered region, using trimmed MSA as an input. We will build the models separately for BLAST and ClustalOmega to be able to compare the quality of the models based on the local and the global alignment.

In [89]:
# Set the paths to MSA (input) and HMM (output) files - BLAST
msa_dir_blast = '{}/results/alignments/output_files/disordered/blast'.format(directory)
hmmbuild_dir_blast = '{}/results/hmms/hmmbuild/blast'.format(directory)

# Set the paths to  MSA (input) and HMM (output) files - ClustalOmega
msa_dir_clustal = '{}/results/alignments/output_files/disordered/clustal'.format(directory)
hmmbuild_dir_clustal = '{}/results/hmms/hmmbuild/clustal'.format(directory)

In [91]:
# Build HMM for BLAST
for filename in os.listdir(msa_dir_blast):
    if filename.endswith('.fasta'):
        input_file = os.path.join(msa_dir_blast, filename)
        output_file = os.path.join(hmmbuild_dir_blast, os.path.splitext(filename)[0] + '.hmm')

        subprocess.run(['hmmbuild', output_file, input_file])
        print('hmmbuild completed for {}'.format(filename))

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/results/alignments/output_files/disordered/blast/S7W634_1-32.fasta
# output HMM file:                  /Users/alina/HMM/results/hmms/hmmbuild/blast/S7W634_1-32.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     S7W634_1-32            200    32    32    75.11  1.689 

# CPU time: 0.03u 0.00s 00:00:00.03 Elapsed: 00:00:00.03
hmmbuild completed for S7W634_1-32.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); h

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/results/alignments/output_files/disordered/blast/Q9H832_1-99.fasta
# output HMM file:                  /Users/alina/HMM/results/hmms/hmmbuild/blast/Q9H832_1-99.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     Q9H832_1-99            200    99    99     2.96  0.590 

# CPU time: 0.04u 0.00s 00:00:00.04 Elapsed: 00:00:00.04
hmmbuild completed for Q9H832_1-99.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); h

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/results/alignments/output_files/disordered/blast/Q8K4J6_155-186.fasta
# output HMM file:                  /Users/alina/HMM/results/hmms/hmmbuild/blast/Q8K4J6_155-186.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     Q8K4J6_155-186         200    32    32     3.93  1.689 

# CPU time: 0.01u 0.00s 00:00:00.01 Elapsed: 00:00:00.01
hmmbuild completed for Q8K4J6_155-186.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/results/alignments/output_files/disordered/blast/O43474_1-130.fasta
# output HMM file:                  /Users/alina/HMM/results/hmms/hmmbuild/blast/O43474_1-130.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     O43474_1-130           200   130   129     1.11  0.591 

# CPU time: 0.05u 0.00s 00:00:00.05 Elapsed: 00:00:00.05
hmmbuild completed for O43474_1-130.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020)

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/results/alignments/output_files/disordered/blast/Q8IW19_450-511.fasta
# output HMM file:                  /Users/alina/HMM/results/hmms/hmmbuild/blast/Q8IW19_450-511.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     Q8IW19_450-511         200    62    62     1.25  0.902 

# CPU time: 0.02u 0.00s 00:00:00.02 Elapsed: 00:00:00.02
hmmbuild completed for Q8IW19_450-511.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov

In [93]:
# Build HMM for ClustalOmega
for filename in os.listdir(msa_dir_clustal):
    if filename.endswith('.fasta'):
        input_file = os.path.join(msa_dir_clustal, filename)
        output_file = os.path.join(hmmbuild_dir_clustal, os.path.splitext(filename)[0] + '.hmm')

        subprocess.run(['hmmbuild', output_file, input_file])
        print('hmmbuild completed for {}'.format(filename))

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/results/alignments/output_files/disordered/clustal/S7W634_1-32.fasta
# output HMM file:                  /Users/alina/HMM/results/hmms/hmmbuild/clustal/S7W634_1-32.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     S7W634_1-32            196    32    31   196.00  1.341 

# CPU time: 0.02u 0.00s 00:00:00.02 Elapsed: 00:00:00.02
hmmbuild completed for S7W634_1-32.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/results/alignments/output_files/disordered/clustal/Q9H832_1-99.fasta
# output HMM file:                  /Users/alina/HMM/results/hmms/hmmbuild/clustal/Q9H832_1-99.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     Q9H832_1-99            190    99    92     7.31  0.620 

# CPU time: 0.04u 0.00s 00:00:00.04 Elapsed: 00:00:00.04
hmmbuild completed for Q9H832_1-99.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/results/alignments/output_files/disordered/clustal/Q8K4J6_155-186.fasta
# output HMM file:                  /Users/alina/HMM/results/hmms/hmmbuild/clustal/Q8K4J6_155-186.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     Q8K4J6_155-186         194    32    32     1.63  1.688 

# CPU time: 0.01u 0.00s 00:00:00.01 Elapsed: 00:00:00.01
hmmbuild completed for Q8K4J6_155-186.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/results/alignments/output_files/disordered/clustal/O43474_1-130.fasta
# output HMM file:                  /Users/alina/HMM/results/hmms/hmmbuild/clustal/O43474_1-130.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     O43474_1-130           188   130   129     0.59  0.588 

# CPU time: 0.05u 0.00s 00:00:00.05 Elapsed: 00:00:00.05
hmmbuild completed for O43474_1-130.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/results/alignments/output_files/disordered/clustal/Q8IW19_450-511.fasta
# output HMM file:                  /Users/alina/HMM/results/hmms/hmmbuild/clustal/Q8IW19_450-511.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     Q8IW19_450-511         190    62    62     6.91  0.902 

# CPU time: 0.03u 0.00s 00:00:00.03 Elapsed: 00:00:00.02
hmmbuild completed for Q8IW19_450-511.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 

## 2. hmmsearch

After building the models, our objective is to assess if overlaps with the profiles in the sequence database (Reference Proteome 75%) exist. We retrive the data containing the most significant sequences, with a default E-value threshold of 0.01.

The same as before, we do it for BLAST and ClustalOmega separately.

In [96]:
# # Copy the HMMs from the BLAST folder
# for filename in os.listdir(hmmbuild_dir_blast):
#     source_path = os.path.join(hmmbuild_dir_blast, filename)
#     destination_path = '{}@{}:~/hmm/blast/hmmbuild/{}'.format(name, server, filename)
#     subprocess.run(['scp', source_path, destination_path])
#     print('File {} copied to {}'.format(filename, destination_path))

In [97]:
# # Copy the HMMs from the ClustalOmega folder
# for filename in os.listdir(hmmbuild_dir_clustal):
#     source_path = os.path.join(hmmbuild_dir_clustal, filename)
#     destination_path = '{}@{}:~/hmm/clustal/hmmbuild/{}'.format(name, server, filename)
#     subprocess.run(['scp', source_path, destination_path])
#     print('File {} copied to {}'.format(filename, destination_path))

After copying the files, we run `hmmsearch` command. The following code should be uncommented if there are new files provided. However, it requires some time to retrieve the output.

In [98]:
# Construct the source and the target directories for BLAST - remote computer
blast_in_dir_rem = '/home/{}/hmm/blast/hmmbuild'.format(name)
blast_out_dir_rem = '/home/{}/hmm/blast/hmmsearch'.format(name)

# List files in the source directory on the remote machine
ls_command = 'ssh {}@{} "ls {}"'.format(name, server, blast_in_dir_rem)
file_list = get_ipython().getoutput(ls_command)

# # Iterate over each filename - RUNS FOR A LONG TIME
# for filename in file_list:
#     filename = filename.strip()
#     source_path = '{}/{}'.format(blast_in_dir_rem, filename)
#     dest_filename = os.path.splitext(filename)[0]
#     destination_path = '{}/hmmsearch_blast_{}.txt'.format(blast_out_dir_rem, dest_filename)
    
#     # Run hmmsearch command using ssh without redirection
# #     !ssh {name}@{server} "/software/packages/hmmer/hmmer-3.3.2/usr/bin/hmmsearch {source_path} /db/rp/rp-seqs-75.fasta.gz > {destination_path} 2>&1"
#     command = 'ssh {}@{} "/software/packages/hmmer/hmmer-3.3.2/usr/bin/hmmsearch {} /db/rp/rp-seqs-75.fasta.gz > {} 2>&1"'.format(name, server, source_path, destination_path)
#     !{command}   
        
#     print('hmmsearch completed for {} - Results saved to {}'.format(filename, destination_path))

In [99]:
# # Copy BLAST results to the local folder
hmmsearch_dir_blast = '{}/results/hmms/hmmsearch/blast'.format(directory)

# for filename in file_list:
#     filename = filename.strip()
#     source_path = '{}/{}'.format(blast_in_dir_rem, filename)
#     dest_filename = os.path.splitext(filename)[0]
    
#     # Copy files
#     !scp {name}@{server}:~/hmm/blast/hmmsearch/hmmsearch_blast_{dest_filename}.txt {hmmsearch_dir_blast}

In [100]:
# Construct the source and the target directories for ClustalOmega
clustal_in_dir_rem = '/home/{}/hmm/clustal/hmmbuild'.format(name)
clustal_out_dir_rem = '/home/{}/hmm/clustal/hmmsearch'.format(name)

# List files in the source directory on the remote machine
ls_command = 'ssh {}@{} "ls {}"'.format(name, server, clustal_in_dir_rem)
file_list = get_ipython().getoutput(ls_command)

# # Iterate over each filename
# for filename in file_list:
#     filename = filename.strip()
#     source_path = '{}/{}'.format(clustal_in_dir_rem, filename)
#     dest_filename = os.path.splitext(filename)[0]
#     destination_path = '{}/hmmsearch_clustal_{}.txt'.format(clustal_out_dir_rem, dest_filename)
    
#     # Run hmmsearch command using ssh without redirection
# #     !ssh {name}@{server} "/software/packages/hmmer/hmmer-3.3.2/usr/bin/hmmsearch {source_path} /db/rp/rp-seqs-75.fasta.gz > {destination_path} 2>&1"
#     command = 'ssh {}@{} "/software/packages/hmmer/hmmer-3.3.2/usr/bin/hmmsearch {} /db/rp/rp-seqs-75.fasta.gz > {} 2>&1"'.format(name, server, source_path, destination_path)
#     !{command}     
#     print('hmmsearch completed for {} - Results saved to {}'.format(filename, destination_path))

In [101]:
# # Copy ClustalOmega results to the local folder
hmmsearch_dir_clustal = '{}/results/hmms/hmmsearch/clustal'.format(directory)

# for filename in file_list:
#     filename = filename.strip()
#     source_path = '{}/{}'.format(clustal_in_dir_rem, filename)
#     dest_filename = os.path.splitext(filename)[0]
    
#     # Copy files
#     !scp {name}@{server}:~/hmm/clustal/hmmsearch/hmmsearch_clustal_{dest_filename}.txt {hmmsearch_dir_clustal}

In [102]:
# Preprocess hmmsearch results - BLAST
hmmsearch_dir_blast = '{}/results/hmms/hmmsearch/blast'.format(directory)
hmmsearch_dir_blast_prepr_output = '{}/results/hmms/hmmsearch/blast/hmmsearch_df'.format(directory)

blast_results_list = []

for filename in os.listdir(hmmsearch_dir_blast):
    if filename.endswith('.txt'):
        file_path = os.path.join(hmmsearch_dir_blast, filename)
        parts = filename.split('_')
        protein_id = '{}_{}'.format(parts[2], parts[3].split(".")[0])
        try:
            blast_stat = process_hmmsearch_file(file_path) 
            blast_reg = extract_table_from_output(file_path)
            blast_result = pd.merge(blast_stat, blast_reg, left_on='Sequence', right_on='id', how='inner')
            blast_result = blast_result.drop(columns=['Description', 'id'])
            
            # Save individual DataFrames to a list
            blast_results_list.append(blast_result)
        
            # Save the DataFrame to a CSV file
            output_file = os.path.join(hmmsearch_dir_blast_prepr_output, 'hmmsearch_blast_df_{}.csv'.format(protein_id))
            blast_result.to_csv(output_file, index=False)
            
        except ValueError as e:
            print('Error processing file {}: {}'.format(filename, e))
            
# Merge all DataFrames into one
hmmsearch_results_blast = pd.concat(blast_results_list, ignore_index=True)

# Save the merged DataFrame to a CSV file
merged_output_file = os.path.join('{}/results/hmms/hmmsearch/'.format(directory), 'hmmsearch_results_blast.csv')
hmmsearch_results_blast.to_csv(merged_output_file, index=False)
print('The number of retrieved hmmsearch results for BLAST: {}'.format(len(hmmsearch_results_blast)))
hmmsearch_results_blast.head()

Error processing file hmmsearch_blast_O15922_230-240.txt: '  ------ inclusion threshold ------\n' is not in list
Error processing file hmmsearch_blast_O15922_315-324.txt: '  ------ inclusion threshold ------\n' is not in list
Error processing file hmmsearch_blast_J8TM36_236-249.txt: '  ------ inclusion threshold ------\n' is not in list
Error processing file hmmsearch_blast_Q5T4W7_108-120.txt: '  ------ inclusion threshold ------\n' is not in list
The number of retrieved hmmsearch results for BLAST: 186445


Unnamed: 0,E-value,score,bias,E-value.1,score.1,bias.1,exp,N,Sequence,hmm_from,hmm_to,hmm_length,ali_from,ali_to,ali_length,env_from,env_to,env_length
0,0,5406.7,5695.0,3.6e-05,36.0,9.0,378.4,365,A0A7M7NU54,1,35,35,246,282,37,246,283,38
1,0,2473.7,2447.2,3.9e-05,35.9,5.8,148.9,146,A0A7I8W901,1,34,34,273,306,34,273,308,36
2,0,2023.3,1971.1,8.5e-06,38.0,8.9,138.2,128,A0A8B7ZVU6,1,32,32,454,485,32,454,487,34
3,0,2013.1,1940.1,8.4e-06,38.0,8.9,136.2,127,A0A8B7ZRF1,1,32,32,382,413,32,382,415,34
4,0,1740.0,1752.7,7.7e-06,38.1,8.9,122.2,112,A0A8B7ZPY7,1,32,32,454,485,32,454,487,34


In [103]:
# Preprocess hmmsearch results - ClustalOmega
hmmsearch_dir_clustal = '{}/results/hmms/hmmsearch/clustal'.format(directory)
hmmsearch_dir_clustal_prepr_output = '{}/results/hmms/hmmsearch/clustal/hmmsearch_df'.format(directory)

clustal_results_list = []

for filename in os.listdir(hmmsearch_dir_clustal):
    if filename.endswith('.txt'):
        file_path = os.path.join(hmmsearch_dir_clustal, filename)
        parts = filename.split('_')
        protein_id = '{}_{}'.format(parts[2], parts[3].split(".")[0])
        try:
            clustal_stat = process_hmmsearch_file(file_path) 
            clustal_reg = extract_table_from_output(file_path)
            clustal_result = pd.merge(clustal_stat, clustal_reg, left_on='Sequence', right_on='id', how='inner')
            clustal_result = clustal_result.drop(columns=['Description', 'id'])
            
            # Save individual DataFrames to a list
            clustal_results_list.append(clustal_result)
        
            # Save the DataFrame to a CSV file
            output_file = os.path.join(hmmsearch_dir_clustal_prepr_output, 'hmmsearch_clustal_df_{}.csv'.format(protein_id))
            clustal_result.to_csv(output_file, index=False)
            
        except ValueError as e:
            print('Error processing file {}: {}'.format(filename, e))
            
# Merge all DataFrames into one
hmmsearch_results_clustal = pd.concat(clustal_results_list, ignore_index=True)

# Save the merged DataFrame to a CSV file
merged_output_file = os.path.join('{}/results/hmms/hmmsearch/'.format(directory), 'hmmsearch_results_clustal.csv')
hmmsearch_results_clustal.to_csv(merged_output_file, index=False)
print('The number of retrieved hmmsearch results for ClustalOmega: {}'.format(len(hmmsearch_results_clustal)))
hmmsearch_results_clustal.head()

Error processing file hmmsearch_clustal_J8TM36_236-249.txt: '  ------ inclusion threshold ------\n' is not in list
Error processing file hmmsearch_clustal_O15922_315-324.txt: '  ------ inclusion threshold ------\n' is not in list
Error processing file hmmsearch_clustal_Q5T4W7_108-120.txt: '  ------ inclusion threshold ------\n' is not in list
The number of retrieved hmmsearch results for ClustalOmega: 206952


Unnamed: 0,E-value,score,bias,E-value.1,score.1,bias.1,exp,N,Sequence,hmm_from,hmm_to,hmm_length,ali_from,ali_to,ali_length,env_from,env_to,env_length
0,7.1e-32,121.2,13.5,8.1e-32,121.0,13.5,1.0,1,A0A8C7BFT3,1,61,61,9,69,61,9,69,61
1,1.2000000000000001e-30,117.3,13.5,5.9000000000000005e-30,115.1,13.5,2.3,1,A0A2U3ZCX9,1,61,61,451,511,61,451,511,61
2,1.2000000000000001e-30,117.2,13.5,5.9000000000000005e-30,115.0,13.5,2.3,1,A0A2U3VQT6,1,61,61,454,514,61,454,514,61
3,1.3000000000000001e-30,117.2,13.5,5.9000000000000005e-30,115.1,13.5,2.2,1,A0A2Y9JGP3,1,61,61,450,510,61,450,510,61
4,1.3000000000000001e-30,117.1,13.5,5.9000000000000005e-30,115.1,13.5,2.2,1,A0A7N5P5S9,1,61,61,451,511,61,451,511,61


In terms of region we are interested in the 3 fields:
- `hmm_from`-`hmm_to`: the endpoints of the hmm profile. In our case the beginning is usually 1, since we parse not the initial alignment as an input, but the separate disordered regions where the position mostly starts from 1 (in case there are no gaps in the subject sequences).
- `ali_from`-`ali_to`: the endpoints of the target sequence. They are obtained by matching HMM to the sequence, thus obtaining the positions.
- `env_from`-`env_to`: the envelope of the domain's location. It's usually a bit wider than the alignment.

## 3. Pfam

In [104]:
# # Copy the files with the hmmsearch statistics to the remote computer
# !scp {directory}/results/hmms/hmmsearch/hmmsearch_results_blast.csv {name}@{server}:~/stats/hmmsearch_results_blast.csv
# !scp {directory}/results/hmms/hmmsearch/hmmsearch_results_clustal.csv {name}@{server}:~/stats/hmmsearch_results_clustal.csv

The file `filtered.tsv.gz` contains only the domains from Interpro. We check the overlap of the found results from `hmmsearch`.

In [105]:
# # Check the overlaps with Interpro domains (BLAST)
# !ssh {name}@{server} '/home/alina/protein2ipr.py /home/alina/stats/blast /home/alina/filtered.tsv.gz protein2ipr_blast.tsv'

In [106]:
# # # Copy the files to the local folder
# !scp {name}@{server}:~/protein2ipr_blast.tsv {directory}/results/pfam

In [107]:
# # Check the overlaps with Interpro domains (ClustalOmega)
# !ssh {name}@{server} '/home/alina/protein2ipr.py /home/alina/stats/clustal /home/alina/filtered.tsv.gz protein2ipr_clustal.tsv'

In [108]:
# # Copy the files to the local folder
# !scp {name}@{server}:~/protein2ipr_clustal.tsv {directory}/results/pfam

## 4. Results preprocessing

In [109]:
# Filter only entries with Pfam ID and intercepting regions with the curated_disprot instances - BLAST
filename_blast = '{}/results/pfam/protein2ipr_blast.tsv'.format(directory)
pfam_blast = pfam_processing(filename_blast)

print('The number of retrieved Pfam instances for BLAST: {}'.format(len(pfam_blast)))
pfam_blast.head()

The number of retrieved Pfam instances for BLAST: 927771


Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam
0,A0A010Q304,PF02775,IPR011766,499,646,148
1,A0A010Q304,PF00205,IPR012000,289,434,146
2,A0A010Q304,PF02776,IPR012001,91,205,115
3,A0A010Q7P7,PF00018,IPR001452,435,481,47
4,A0A010Q7P7,PF03114,IPR004148,113,226,114


In [110]:
# Filter only entries with Pfam ID and intercepting regions with the curated_disprot instances - ClustalOmega
filename_clustal = '{}/results/pfam/protein2ipr_clustal.tsv'.format(directory)
pfam_clustal = pfam_processing(filename_clustal)

print('The number of retrieved Pfam instances for ClustalOmega: {}'.format(len(pfam_clustal)))
pfam_clustal.head()

The number of retrieved Pfam instances for ClustalOmega: 1073417


Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam
0,A0A010Q304,PF02775,IPR011766,499,646,148
1,A0A010Q304,PF00205,IPR012000,289,434,146
2,A0A010Q304,PF02776,IPR012001,91,205,115
3,A0A010Q7P7,PF00018,IPR001452,435,481,47
4,A0A010Q7P7,PF03114,IPR004148,113,226,114


In [111]:
# Merge hmmsearch results with Pfam - BLAST
pfam_blast = pd.merge(pfam_blast, 
                      hmmsearch_results_blast[['Sequence', 
                                               'hmm_from', 'hmm_to', 'hmm_length',
                                               'ali_from', 'ali_to', 'ali_length',
                                               'env_from', 'env_to', 'env_length']], 
                      left_on='uniprot_id', right_on='Sequence', how='left')

pfam_blast = pfam_blast.dropna(axis=0)
pfam_blast = pfam_blast.drop(columns='Sequence')
print('The number of retrieved Pfam instances for BLAST: {}'.format(len(pfam_blast)))
pfam_blast.head()

The number of retrieved Pfam instances for BLAST: 949687


Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,hmm_from,hmm_to,hmm_length,ali_from,ali_to,ali_length,env_from,env_to,env_length
0,A0A010Q304,PF02775,IPR011766,499,646,148,1,39,39,650,688,39,650,689,40
1,A0A010Q304,PF02775,IPR011766,499,646,148,1,15,15,583,597,15,583,597,15
2,A0A010Q304,PF00205,IPR012000,289,434,146,1,39,39,650,688,39,650,689,40
3,A0A010Q304,PF00205,IPR012000,289,434,146,1,15,15,583,597,15,583,597,15
4,A0A010Q304,PF02776,IPR012001,91,205,115,1,39,39,650,688,39,650,689,40


In [112]:
# Merge hmmsearch results with Pfam - ClustalOmega
pfam_clustal = pd.merge(pfam_clustal, 
                        hmmsearch_results_clustal[['Sequence', 
                                                   'hmm_from', 'hmm_to', 'hmm_length',
                                                   'ali_from', 'ali_to', 'ali_length',
                                                   'env_from', 'env_to', 'env_length']], 
                        left_on='uniprot_id', right_on='Sequence', how='left')

pfam_clustal = pfam_clustal.dropna(axis=0)
pfam_clustal = pfam_clustal.drop(columns='Sequence')
print('The number of retrieved Pfam instances for ClustalOmega: {}'.format(len(pfam_clustal)))
pfam_clustal.head()

The number of retrieved Pfam instances for ClustalOmega: 1109890


Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,hmm_from,hmm_to,hmm_length,ali_from,ali_to,ali_length,env_from,env_to,env_length
0,A0A010Q304,PF02775,IPR011766,499,646,148,1,15,15,583,597,15,583,597,15
1,A0A010Q304,PF02775,IPR011766,499,646,148,1,39,39,650,688,39,650,689,40
2,A0A010Q304,PF00205,IPR012000,289,434,146,1,15,15,583,597,15,583,597,15
3,A0A010Q304,PF00205,IPR012000,289,434,146,1,39,39,650,688,39,650,689,40
4,A0A010Q304,PF02776,IPR012001,91,205,115,1,15,15,583,597,15,583,597,15


In [113]:
# Function to calculate overlaps between DisProt-HMM and Pfam-HMM
def pfam_hmm_overlap(row_pfam):
    # Calculate the overlap of hmm (ali in this case) and the Pfam
    overlap_len = min(row_pfam['end_pfam'], row_pfam['ali_to']) - max(row_pfam['start_pfam'], 
                                                                      row_pfam['ali_from']) + 1 
    overlap_pfam = overlap_len / row_pfam['length_pfam'] * 100 # % of length overlap with Pfam
    overlap_ali = overlap_len / row_pfam['ali_length'] * 100 # % of length overlap with HMM (ali)
    max_length = row_pfam['length_pfam'] + row_pfam['ali_length'] - overlap_len # the whole region covered by hmm and pfam
    non_overlap_len = max_length - overlap_len + 1 # non covered region

    if overlap_len > 0:
        overlap_perc = (overlap_len / max_length) * 100 # which region
        overlap_sym = 2 * overlap_len / (row_pfam['ali_length'] + row_pfam['length_pfam'])
    else:
        overlap_perc = 0
        overlap_sym = 0

    return overlap_len, overlap_pfam, overlap_ali, non_overlap_len, overlap_perc, overlap_sym

In [114]:
# Add the overlaps to the dataframe - BLAST
overlap_pfam_hmm = []

for index_pfam, row_pfam in pfam_blast.iterrows():
    overl_len, overl_pfam, overl_ali, non_overl_len, overl_perc, overlap_sym = pfam_hmm_overlap(row_pfam)
    overl_pfam = round(overl_pfam, 2)
    overl_ali = round(overl_ali, 2)
    overl_perc = round(overl_perc, 2)
    overlap_sym = round(overlap_sym, 2)
    overlap_pfam_hmm.append((overl_len, overl_pfam, overl_ali, non_overl_len, overl_perc, overlap_sym))

# Extract overlap_pfam and overlap_hmm from the list of tuples
pfam_blast['overl_len'], pfam_blast['overl_pfam'], pfam_blast['overl_ali'], pfam_blast['non_overl_len'], pfam_blast['overl_perc'], pfam_blast['overlap_sym'] = zip(*overlap_pfam_hmm)
pfam_blast.to_csv(f'results/pfam/pfam_overlap/pfam_blast.csv', index=False)
pfam_blast.head()

Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,hmm_from,hmm_to,hmm_length,ali_from,...,ali_length,env_from,env_to,env_length,overl_len,overl_pfam,overl_ali,non_overl_len,overl_perc,overlap_sym
0,A0A010Q304,PF02775,IPR011766,499,646,148,1,39,39,650,...,39,650,689,40,-3,-2.03,-7.69,194,0.0,0.0
1,A0A010Q304,PF02775,IPR011766,499,646,148,1,15,15,583,...,15,583,597,15,15,10.14,100.0,134,10.14,0.18
2,A0A010Q304,PF00205,IPR012000,289,434,146,1,39,39,650,...,39,650,689,40,-215,-147.26,-551.28,616,0.0,0.0
3,A0A010Q304,PF00205,IPR012000,289,434,146,1,15,15,583,...,15,583,597,15,-148,-101.37,-986.67,458,0.0,0.0
4,A0A010Q304,PF02776,IPR012001,91,205,115,1,39,39,650,...,39,650,689,40,-444,-386.09,-1138.46,1043,0.0,0.0


In [115]:
pfam_blast[pfam_blast['overl_len'] != 0]

Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,hmm_from,hmm_to,hmm_length,ali_from,...,ali_length,env_from,env_to,env_length,overl_len,overl_pfam,overl_ali,non_overl_len,overl_perc,overlap_sym
0,A0A010Q304,PF02775,IPR011766,499,646,148,1,39,39,650,...,39,650,689,40,-3,-2.03,-7.69,194,0.00,0.00
1,A0A010Q304,PF02775,IPR011766,499,646,148,1,15,15,583,...,15,583,597,15,15,10.14,100.00,134,10.14,0.18
2,A0A010Q304,PF00205,IPR012000,289,434,146,1,39,39,650,...,39,650,689,40,-215,-147.26,-551.28,616,0.00,0.00
3,A0A010Q304,PF00205,IPR012000,289,434,146,1,15,15,583,...,15,583,597,15,-148,-101.37,-986.67,458,0.00,0.00
4,A0A010Q304,PF02776,IPR012001,91,205,115,1,39,39,650,...,39,650,689,40,-444,-386.09,-1138.46,1043,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
949682,X6N7N7,PF00018,IPR001452,132,177,46,10,66,57,55,...,58,52,116,65,-19,-41.30,-32.76,143,0.00,0.00
949683,X6N7N7,PF02209,IPR003128,314,349,36,10,66,57,55,...,58,52,116,65,-201,-558.33,-346.55,497,0.00,0.00
949684,X6NHX1,PF00018,IPR001452,41,85,45,6,66,61,30,...,61,25,93,69,45,100.00,73.77,17,73.77,0.85
949685,X6NKT1,PF14604,IPR001452,10,56,47,15,69,55,6,...,55,4,61,58,47,100.00,85.45,9,85.45,0.92


In [116]:
# Add the overlaps to the dataframe - ClustalOmega
overlap_pfam_hmm = []

for index_pfam, row_pfam in pfam_clustal.iterrows():
    overl_len, overl_pfam, overl_ali, non_overl_len, overl_perc, overlap_sym = pfam_hmm_overlap(row_pfam)
    overl_pfam = round(overl_pfam, 2)
    overl_ali = round(overl_ali, 2)
    overl_perc = round(overl_perc, 2)
    overlap_sym = round(overlap_sym, 2)
    overlap_pfam_hmm.append((overl_len, overl_pfam, overl_ali, non_overl_len, overl_perc, overlap_sym))

# Extract overlap_pfam and overlap_hmm from the list of tuples
pfam_clustal['overl_len'], pfam_clustal['overl_pfam'], pfam_clustal['overl_ali'], pfam_clustal['non_overl_len'], pfam_clustal['overl_perc'], pfam_clustal['overlap_sym'] = zip(*overlap_pfam_hmm)
pfam_clustal.to_csv(f'results/pfam/pfam_overlap/pfam_clustal.csv', index=False)
pfam_clustal.head()

Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,hmm_from,hmm_to,hmm_length,ali_from,...,ali_length,env_from,env_to,env_length,overl_len,overl_pfam,overl_ali,non_overl_len,overl_perc,overlap_sym
0,A0A010Q304,PF02775,IPR011766,499,646,148,1,15,15,583,...,15,583,597,15,15,10.14,100.0,134,10.14,0.18
1,A0A010Q304,PF02775,IPR011766,499,646,148,1,39,39,650,...,39,650,689,40,-3,-2.03,-7.69,194,0.0,0.0
2,A0A010Q304,PF00205,IPR012000,289,434,146,1,15,15,583,...,15,583,597,15,-148,-101.37,-986.67,458,0.0,0.0
3,A0A010Q304,PF00205,IPR012000,289,434,146,1,39,39,650,...,39,650,689,40,-215,-147.26,-551.28,616,0.0,0.0
4,A0A010Q304,PF02776,IPR012001,91,205,115,1,15,15,583,...,15,583,597,15,-377,-327.83,-2513.33,885,0.0,0.0


In [117]:
pfam_clustal[pfam_clustal['overl_len'] != 0]

Unnamed: 0,uniprot_id,pfam_id,ipr_id,start_pfam,end_pfam,length_pfam,hmm_from,hmm_to,hmm_length,ali_from,...,ali_length,env_from,env_to,env_length,overl_len,overl_pfam,overl_ali,non_overl_len,overl_perc,overlap_sym
0,A0A010Q304,PF02775,IPR011766,499,646,148,1,15,15,583,...,15,583,597,15,15,10.14,100.00,134,10.14,0.18
1,A0A010Q304,PF02775,IPR011766,499,646,148,1,39,39,650,...,39,650,689,40,-3,-2.03,-7.69,194,0.00,0.00
2,A0A010Q304,PF00205,IPR012000,289,434,146,1,15,15,583,...,15,583,597,15,-148,-101.37,-986.67,458,0.00,0.00
3,A0A010Q304,PF00205,IPR012000,289,434,146,1,39,39,650,...,39,650,689,40,-215,-147.26,-551.28,616,0.00,0.00
4,A0A010Q304,PF02776,IPR012001,91,205,115,1,15,15,583,...,15,583,597,15,-377,-327.83,-2513.33,885,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1109885,X7YG90,PF02776,IPR012001,35,150,116,2,15,14,507,...,14,507,520,14,-356,-306.90,-2542.86,843,0.00,0.00
1109886,X7ZSI2,PF02775,IPR011766,422,575,154,2,15,14,507,...,14,507,520,14,14,9.09,100.00,141,9.09,0.17
1109887,X7ZSI2,PF00205,IPR012000,224,359,136,2,15,14,507,...,14,507,520,14,-147,-108.09,-1050.00,445,0.00,0.00
1109888,X7ZSI2,PF02776,IPR012001,35,150,116,2,15,14,507,...,14,507,520,14,-356,-306.90,-2542.86,843,0.00,0.00
