In [1]:
import re
import subprocess
import pandas as pd

In [101]:
# Path and input parameters
local_path = "/Users/alina/HMM/"
database_path = "/db/uniprot/uniref90.fasta"
q_id = "Q92835"
hmm_file = f'{local_path}outputs/hmms/{q_id}.hmm'
align_file = f'{local_path}outputs/alignments/tcoffee/{q_id}_tcoffee.fasta'

In [102]:
# Build HMM with HMMER
# command - profile.hmm - alignment.fasta
!hmmbuild {hmm_file} {align_file}

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             /Users/alina/HMM/outputs/alignments/tcoffee/Q92835_tcoffee.fasta
# output HMM file:                  /Users/alina/HMM/outputs/hmms/Q92835.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     Q92835_tcoffee         198  1189  1179     0.55  0.590 

# CPU time: 0.42u 0.00s 00:00:00.42 Elapsed: 00:00:00.42


Here we print the statistics of generated HMM.
- `eff_nseq`: effective sequence number.
- `M`: the match states.
- `relent`: relative entropy of the match state.
- `info`: mean information content per match state (not so important).
- `p relE`: positional relative entropy (not so important).
- `compKL`: Kullback-Leibler divergence from the average composition.

In [103]:
# Analysis of obtained models
!hmmstat {hmm_file}

# hmmstat :: display summary statistics for a profile file
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#
# idx  name                 accession        nseq eff_nseq      M relent   info p relE compKL
# ---- -------------------- ------------ -------- -------- ------ ------ ------ ------ ------
1      Q92835_clustal       -                 198     0.56   1174   0.59   0.58   0.52   0.01


In [104]:
# Compress the file in a binary output
!hmmpress {hmm_file}

SSI index file /Users/alina/HMM/outputs/hmms/Q92835.hmm.h3i already exists;
Delete old hmmpress indices first


## 1. HMMsearch

In [105]:
# Transforming the database in a binary output
# !hmmpress {local_path}databases/Pfam_HMM/Pfam-A.hmm

In [106]:
# HMM search against Pfam
!hmmsearch {hmm_file} {local_path}databases/Pfam-A.seed > hmmsearch_pfam.txt # search against the alignments database
# !hmmsearch {hmm_file} {local_path}databases/Pfam_HMM/Pfam-A.hmm > hmmsearch_pfam.txt # search against the hmm database

In [107]:
hmmsearch_pfam = pd.read_csv("hmmsearch_pfam.txt", sep="\t")
header = hmmsearch_pfam.iloc[:9]
stat_table = hmmsearch_pfam.iloc[9:84]
ali = hmmsearch_pfam.iloc[84:]
stat_table

Unnamed: 0,# hmmsearch :: search profile(s) against a sequence database
9,--- full sequence --- --- best 1 domain -...
10,E-value score bias E-value score bi...
11,------- ------ ----- ------- ------ ---...
12,1.5e-219 737.8 0.7 1.7e-219 737.7 0...
13,9e-140 473.6 0.1 1e-139 473.5 0...
...,...
79,8*********************...
80,Q92835_tcoffee 500 rishictdnvktgiantlGnkg...
81,rishictdnvktgiantlGnkg...
82,SHIP1_RAT/411-703 93 RISHICTDNVKTGIANTLGNKG...


In [108]:
# HMM search against Uniref90
!hmmsearch -o hmmsearch_output.txt {hmm_file} {database_path}


Error: Failed to open sequence file /db/uniprot/uniref90.fasta for reading



## 2. HHblits

HHblits is used for profile-profile sequence alignment. It compares a profile against a target sequence database to find homologous sequences.

In [109]:
# HHblits against Pfam
# !hhblits -i {hmm_file} -d {local_path}databases/pfam -o hhblits_pfam.txt # on hmm
!hhblits -i {align_file} -d {local_path}databases/pfam -o hhblits_pfam.txt # on alignment

- 15:33:53.850 INFO: Searching 19632 column state sequences.

- 15:33:54.462 INFO: /Users/alina/HMM/outputs/alignments/tcoffee/Q92835_tcoffee.fasta is in A2M, A3M or FASTA format

- 15:33:54.477 INFO: Iteration 1

- 15:33:55.008 INFO: Prefiltering database

- 15:33:56.033 INFO: HMMs passed 1st prefilter (gapless profile-profile alignment)  : 727

- 15:33:56.065 INFO: HMMs passed 2nd prefilter (gapped profile-profile alignment)   : 305

- 15:33:56.065 INFO: HMMs passed 2nd prefilter and not found in previous iterations : 305

- 15:33:56.065 INFO: Scoring 305 HMMs using HMM-HMM Viterbi alignment

- 15:33:56.127 INFO: Alternative alignment: 0

- 15:34:03.864 INFO: 305 alignments done

- 15:34:03.864 INFO: Alternative alignment: 1

- 15:34:06.155 INFO: 79 alignments done

- 15:34:06.155 INFO: Alternative alignment: 2

- 15:34:06.676 INFO: 8 alignments done

- 15:34:06.676 INFO: Alternative alignment: 3

- 15:34:06.869 INFO: 1 alignments done

- 15:34:08.852 

- `Hit`: contains information about Pfam identifier (starts with PF...), the abbreviated and full name of the domain.
- `Prob`: the probability of the match between the query sequence and the template sequence.
- `E-value`: the expected number of false positive matches that could occur by chance.
- `P-value`: the probability of obtaining a match with a score as good as or better than the observed score purely by chance.
Similar as `E-value`, the lower `P-value` indicate more significant matches.
- `Score`: the quality of the alignment between the query and template sequences.
- `SS (Secondary Structure)`: the predicted secondary structure of the aligned residues in the template sequence.
- `Cols`: the number of aligned columns or residues in the alignment between the query and template sequences.
- `Query HMM`: indicates position matches within HMM profile (input).
- `Template HMM`: indicates position matches within HMM profile (database).
Usually the length of template HMM is bigger than the length of query HMM.

In [110]:
# Show the output of hhblits
hhblits_pfam = pd.read_csv("hhblits_pfam.txt", sep="\t")
header = hhblits_pfam.iloc[:6]
calculations = hhblits_pfam.iloc[6:17]
ali = hhblits_pfam.iloc[17:]
calculations

Unnamed: 0,"Query Q92835/1-1189 Phosphatidylinositol-3,4,5-trisphosphate 5-phosphatase OS=Papio anubis OX=9555 GN=INPP5D PE=3 SV=3"
6,No Hit Prob E-val...
7,1 PF14633.9 ; SH2_2 ; SH2 domain 89.3 0.0...
8,2 PF14633.9 ; SH2_2 ; SH2 domain 77.5 0.0...
9,3 PF00017.27 ; SH2 ; SH2 domain 24.4 2...
10,4 PF13472.9 ; Lipase_GDSL_2 ; GD 15.2 5...
11,5 PF10847.11 ; DUF2656 ; Protein 12.7 ...
12,6 PF08845.13 ; SymE_toxin ; Toxi 7.8 ...
13,7 PF15717.8 ; PCM1_C ; Pericentr 6.7 ...
14,8 PF04311.16 ; DUF459 ; Protein 4.5 ...
15,9 PF18261.4 ; Rpn9_C ; Rpn9 C-te 4.5 ...


In [111]:
ali

Unnamed: 0,"Query Q92835/1-1189 Phosphatidylinositol-3,4,5-trisphosphate 5-phosphatase OS=Papio anubis OX=9555 GN=INPP5D PE=3 SV=3"
17,No 1
18,>PF14633.9 ; SH2_2 ; SH2 domain
19,Probab=89.30 E-value=0.007 Score=46.57 Alig...
20,Q Q92835/1-1189 9 NITRSKAEELLSRTGKDGSFLVRA...
21,Q Consensus 9 nISRSEAEELLSRgSPpGSFLVRP...
...,...
140,.++++|..+|...++++.+..|+....
141,T Consensus 154 ~f~~~l~~~~~~~~~~~~~~~l~~...
142,T DYH17_HUMAN/30 154 TFLDSLKKFDKEHIPEACLKAFKP...
143,T ss_pred HHHHHHhccCHHhCCHHHHHHHHH...
