-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
126 lines (98 loc) · 4.95 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
from Bio import SeqIO
from Bio import Align
import streamlit as st
import re
# SVE (Solveig, Vebjørn and Emma) Alignment Tool
# This tool is designed to find the closest alignment to a query sequence in a fasta file
# Use pip install -r requirements.txt to install required packages
# Maximum number of sequences to align
sequence_cap = 10000
# Maximum number of alignments for each sequence
alignment_cap = 50000
# Regex pattern to find sequence name in description
pattern = r'OS=(.*?)\sOX='
# Set page config
st.set_page_config(layout="wide")
def read_fasta(filename):
# Read fasta file and return dict of sequences
sequences = {}
for record in SeqIO.parse(filename, "fasta"):
match = re.search(pattern, record.description)
if match:
sequences[record.id] = [str(record.seq), match.group(1)]
else:
sequences[record.id] = [str(record.seq), record.description]
return sequences
def find_closest_alignment(sequences, query_sequence):
# Find the closest alignment to the query sequence in the sequences
# Stores aligment parameters
aligner = Align.PairwiseAligner()
aligner.substitution_matrix = Align.substitution_matrices.load("BLOSUM62")
aligner.mode="local"
best_alignments = []
best_score = float("-inf")
print("Finding best alignment, please wait...")
# Find alignment with highest score
counter = 0
# Progress bar
progress_text = "Finding best alignment. Please wait..."
progress_bar = st.progress(0, text=progress_text)
for identifier, sequence in sequences.items():
try:
# Use sequence[0] as sequence[1] is the name
alignments = aligner.align(sequence[0], query_sequence) #Alignment objects represtenting alignments between sequence and query.
except: # If the alignment score is too high, it will cause an overflow error, so we throw it away and continue
continue
# Create for-loop counter and show progress
counter += 1
progress_bar.progress(counter / len(sequences), text=progress_text)
try:
if 0 < len(alignments) < alignment_cap:
best_alignment = None
for alignment in alignments:
if alignment.score >= best_score:
best_alignment = alignment
if alignment.score > best_score:
best_score = alignment.score
best_alignments = []
if best_alignment:
best_alignments.append(best_alignment)
except: # If the length of the alignments is too high, it will cause an overflow error, but that also indicates that the alignment is not useful, so we throw it away and continue
continue
# Remove progress bar and show completion
progress_bar.empty()
st.balloons()
st.markdown(":blue[Alignment complete!]")
result = [best_alignments, best_score]
return result
if __name__ == "__main__":
st.header("SVE Protein Alignment Tool :rocket:")
st.markdown("*This is an alignment tool that finds the closest protein alignments in a fasta file to a given query sequence*")
fasta_file = "uniprot_fasta/uniprot_sprot.fasta"
st.markdown(":blue[Reading fasta file: " + fasta_file + "]")
sequences = read_fasta(fasta_file)
seq_amount = len(sequences)
st.markdown("**Number of Sequences:** " + str(seq_amount))
# Spacing
st.text("")
# Give user option to use a subset of sequences if there are more than sequence_cap sequences
if seq_amount > sequence_cap:
st.markdown(f":red[WARNING: More than {sequence_cap} sequences! Recommended to use a subset.]") # With a huge amount of sequences, the alignment process will take a long time
# Give user option to use their chosen amount of sequences
seq_slider = st.slider("Select the wanted amount of sequences: ", 0, seq_amount, sequence_cap, 50)
sequences = {k: sequences[k] for k in list(sequences)[:seq_slider]}
st.markdown("**" + str(seq_slider) + " sequences will be used for alignment.**")
# Spacing
st.text("")
# Give user option to enter query sequence
user_sequence = st.text_input("Enter query sequence (or use the default sequence in place): ", "REHSYWDSWSHKSMWYDDGCACPFGNNLHFHHPWANNYSCLTRIKFVIFM")
# Start alignment
if st.button("Find Closest Alignment", type="primary"):
result_list = find_closest_alignment(sequences, user_sequence)
len_result = len(result_list[0])
st.markdown(f"**{len_result} alignment(s) found with score: {str(result_list[1])}**")
# Show results
st.subheader("Best Alignment(s):")
for i in range(len(result_list[0])):
st.markdown(f"{sequences[list(sequences.keys())[i]][1]} - {list(sequences.keys())[i]}")
st.code(str(result_list[0][i]))