In [1]:
#Construct and process multiple sequence alignment for use in sequence-function model
import os
import subprocess
import sys

import pandas as pd
from Bio import AlignIO

#All relative paths in this notebook resolve against this working directory.
#Set the MSA_WORKDIR environment variable to run from another checkout without
#editing the cell; the original location remains the default.
os.chdir(os.environ.get('MSA_WORKDIR', '/home/azamh/demo/seq_struct_func/msa'))

In [2]:
#Load the sequence annotation workbook: first row supplies the column labels,
#first column supplies the row index (used as the sequence names below)
asr_seq_annotations = pd.read_excel(
    '../si_data/asr_seq_annotations.xlsx',
    header=0,
    index_col=0,
)
asr_seq_annotations

Unnamed: 0,Sequence,2_exp_stereo,2_exp_conversion,2_pred_stereo,2_pred_reactivity,3_exp_stereo,3_exp_conversion,3_pred_stereo,3_pred_reactivity,4_exp_stereo,...,4_pred_stereo,4_pred_reactivity,5_exp_stereo,5_exp_conversion,5_pred_stereo,5_pred_reactivity,average_exp_stereo,average_exp_conversion,average_pred_stereo,average_pred_reactivity
278,LLLLLLLLLLAILGGGPTGLLLGLGLLERGLEYLLYERALPYYGLG...,,,-1,,,,1,,,...,1,,,,1,,,,1,1.0
278a,LLLLLLLLLLAVIGAGPTGLLLALGLLERGLEYLLLEKALPYYGLG...,,,1,,,,-1,,,...,-1,,,,0,,,,-1,0.0
279,EDDSRAPLQVAIIGGGMTGLALALGLLNRDVDFTVYERAATFGELG...,,,1,,,,-1,,,...,1,,,,-1,,,,0,0.0
279a,EDDSRAPLQVAIIGGGMTGLALALGLLNRDVDFTVYERAATFGELG...,,,1,,,,-1,,,...,1,,,,-1,,,,0,0.0
280,NGNSRSPLEVAIVGGGITGLALAVGLLKRNVNFTIYERAASFGELG...,-,0.0,1,0.0,-,0.0,-1,0.0,-,...,1,0.0,-,0.0,1,0.0,-,0.0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xp_659718,MTPSTKPKTFHVAIVGGGIAGLSLAIALHHRDVSVKIYEQAHAFAE...,,,1,,,,-1,,,...,-1,,,,-1,,,,-1,0.0
xp_660831,MGSLWSSPSLLPSQQDNETEPFSHLPKEIGTDPTLREDSNVSNRNS...,,,-1,,,,1,,,...,1,,,,-1,,,,0,1.0
xp_660986,MSASTPTVNGTNEPISIAIIGAGIIGTVLALGLTRRKDAFPLPVNV...,,,1,,,,-1,,,...,-1,,,,-1,,,,-1,0.0
xp_681171,MPGTVRPGEPVQVAIIGGGIVGVVLAVGLIRQNVKVRLFEQSQGFR...,,,1,,,,1,,,...,-1,,,,-1,,,,0,1.0


In [3]:
#Dump every row as a FASTA record: ">" + index label on one line, sequence on the next
fasta_file = 'alignment/all.fasta'
with open(fasta_file, 'w') as fasta_writer:
    #Series.items() yields (index label, value) pairs in row order
    for protein, sequence in asr_seq_annotations['Sequence'].items():
        fasta_writer.write(f'>{protein}\n{sequence}\n')

In [4]:
#Run clustal omega on the combined FASTA file and write the alignment to aln_file
aln_file = 'alignment/all.aln'
#Arguments are passed as a list (no shell parsing, so paths with spaces are safe)
#and check=True raises CalledProcessError on a non-zero exit, so a failed run
#cannot silently leave a stale or missing alignment for the following cells.
clustalo_run = subprocess.run(
    ['script/clustalo', '--force', '-v', '-i', fasta_file, '-o', aln_file],
    check=True,
)
#Display the exit status as the cell output (0 on success)
clustalo_run.returncode

Using 1 threads
Read 830 sequences (type: Protein) from clustal_omega/all.fasta
Using 94 seeds (chosen with constant stride from length sorted seqs) for mBed (from a total of 830 sequences)
Calculating pairwise ktuple-distances...
Ktuple-distance calculation progress done. CPU time: 8.18u 0.01s 00:00:08.19 Elapsed: 00:00:08
mBed created 15 cluster/s (with a minimum of 1 and a soft maximum of 100 sequences each)
Distance calculation within sub-clusters done. CPU time: 3.08u 0.00s 00:00:03.08 Elapsed: 00:00:04
Guide-tree computation (mBed) done.
Progressive alignment progress done. CPU time: 21.62u 2.66s 00:00:24.28 Elapsed: 00:00:24
Alignment written to clustal_omega/all.aln


0

In [5]:
#Create dataframes from the alignment using biopython:
#  msa_df  - one row per record, one column per alignment position, holding the
#            aligned character (residue letter or '-')
#  resi_df - same layout, holding the record's own residue number at that
#            position (1-based count of non-gap characters) or '-' at gaps
#Pass the path rather than open(aln_file): the bare open() handle was never
#closed, whereas Biopython opens and closes the file itself when given a filename.
alignment = AlignIO.read(aln_file, "fasta")
print("Alignment length %i" % alignment.get_alignment_length())
seq_dict = dict()
resi_dict = dict()
for record in alignment:
    seq_dict[record.id] = record.seq
    resi_list = []
    #Running residue number within this record; advances only on non-gap columns
    i = 1
    for resn in record.seq:
        if resn != '-':
            resi_list.append(i)
            i += 1
        else:
            #Gap column: keep a placeholder so every row spans the full alignment
            resi_list.append('-')
    resi_dict[record.id] = resi_list
#from_records on a dict puts the record ids on the columns; transpose so that
#each record becomes a row
msa_df = pd.DataFrame.from_records(seq_dict).transpose()
resi_df = pd.DataFrame.from_records(resi_dict).transpose()
msa_df

Alignment length 867


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,857,858,859,860,861,862,863,864,865,866
278,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
278a,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
279,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
279a,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
280,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xp_659718,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
xp_660831,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
xp_660986,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
xp_681171,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-


In [None]:
#Persist both alignment tables as Excel workbooks
for frame, workbook_path in (
    (msa_df, 'alignment/msa_df.xlsx'),
    (resi_df, 'alignment/resi_df.xlsx'),
):
    frame.to_excel(workbook_path)