In [1]:
import xml.etree.ElementTree as ElementTree
from xml.dom import minidom
import json
import os
import requests

import pycbbl
from scrapy.http import TextResponse
import numpy as np
from Bio import SeqIO
from Bio import AlignIO
from Bio.PDB.Polypeptide import one_to_three

PyRosetta-4 2020 [Rosetta PyRosetta4.Release.python36.ubuntu 2020.19+release.f98ad046ef76418f1431e66d54e6074e2a0ec48c 2020-05-06T13:59:29] retrieved from: http://www.pyrosetta.org
(C) Copyright Rosetta Commons Member Institutions. Created in JHU by Sergey Lyskov and PyRosetta Team.
[0mcore.init: [0mChecking for fconfig files in pwd and ./rosetta/flags
[0mcore.init: [0mRosetta version: PyRosetta4.Release.python36.ubuntu r254 2020.19+release.f98ad046ef7 f98ad046ef76418f1431e66d54e6074e2a0ec48c http://www.pyrosetta.org 2020-05-06T13:59:29
[0mcore.init: [0mcommand: PyRosetta -ex1 -ex2aro -database /home/martin/miniconda3/lib/python3.6/site-packages/pyrosetta-2020.19+release.f98ad046ef7-py3.6-linux-x86_64.egg/pyrosetta/database
[0mbasic.random.init_random_generator: [0m'RNG device' seed mode, using '/dev/urandom', seed=-286698352 seed_offset=0 real_seed=-286698352
[0mbasic.random.init_random_generator: [0mRandomGenerator:init: Normal mode, seed=-286698352 RG_type=mt19937
PyRosetta

In [2]:
# Read the students list. Later cells iterate over its keys and read the
# 'Group' and 'Grouping' fields of each entry.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default locale encoding.
with open('../../student_list.json', encoding='utf-8') as jf:
    students = json.load(jf)

In [3]:
# Question templates, in quiz order. The upper-case tokens FASTA, THRESHOLD,
# IDX and RESIDUE are placeholders that later cells substitute per student.
Questions = [
    ('If you cluster the sequences in fasta file FASTA with CD-HIT at an identity threshold of THRESHOLD,'
     ' which of the following sequences pertain to the same cluster as sequence IDX?'),
    ('If you cluster the sequences in fasta file FASTA with CD-HIT at an identity threshold of THRESHOLD,'
     ' which is the centroid structure of the cluster that contains the sequence IDX?'),
    'Which is the length (i.e., character columns) of the multiple sequence alignment contained in the file FASTA.',
    'How many absolutely conserved positions are in the multiple sequence alignment contained in fasta file FASTA?',
    'How many RESIDUE residues are absolutely conserved in the multiple sequence alignment contained in fasta file FASTA?',
]

In [4]:
# Three-letter residue code -> residue name used when wording the questions.
residues = {
    'ALA': 'Alanine',
    'ARG': 'Arginine',
    'ASN': 'Asparagine',
    'ASP': 'Aspartic',
    'CYS': 'Cysteine',
    'GLN': 'Glutamine',
    'GLU': 'Glutamic',
    'GLY': 'Glycine',
    'HIS': 'Histidine',
    'ILE': 'Isoleucine',
    'LEU': 'Leucine',
    'LYS': 'Lysine',
    'MET': 'Methionine',
    'PHE': 'Phenylalanine',
    'PRO': 'Proline',
    'SER': 'Serine',
    'THR': 'Threonine',
    'TRP': 'Tryptophan',
    'TYR': 'Tyrosine',
    'VAL': 'Valine',
}

In [5]:
def getBlastSequences(blast, database_fasta):
    """Collect the database records whose accession appears in a BLAST hit list.

    Parameters
    ----------
    blast : iterable of str
        Hit identifiers with '|'-separated fields; the second field is taken
        as the accession code.
    database_fasta : str
        Path of the FASTA database to scan. (This argument used to be ignored
        in favour of a hard-coded path; it is now honoured.)

    Returns
    -------
    list
        The matching records, in the order they appear in the database file.
    """
    # A set gives constant-time membership tests while scanning the database.
    wanted_codes = {hit.split('|')[1] for hit in blast}

    sequences = []
    for record in SeqIO.parse(database_fasta, 'fasta'):
        if record.id.split('|')[1] in wanted_codes:
            sequences.append(record)

    return sequences

In [6]:
def clusterSequences(sequences, c=0.9, return_centroid=False):
    """Cluster sequences with the external `cd-hit` program.

    The sequences are written to a temporary FASTA file in the working
    directory, `cd-hit` is run on it, and its two outputs (the output FASTA
    and the '.clstr' listing) are read back. All temporary files are removed
    before returning, whether or not clustering succeeded.

    Parameters
    ----------
    sequences : iterable
        Records accepted by `SeqIO.write`.
    c : float
        Value passed to cd-hit's `-c` option (identity threshold).
    return_centroid : bool
        If False, every member of each cluster is listed. If True, only the
        members whose '.clstr' line contains '*' are listed.

    Returns
    -------
    (list, dict) or None
        The records of the cd-hit output FASTA (read into a list, so they stay
        valid after the temporary file is deleted) and a dict mapping the
        0-based cluster index to a list of accession codes (second
        '|'-separated field of each '.clstr' line). Returns None when no
        '.clstr' file was produced.
    """
    # Local import: subprocess is only needed here and is not among the
    # notebook's top-level imports.
    import subprocess

    input_path = 'tmp_input.fasta'
    output_path = 'tmp_output.fasta'
    cluster_path = output_path+'.clstr'

    # Drop leftovers from an interrupted run so they cannot be mistaken for
    # the result of this call.
    for stale_path in (output_path, cluster_path):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    try:
        with open(input_path, 'w') as output_file:
            # Write the sequences with BioPython SeqIO.write() method.
            SeqIO.write(sequences, output_file, 'fasta')

        # Argument list instead of a shell string: no quoting issues.
        command = ['cd-hit', '-i', input_path, '-o', output_path, '-c', str(c)]
        try:
            subprocess.run(command)
        except OSError as error:
            # e.g. cd-hit is not installed; handled as a failed clustering below.
            print('Could not run cd-hit: '+str(error))

        if not os.path.exists(cluster_path):
            print('Failed to cluster sequences!')
            return None

        # Read everything now; the temporary files are deleted on exit.
        representatives = list(SeqIO.parse(output_path, 'fasta'))

        count = -1
        clusters = {}
        with open(cluster_path) as cf:
            for l in cf:
                if l.startswith('>'):
                    # A '>' line opens a new cluster.
                    count += 1
                    clusters[count] = []
                elif not return_centroid or '*' in l:
                    clusters[count].append(l.split('|')[1])

        return representatives, clusters

    finally:
        for temporary_path in (input_path, output_path, cluster_path):
            if os.path.exists(temporary_path):
                os.remove(temporary_path)

In [7]:
def multipleSequenceAlignment(input_fasta):
    """Align the sequences of a FASTA file with the external `mafft` program.

    `mafft --auto` is run on `input_fasta` with its standard output redirected
    to a temporary file in the working directory; that file is parsed with
    `AlignIO.read` and always removed afterwards, even when parsing fails.

    Parameters
    ----------
    input_fasta : str
        Path of the FASTA file to align.

    Returns
    -------
    The alignment object returned by `AlignIO.read`.

    Raises
    ------
    OSError
        If the `mafft` executable cannot be started.
    Any exception raised by `AlignIO.read` when the output cannot be parsed.
    """
    # Local import: subprocess is only needed here and is not among the
    # notebook's top-level imports.
    import subprocess

    output_path = 'temporary_file.fasta'
    try:
        with open(output_path, 'w') as output_file:
            # Argument list instead of a shell string, so paths containing
            # spaces or shell metacharacters are passed through unchanged.
            subprocess.run(['mafft', '--auto', input_fasta], stdout=output_file)
        msa = AlignIO.read(output_path, 'fasta')
    finally:
        if os.path.exists(output_path):
            os.remove(output_path)
    return msa

In [8]:
def getAbsolutelyConservedPositions(msa):
    """Find the alignment columns where every sequence carries the same letter.

    A column qualifies only if none of its characters is the gap symbol '-'
    and all of its characters are identical.

    Parameters
    ----------
    msa
        Alignment providing `get_alignment_length()`, `len()` and iteration
        over records that expose an indexable `.seq`.

    Returns
    -------
    (list of int, list of str)
        The 0-based indexes of the conserved columns and, in the same order,
        the letter found at each of them.
    """
    total_columns = msa.get_alignment_length()
    total_rows = len(msa)

    kept_columns = []
    kept_letters = []

    for column in range(total_columns):
        column_chars = [entry.seq[column] for entry in msa]
        ungapped = [char for char in column_chars if char != '-']

        # A gap in any row leaves fewer letters than rows.
        if len(ungapped) != total_rows:
            continue

        # Exactly one distinct letter is required.
        if len(set(ungapped)) != 1:
            continue

        kept_columns.append(column)
        kept_letters.append(ungapped[0])

    return kept_columns, kept_letters

In [9]:
def indexFastaFiles(directory='fasta_files'):
    """Index the FASTA files of `directory` by sequence code and file type.

    File names are read as '<code>_<TYPE>.fasta': the code is the text before
    the first '_', the type is the text between the last '_' and the next '.'.
    Entries that do not end in '.fasta' or contain no '_' (for example
    '.ipynb_checkpoints') are ignored, and each file is attached only to its
    own code rather than to every code that happens to be a substring of it.

    Returns
    -------
    dict
        {code: {type: directory+'/'+filename}}; empty when `directory` does
        not exist.
    """
    index = {}
    if not os.path.isdir(directory):
        return index

    # Sorted so the resulting dict order does not depend on the file system.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.fasta') or '_' not in filename:
            continue
        code = filename.split('_')[0]
        file_type = filename.split('_')[-1].split('.')[0]
        index.setdefault(code, {})[file_type] = directory+'/'+filename

    return index


# Read fasta files in fasta_files folder
fasta_file = indexFastaFiles('fasta_files')


In [10]:
# Build one BLAST/CLUSTERS/MSA file set per student, unless the files found on
# disk already match the number of students.
if len(fasta_file) != len(students):

    # Every output below is written into fasta_files/. Create it up front:
    # otherwise each open() fails and the error handler below rejects every
    # candidate sequence.
    os.makedirs('fasta_files', exist_ok=True)

    # Gather UniProt pool of codes
    records = []

    for record in SeqIO.parse('../uniprot_sprot.fasta.gz', 'fasta'):
        # Select sequences between 80 and 100 AA length
        if 80 <= len(record.seq) <= 100:
            records.append(record)

    # Visit up to 20000 candidates in random order. Indexes are shuffled
    # instead of calling np.random.choice on the records, which would first
    # convert the record list to an array and fails when fewer than 20000
    # records are available.
    random_order = np.random.permutation(len(records))[:20000]
    random_records = [records[k] for k in random_order]

    # Select unrelated sequences with at least 250 blast results
    sequences = {}
    fasta_file = {}

    n_sequences = len(students)
    min_n_blast_sequences = 250 # Minimum blast results
    max_allowed_pid = 0.3 # Between selected sequences

    for r in random_records:

        # Check that the current sequence is not similar to previously saved sequences
        if sequences != {}:
            prev_sequences = [sequences[s]['sequence'] for s in sequences]
            pid = pycbbl.alignment.blast.calculatePIDs(r.seq, prev_sequences)
            if np.max(pid) > max_allowed_pid:
                print('Rejecting sequence '+r.id+'. Maximum PID to added sequences: '+str(np.max(pid)))
                continue

        # Blast sequence
        blast = pycbbl.alignment.blast.blastDatabase(r.seq, '../uniprot_sprot.fasta.gz', max_target_seqs=min_n_blast_sequences)

        # Add if it has the minimum number of blasted sequences required
        if len(blast) == min_n_blast_sequences:

            upid = r.id.split('|')[1]
            blast_sequences = getBlastSequences(blast, '../uniprot_sprot.fasta.gz')

            blast_path = 'fasta_files/'+upid+'_BLAST.fasta'
            clusters_path = 'fasta_files/'+upid+'_CLUSTERS.fasta'
            msa_path = 'fasta_files/'+upid+'_MSA.fasta'
            written_paths = (blast_path, clusters_path, msa_path)

            try:
                # Write Blast sequences into file
                with open(blast_path, 'w') as output_file:
                    SeqIO.write(blast_sequences, output_file, 'fasta')

                # Write clustered sequences
                seqs, clusters = clusterSequences(blast_sequences)

                with open(clusters_path, 'w') as output_file:
                    SeqIO.write(seqs, output_file, 'fasta')

                msa = multipleSequenceAlignment(clusters_path)
                with open(msa_path, 'w') as output_file:
                    SeqIO.write(msa, output_file, 'fasta')

            # Exception, not a bare except: Ctrl-C (KeyboardInterrupt) must
            # still be able to stop this long-running loop.
            except Exception as error:

                # Remove any file related to the failed sequence
                print('Something wrong with the sequences. Rejecting '+r.id)
                print('  Cause: '+repr(error))

                for path in written_paths:
                    if os.path.exists(path):
                        os.remove(path)

                continue

            conserved_indexes, conserved_letters = getAbsolutelyConservedPositions(msa)

            if len(conserved_indexes) < 4:
                print('No more than '+str(len(conserved_indexes))+' conserved positions were found in the MSA for sequence '+upid)

                # The sequence is rejected, so its files must go too; leftover
                # files would be picked up as a valid code the next time the
                # fasta_files folder is read.
                for path in written_paths:
                    if os.path.exists(path):
                        os.remove(path)

                continue

            # Save the name of the fasta files
            fasta_file[upid] = {}
            fasta_file[upid]['BLAST'] = blast_path
#             fasta_file[upid]['CLUSTERS'] = 'fasta_files/'+upid+'_CLUSTERS.fasta'
            fasta_file[upid]['MSA'] = msa_path

            sequences[upid] = {}
            sequences[upid]['sequence'] = r.seq
            print(str(len(sequences))+' of '+str(n_sequences)+' - Added sequence '+r.id)
        else:
            print('Rejecting sequence '+r.id+'. Maximum number of blasted sequences: '+str(len(blast)))

        # Stop when the desired number of sequences has been obtained
        if len(sequences) == n_sequences:
            break

  # This is added back by InteractiveShellApp.init_path()


Something wrong with the sequences. Rejecting sp|Q7V2I5|PHS_PROMP
Something wrong with the sequences. Rejecting sp|Q65SD5|HFQ_MANSM
Something wrong with the sequences. Rejecting sp|Q54RY8|PHS_DICDI
Rejecting sequence sp|P03636|VGC_BPG4. Maximum number of blasted sequences: 14
Rejecting sequence sp|P0C307|YCF70_ORYSI. Maximum number of blasted sequences: 14
Something wrong with the sequences. Rejecting sp|Q9FAS7|URE3_VIBPH
Rejecting sequence sp|P37590|PMRD_ECOLI. Maximum number of blasted sequences: 11
Rejecting sequence sp|Q3IPF4|SRP19_NATPD. Maximum number of blasted sequences: 70
Rejecting sequence sp|P83242|MSMB_STRCA. Maximum number of blasted sequences: 85
Rejecting sequence sp|P51918|GON1_HAPBU. Maximum number of blasted sequences: 58
Rejecting sequence sp|D2Y2G6|H8A11_CYRHA. Maximum number of blasted sequences: 224
Rejecting sequence sp|Q01524|DEF6_HUMAN. Maximum number of blasted sequences: 74
Rejecting sequence sp|A4SQW5|Y3305_AERS4. Maximum number of blasted sequences: 99
Rej

KeyboardInterrupt: 

In [None]:
# Create questions
# questions[Qn][student] holds the question text; data[Qn][student] holds the
# inputs needed later to compute the answer: (file name, threshold) for the
# CD-HIT questions, the file name alone for the MSA questions.
questions = {}
data = {}

codes = [*fasta_file.keys()]
print(len(codes))

# Students are paired with sequence codes by position, so there must be at
# least one code per student. Fail early with a clear message instead of an
# IndexError part-way through the loop.
if len(codes) < len(students):
    raise ValueError('Only '+str(len(codes))+' sequence codes are available for '
                     +str(len(students))+' students.')

# Iterate questions
for j,q in enumerate(Questions):
    Q = 'Q'+str(j+1)
    questions[Q] = {}
    data[Q] = {}

    # Iterate students
    for i,s in enumerate(students):
        files = fasta_file[codes[i]]

        if j in (0, 1):
            # CD-HIT questions: BLAST file plus a threshold drawn from
            # [0.7, 0.8) and rounded to two decimals. Each question draws its
            # own threshold.
            if j == 0:
                print(files['BLAST'])
            f = os.path.basename(files['BLAST'])
            st = float('%.2f' % np.random.uniform(0.7,0.8))
            questions[Q][s] = q.replace('FASTA', f).replace('THRESHOLD', str(st))
            data[Q][s] = (f, st)

        elif j in (2, 3, 4):
            # MSA questions: only the alignment file name is needed.
            f = os.path.basename(files['MSA'])
            questions[Q][s] = q.replace('FASTA', f)
            data[Q][s] = f

In [None]:
# Create answers
# answers[Qn][student] is the correct answer (a list of strings for Q1, one
# string otherwise); decoys[Qn][student] is a list of wrong answers. All
# values are plain `str` so they can be written as XML text.
answers = {}
decoys = {}

# Generate answers
for q in questions:
    answers[q] = {}
    decoys[q] = {}

    for s in questions[q]:

        if q == 'Q1':

            f = 'fasta_files/'+data[q][s][0]
            st = data[q][s][1]
            seqs, clusters = clusterSequences(SeqIO.parse(f, 'fasta'), c=st)

            # Largest cluster; its first listed member is the sequence named
            # in the question.
            lc = np.argmax([len(clusters[c]) for c in clusters])
            ts = clusters[lc][0]
            questions[q][s] = questions[q][s].replace('IDX', ts)

            # 2 to 4 correct answers, completed with decoys up to 5 options.
            nca = np.random.randint(2,5)
            nda = 5-nca
            answers[q][s] = [str(p) for p in np.random.choice(clusters[lc][1:], size=nca, replace=False)]

            other_sequences = [x for k in clusters if k != lc for x in clusters[k]]
            decoys[q][s] = [str(p) for p in np.random.choice(other_sequences, size=nda, replace=False)]

        elif q == 'Q2':

            f = 'fasta_files/'+data[q][s][0]
            st = data[q][s][1]
            # clusterSequences lists either all members or only the centroids,
            # so it is run once for each view.
            seqs, clusters = clusterSequences(SeqIO.parse(f, 'fasta'), c=st)
            seqs, centroids = clusterSequences(SeqIO.parse(f, 'fasta'), c=st, return_centroid=True)

            lc = np.argmax([len(clusters[c]) for c in clusters])
            centroid = centroids[lc][0]
            answers[q][s] = str(centroid)

            # The question names a random non-centroid member of the cluster.
            members = [m for m in clusters[lc] if m != centroid]
            target = str(np.random.choice(members, size=1, replace=False)[0])
            questions[q][s] = questions[q][s].replace('IDX', target)

            other_centroids = [centroids[x][0] for x in centroids if centroids[x][0] != centroid]
            decoys[q][s] = [str(d) for d in np.random.choice(other_centroids, size=4, replace=False)]

        elif q == 'Q3':

            f = 'fasta_files/'+data[q][s]
            msa = AlignIO.read(f, 'fasta')
            length = msa.get_alignment_length()
            answers[q][s] = str(length)
            # Decoys are the four lengths within +/-2 of the true one.
            decoys[q][s] = [str(x) for x in range(length-2, length+3) if x != length]

        elif q == 'Q4':

            f = 'fasta_files/'+data[q][s]
            msa = AlignIO.read(f, 'fasta')
            conserved_indexes, conserved_letters = getAbsolutelyConservedPositions(msa)
            n_conserved = len(conserved_indexes)
            answers[q][s] = str(n_conserved)
            other_lengths = [x for x in range(0, msa.get_alignment_length()) if x != n_conserved]
            decoys[q][s] = [str(x) for x in np.random.choice(other_lengths, size=4, replace=False)]

        elif q == 'Q5':

            f = 'fasta_files/'+data[q][s]
            msa = AlignIO.read(f, 'fasta')
            conserved_indexes, conserved_letters = getAbsolutelyConservedPositions(msa)

            # Most frequent conserved letter (ties broken alphabetically).
            # The previous code took the argmax over the *distinct* letters and
            # used that position to index the full letter list, which picks an
            # unrelated letter.
            hcl = max(sorted(set(conserved_letters)), key=conserved_letters.count)
            resname = residues[one_to_three(hcl)].lower()
            questions[q][s] = questions[q][s].replace('RESIDUE', resname)

            n_conserved = conserved_letters.count(hcl)
            answers[q][s] = str(n_conserved)
            # The pool spans at least 0..4 so four decoys exist even when the
            # count is 1 or 2.
            other_counts = [x for x in range(0, max(n_conserved*2, 5)) if x != n_conserved]
            decoys[q][s] = [str(x) for x in np.random.choice(other_counts, size=4, replace=False)]


In [None]:
def addTextChild(parent, tag, value, **attributes):
    """Append <tag ...attributes><text>value</text></tag> to `parent`.

    `value` is converted with str(), because ElementTree can only serialise
    string text (ints and numpy integers make `tostring` fail).
    Returns the new <tag> element.
    """
    child = ElementTree.SubElement(parent, tag)
    for attribute_name, attribute_value in attributes.items():
        child.set(attribute_name, attribute_value)
    ElementTree.SubElement(child, 'text').text = str(value)
    return child


def addMultichoiceQuestion(quiz, name, statement, tag, correct, wrong, multiple_answers=False):
    """Append one <question type="multichoice"> element to `quiz`.

    Children are emitted in this order: name, questiontext (plain_text), tag,
    one <answer fraction="100"> per item of `correct`, one
    <answer fraction="0"> per item of `wrong`, the selector element and
    <shuffleanswers>1</shuffleanswers>.

    The selector is <single>1</single>, or <multichoice>1</multichoice> when
    `multiple_answers` is true.
    NOTE(review): the <multichoice> element is kept exactly as the notebook
    emitted it before; confirm against the Moodle XML format whether
    <single>false</single> is what the importer expects.

    Returns the new <question> element.
    """
    question = ElementTree.SubElement(quiz, 'question')
    question.set('type', 'multichoice')

    addTextChild(question, 'name', name)
    addTextChild(question, 'questiontext', statement, format='plain_text')
    addTextChild(question, 'tag', tag)

    for value in correct:
        addTextChild(question, 'answer', value, fraction='100')
    for value in wrong:
        addTextChild(question, 'answer', value, fraction='0')

    selector = ElementTree.SubElement(question, 'multichoice' if multiple_answers else 'single')
    selector.text = str(1)
    # Randomize answers
    shuffleanswers = ElementTree.SubElement(question, 'shuffleanswers')
    shuffleanswers.text = str(1)

    return question


os.makedirs('questions', exist_ok=True)

# Create XML object
xml = ElementTree
# Add quiz tag
quiz = xml.Element('quiz')

# Students ordered by numeric group; the order is the same for every question.
groups = sorted(students, key=lambda x: int(students[x]['Group']))

for q in questions:
    # Write questions separated by groupings
    for s in groups:
        # Define question category
        category_question = xml.SubElement(quiz, 'question')
        category_question.set('type', 'category')
        addTextChild(category_question, 'category', 'P03 - '+students[s]['Group'])

        # Q1 has several correct answers (a list); the others have one.
        several_correct = (q == 'Q1')
        correct = answers[q][s] if several_correct else [answers[q][s]]

        addMultichoiceQuestion(
            quiz,
            name='P03 - '+q+' - '+students[s]['Grouping']+' - Group '+students[s]['Group'],
            statement=questions[q][s],
            tag=q,
            correct=correct,
            wrong=decoys[q][s],
            multiple_answers=several_correct,
        )

# Write XML file
xmlstr = minidom.parseString(xml.tostring(quiz)).toprettyxml(indent="  ")
# The XML declaration names no encoding, so the file must be UTF-8.
with open('questions/P03_questions.xml', "w", encoding='utf-8') as f:
    f.write(xmlstr)