Mycobacterium tuberculosis (MTB) can persist in a latent state in humans for many years before causing disease. Latency has been found to be linked to hypoxia (lack of oxygen) in the host. You suspect that genes that are activated in hypoxia are regulated by a common transcription factor, so you collect the upstream sequences for all of the MTB genes that are upregulated in hypoxia, looking for the motif that corresponds to the binding site for the transcription factor regulating these genes. Your biologist colleague tells you that you should look at the 250 bp upstream region of each gene (these regions have been conveniently compiled for you in a FASTA file named upstream250.txt -- right click and download this file). Your colleague also tells you that the motif is probably about 20 bp long.

In [28]:
import coursea_course.helper.course1_week3 as c1w3
import coursea_course.helper.course1_week4 as c1w4

from Bio import SeqIO

In [32]:
# Load every upstream region from the FASTA file as a plain string.
# The context manager guarantees the file handle is closed; the records are
# consumed inside the block because SeqIO.parse reads lazily from the handle.
with open("res/datasets/upstream250.txt") as fasta_handle:
    fasta_sequences = SeqIO.parse(fasta_handle, 'fasta')
    sequences = [str(sequence.seq) for sequence in fasta_sequences]

Constants

In [34]:
# Motif length to search for (the problem statement suggests about 20 bp).
k = 20
# Number of upstream sequences loaded from the FASTA file.
t = len(sequences)
# Length of the first sequence only.
# NOTE(review): presumably every upstream region has this same length — confirm.
n = len(sequences[0])

RandomMotifSearch

In [46]:
# Run the course helper's randomized motif search over all upstream
# sequences, looking for k-mers (k = 20) across the t input sequences.
# NOTE(review): the algorithm is randomized, so results presumably differ
# between executions; confirm whether the helper restarts internally or
# whether this is a single run.
motifs_random = c1w4.randomized_motif_search(sequences, k=k, t=t)
# Bare expression so the notebook displays the returned motifs.
motifs_random

['CGTCCCCAGCCCCAAGGCCG',
 'CGACCACCGCGCCGGAGCCC',
 'GGGCTCCGGCGCGGTGGTCG',
 'CCGCCGTTGGGCCGCGGACT',
 'CCGCGGGCGGCCCGCACGCA',
 'CCGCCGTTGCGCCGGGTGCG',
 'ATGCGGCCGACGAGCGGGCG',
 'CCACCACCGTAGCGCCGCCG',
 'ACCGCCCATCGCCGCGGTCA',
 'CGGCCCCACCCACGAGGCCG',
 'CGTCCGCGACGACGCGTGCG',
 'GCCCGGTCGCCACGCGGCGG',
 'CCGTGGGTGGGCAGCCTCCA',
 'CCGGCGCTTTGGCGAGGCCG',
 'GCAGTGCCGGCCTGAGGGCC',
 'ACACGGCGGGGCCGCGTGAG',
 'CGACCACGTTGTCGAGGCCG',
 'TCATCGCCGCATCGGTGGCA',
 'ACCCCGCGTCGACGTGCCAG',
 'TCTGTCCCTAGCCCTGGCCA',
 'GGGCCATTTGTCCGCGCCCG',
 'GCCGCGCGTCGGCGAGTCCT',
 'CCGACACCTGCCCGAGCACG',
 'GGTCCATCGACCCGCGGCCC',
 'GCCCGGTCGCGCCGCGTCCA',
 'ACGGGGCTTGGTCGCGGCCG',
 'CTTCCGCGTCGTACTGGTCA',
 'TGGGTGCCGTGCCGAAGGCG',
 'CGTCGGCCTCGGCGTCGGCC',
 'CGGTCACCATGTCGCGGGCA',
 'ACGGCTCCTCGCAGCGGAGA',
 'CCTCCGCTGGCGAGCCTTCG',
 'TCGGGCCATCGCCGGCGGCA',
 'CCGCCCCAGCGAAGGAGACG',
 'CCGGCTTGTCGCAGCGGCCA',
 'GCTCCGACGTGCCGGTGCCA']

In [47]:
# Summarize the motifs found by the randomized search: derive their
# consensus string and their score using the week-3 helpers.
# NOTE(review): score is presumably the total mismatch count against the
# consensus (lower is better) — confirm against the helper.
consensus_random = c1w3.find_consensus(motifs_random)
score_random = c1w3.score(motifs_random)

# Report as "<consensus> : <score>".
print(f"{consensus_random} : {score_random}")

CCGCCGCCGCGCCGCGGCCG : 280


GibbsSampler

In [48]:
# Run the course helper's Gibbs sampler over the same upstream sequences
# with the same k and t, passing N=2000.
# NOTE(review): N is presumably the number of sampling iterations, and the
# output presumably varies between executions — confirm against the helper.
motifs_gibbs = c1w4.gibbs_sampler(sequences, k=k, t=t, N=2000)
# Bare expression so the notebook displays the returned motifs.
motifs_gibbs

['TTCGTGACCGACGTCCCCAG',
 'CTGTCCGGGCGGGGCGCGGA',
 'CTGGTCGCCACTGGAAAGGG',
 'ATGCCGCCGTTGGGCCGCGG',
 'GTGGTCGCGATCGAACCCGA',
 'CTTTTGGCCACCGGCGCTGG',
 'GTCGTGGTCACTGCGGAGGA',
 'GCGGGCCCGGCCGCCATCGG',
 'GAGGAGCACATGGCCGCCGA',
 'CTGGTGACCACCGCCGACGG',
 'CTGGGGACCGAAGTCCCCGG',
 'CTGGCTGCCCCGGCTGACGG',
 'ATCATCGGCCAGGGCGCCGG',
 'GTGGTCGACAAGGTCGCCGA',
 'GTGGTGCCCACCCGCGCGGA',
 'GCGGCGGCCTTGGCCGCCCG',
 'GTGGTAGCGCCGGTCGATGG',
 'ACGTGGACCACGGTCAGCGG',
 'GTCGGGTGAACCGCCCCGGT',
 'GTGGTCACCATGGTGTCCGG',
 'TCGGGGAAGAGGGACCGCGG',
 'ATCGTGCCGCGCGTCGGCGA',
 'TTGGTCGGAATCGTCACCGA',
 'GAGGTGGCCTACGGCGAGGA',
 'GAGGACGCCATCGGCCGCGA',
 'GTGGCCACTGTCGAGACCGG',
 'CTGGTCAGTCTCGACAGCGA',
 'AGGGTCGCCACGGCTGGCGA',
 'GTCGTCGGCCTCGGCGTCGG',
 'CTGGGCAGCGTTGCACTCGG',
 'CGCGCGTCGACAGCCGCGGT',
 'ATGGTCAGCGCCTTCCCCGG',
 'GATCGGGCCATCGCCGGCGG',
 'GTGGGGACCAACGCCCCTGG',
 'GTGATAACGCGCGGCGCCGG',
 'GTGCCGGTGCCAGCCGCCGC']

In [49]:
# Summarize the motifs found by the Gibbs sampler: derive their consensus
# string and their score using the week-3 helpers.
# NOTE(review): score is presumably the total mismatch count against the
# consensus (lower is better) — confirm against the helper.
consensus_gibbs = c1w3.find_consensus(motifs_gibbs)
score_gibbs = c1w3.score(motifs_gibbs)

# Report as "<consensus> : <score>".
print(f"{consensus_gibbs} : {score_gibbs}")

GTGGTGGCCACCGCCGCCGG : 278
