In [32]:
valid = "ACTATCTACTACTATCTATGCTAGCTAGCTAGCTAGCATCGA"

#Test if a given sequence is a valid DNA sequence
def isValid(seq):
    alphabet = ["A", "T", "G", "C"]
    for base in seq.upper():
        if base not in alphabet:
            return False
    return True

# Test
isValid(valid) # True

True

In [15]:
# Test
isValid("AGCTAGCTGACUGCYAGCACGAYGC")

False

In [31]:
#Calculate the frequency of the symbols in a given sequence
def freq(seq):
    symbols = {}
    for base in seq.upper():
        if (base in symbols):
            symbols[base] += 1
        else:
            symbols[base] = 0
    return symbols

# Test
freq("ATCGTACGTAGCATGCTAGCTAGCTAGCTC")

{'A': 6, 'T': 7, 'C': 7, 'G': 6}

In [25]:
#Using lambda notation to sort a dictionary with AA frquencies
def sortDict(dic):
    return sorted(dic.items(), key = lambda x: x[1], reverse = True) # x[1] accesses the value, x[0] the key

for (k, v) in sortDict(freq("ATCGTACGTAGCATGCTAGCTAGCTAGCTC")):
    print(k , ":" , v)

T : 7
C : 7
A : 6
G : 6


In [51]:
#Returns the percentage of G and C nucleotides in a DNA sequence
#Genes are tipically found in GC-rich regions of the genome
def gcPercent(dna_seq):
    gc_count = 0
    for base in dna_seq.upper():
        if base is "G" or base is "C":
            gc_count += 1
            
    return gc_count / len(dna_seq)

#Test
gcPercent("GCGCTATGCTAGCGCGCGC")

0.7368421052631579

In [64]:
# Parts the string in substring of size k
def partString(string, k):
    res = []
    curr_k = 0
    
    for i in range(0, len(string), k):
        res.append(string[i: i + k])
        
    return res

def gcPercentSubseq(dna_seq, k):
    return list(map(lambda x : gcPercent(x), partString(dna_seq.upper(), k)))
    
#Test
gcPercentSubseq("GCGTAGCTAGCTGCGCGCGCGCTAGCTACGGCATGCTCGCGCGCGCGATCGATC", 10)

[0.6, 0.9, 0.6, 0.7, 0.8, 0.5]

In [67]:
# Function that computes the RNA corresponding to the transcription of the DNA sequence provided
def transcription(dna_seq):
    assert isValid(dna_seq)
    return dna_seq.upper().replace("T", "U")

#Test
print(transcription("ATGCT"))

AUGCU


In [80]:
def complement(base):
    switcher = {"A": "T",
                "T": "A",
                "G": "C",
                "C": "G"}
    return switcher[base]

# Reverse Complement of a DNA molecule
# We reverse the chain because that is how it is read (5' -> 3')
def dnaComplement(dna_seq):
    assert isValid(dna_seq)
    res = []
    
    for base in reversed(dna_seq.upper()):
        res.append(complement(base))
    
    return res

#Test
dnaComplement("ACGTACGTACGTAGCATGCTAGC")

['G',
 'C',
 'T',
 'A',
 'G',
 'C',
 'A',
 'T',
 'G',
 'C',
 'T',
 'A',
 'C',
 'G',
 'T',
 'A',
 'C',
 'G',
 'T',
 'A',
 'C',
 'G',
 'T']

In [91]:
#Ler ficheiro 'genetic_code.txt' e guardar num dicionario
def readFile(fileName):
    return open(fileName, "r")

def readGeneticCode(fileName):
    dic = {}
    for line in readFile(fileName):
        dic[line[0:4]] = line[7]
    return dic

#Test
readGeneticCode("files/genetic_code.txt")

{'"GCT': 'A',
 '"GCC': 'A',
 '"GCA': 'A',
 '"GCG': 'A',
 '"TGT': 'C',
 '"TGC': 'C',
 '"GAT': 'D',
 '"GAC': 'D',
 '"GAA': 'E',
 '"GAG': 'E',
 '"TTT': 'F',
 '"TTC': 'F',
 '"GGT': 'G',
 '"GGC': 'G',
 '"GGA': 'G',
 '"GGG': 'G',
 '"CAT': 'H',
 '"CAC': 'H',
 '"ATA': 'I',
 '"ATT': 'I',
 '"ATC': 'I',
 '"AAA': 'K',
 '"AAG': 'K',
 '"TTA': 'L',
 '"TTG': 'L',
 '"CTT': 'L',
 '"CTC': 'L',
 '"CTA': 'L',
 '"CTG': 'L',
 '"ATG': 'M',
 '"AAT': 'N',
 '"AAC': 'N',
 '"CCT': 'P',
 '"CCC': 'P',
 '"CCA': 'P',
 '"CCG': 'P',
 '"CAA': 'Q',
 '"CAG': 'Q',
 '"CGT': 'R',
 '"CGC': 'R',
 '"CGA': 'R',
 '"CGG': 'R',
 '"AGA': 'R',
 '"AGG': 'R',
 '"TCT': 'S',
 '"TCC': 'S',
 '"TCA': 'S',
 '"TCG': 'S',
 '"AGT': 'S',
 '"AGC': 'S',
 '"ACT': 'T',
 '"ACC': 'T',
 '"ACA': 'T',
 '"ACG': 'T',
 '"GTT': 'V',
 '"GTC': 'V',
 '"GTA': 'V',
 '"GTG': 'V',
 '"TGG': 'W',
 '"TAT': 'Y',
 '"TAC': 'Y',
 '"TAA': '_',
 '"TAG': '_',
 '"TGA': '_'}