In [6]:
def parse_fasta_file(path):
    """
    Parse a file in FASTA format (.fas, .fasta) into a dictionary of genetic strings.

    Example of the format:

    >Taxon1
    CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
    TCCCACTAATAATTCTGAGG
    >Taxon2
    CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
    ATATCCATTTGTCAGCAGACACGC
    >Taxon3
    CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
    TGGGAACCTGCGGGCAGTAGGTGGAAT

    Every string in a FASTA file begins with a single header line that starts
    with the symbol '>' followed by some labeling information about the string.
    The word following the '>' symbol is the identifier of the sequence,
    and the rest of the line is its description (both are optional).
    There should be no space between the '>' and the first letter of the identifier.

    All subsequent lines contain the string itself;
    it is recommended that all lines of text are shorter than 80 symbols.
    The string ends when another line starting with '>' appears,
    indicating the start of another sequence (or when the end of the file is reached).

    Notes:
        + The whole header line after the leading '>' (identifier plus
          description, if any) is used as the dictionary key.
        + Empty lines are ignored wherever they occur.

    Args:
        + path (str): path to an input file

    Returns:
        + dictionary {str: str}:
            key:   taxon name,
            value: taxon genetic string

    Raises:
        + ValueError: if a taxon name occurs more than once, or if sequence
          data appears before the first header line.
    """
    # Collect the pieces of every sequence in a list and join them once at the
    # end; repeated `+=` on a string re-copies the whole sequence each time.
    chunks_by_taxon = {}
    current_taxon = None
    for line in read_lines(path):
        # Blank lines carry no data; skipping them also keeps a leading blank
        # line from being attributed to a not-yet-seen taxon.
        if not line:
            continue
        if line.startswith('>'):
            # Drop only the single leading marker. `strip('>')` would also eat
            # a '>' at the end of the header and any repeated leading ones.
            taxon_name = line[1:]
            if taxon_name in chunks_by_taxon:
                raise ValueError("Taxon {0} found more than once in a file!".format(taxon_name))
            chunks_by_taxon[taxon_name] = []
            current_taxon = taxon_name
        elif current_taxon is None:
            # Sequence data with no header to attach it to: fail with a clear
            # message instead of an opaque `KeyError: None`.
            raise ValueError("Sequence data found before the first '>' header line!")
        else:
            chunks_by_taxon[current_taxon].append(line)
    return {taxon: ''.join(chunks) for taxon, chunks in chunks_by_taxon.items()}

In [1]:
def read_lines(path):
    """
    Read a text file and return its lines with surrounding whitespace removed.

    Args:
        + path (str): path to an input file

    Returns:
        + list of str: one entry per line of the file, in file order,
          each stripped of leading and trailing whitespace (including the newline)
    """
    with open(path, "r") as handle:
        return [raw_line.strip() for raw_line in handle]

In [23]:
def count_gc_content(string):
    """
    Calculate the GC content of a genetic string,
    i.e. the percentage of 'G' or 'C' bases (case-insensitive).

    Args:
        + string (str): genetic string

    Returns:
        + float: percentage in the range 0.0 - 100.0 of characters that are
          'G' or 'C'; 0.0 for an empty string
    """
    # An empty sequence has no bases at all; report 0% rather than dividing by zero.
    if len(string) == 0:
        return 0.0
    string = string.upper()
    n_gc = string.count('G') + string.count('C')
    return n_gc * 100.0 / len(string)

In [9]:
def find_highest_gc_string(taxon_dict):
    """
    Find the taxon whose genetic string has the highest GC content.

    Args:
        + taxon_dict (dictionary {str: str}):
            key:   taxon name,
            value: taxon genetic string

    Returns:
        + tuple (taxon name, GC content): the name of the taxon with the
          highest GC content and that content as computed by count_gc_content.
          On ties the taxon encountered first while iterating the dictionary wins.
          For an empty dictionary the result is (None, 0).
    """
    highest_gc_taxon = None
    highest_gc = 0
    for taxon, string in taxon_dict.items():
        gc = count_gc_content(string)
        # Accept the first taxon unconditionally: comparing against the initial
        # 0 with a strict '>' would never select a taxon when every string has
        # 0% GC, returning None even though the dictionary is not empty.
        if highest_gc_taxon is None or gc > highest_gc:
            highest_gc_taxon = taxon
            highest_gc = gc
    return highest_gc_taxon, highest_gc

In [15]:
# Parse the FASTA input file into a {taxon name: genetic string} dictionary.
# The path is relative, so it is resolved against the current working directory.
d = parse_fasta_file("./txt/rosalind_gc.txt")

In [16]:
# Display the parsed {taxon name: genetic string} dictionary.
d

{'Rosalind_0440': 'CTGCAAGTGGCCTCATATTGTGTTCTATATCGACAAAATTCTGGTAGCATGGGTTCACGCGTAAACACGCATTGTAACCAGAGTTTATCACACGCTAGAACAGGAGCATGCCCATAGTGAGTCCGATGTGTATCAGCTCCCCGCAAGTTGCACTAATCGGCCCTCTTTAGATAGTGGCGTCTGGCTTGCGTACCTTGCGCAGCACTCTAGGCAGCGAGTGAAGGCGCGTATTGTCTGTGACTGGCATGAGCCTCGACATCATCTCACACTCAAAATGAGCGGCGCCTAACGGCGGTAAGATTATCGGTAAAAATGGTGTTCTAACCACTTGATATAAATTAATGTCCGATTCGTAAGAACATGATAGAGTATAGGGGTATCGTCAGTGAAGGGGCCGAACGGGACCCACTCCGCCGTGTCTAATCTAGGTCTCTTAGTTAGATGTCATATAGGCAGTACCGTTGATCGAATACCGCCGTGACATAATTACCCCGGTCCGTAAAGCTGACAACTGACTACTCGAATAACCATTTGCTCAACGCATTAGGGAAGGTGCCTGAAAGAGAGACATAGTGGATTGAATTCGCATAGTGTATAGCTAGACAAAAGTTCACGTTTCTTTCGCCAGTAAAAGGCGACCGAGGACGGGTCACCTTAGCGCTGTGCTGAATCGTACCGAGAACTGTGTCGAGGTCGCTGGGATCGCCCGATTATGGACAAAAAGATACACGAAACGGGTTAATGATTCATCAAAATGACGTAGCAGGTTTCGACCCAGGGTAACCTCGTCACGAAAGATGGGGCGTGATCCGCTTGAGTATAGAGCTTAGTTGACAGCCGCTACAGCCAACCGTTACAAAGTAGCAGAGTAAAATCCGAAAGTTATCTACCAGGAGAAACTCATGCGCAGAGGATAATTCACCGCGCTCTTCATGTCGAACGGCTACACCACTACGTTGCTGAGAATTGGCAAATCGTA',

In [24]:
# Report the taxon with the highest GC content as a (taxon name, GC percentage) tuple.
find_highest_gc_string(d)

('Rosalind_1310', 53.874538745387454)