In [61]:
def parse_fasta_file(input_file):
    """Parse a FASTA file into a dictionary of sequences.

    input_file -- path of the FASTA file to read

    Returns a dictionary with the header of each record (the text following
    the leading ">") as the key and the record's nucleotide or amino acid
    sequence, joined into a single string, as the value.
    A header that occurs more than once keeps only its last sequence.
    An empty file yields an empty dictionary.
    Sequence lines that precede the first header are discarded; if the file
    has no header at all, its sequence data is stored under the key None.
    """

    parsed_seqs = {}

    curr_seq_id = None
    curr_seq = []

    # the with-block guarantees the file is closed, even if reading fails
    with open(input_file) as f:
        for line in f:
            line = line.strip()

            if line.startswith(">"):
                # a new header closes the record collected so far
                if curr_seq_id is not None:
                    parsed_seqs[curr_seq_id] = ''.join(curr_seq)

                curr_seq_id = line[1:]
                curr_seq = []
            else:
                curr_seq.append(line)

    # stores the last record, but avoids inventing an empty record
    # when the file contained neither a header nor sequence data
    last_seq = ''.join(curr_seq)
    if curr_seq_id is not None or last_seq:
        parsed_seqs[curr_seq_id] = last_seq

    print('Parse Complete')

    return parsed_seqs

In [63]:
def analyze_nucleotide_abundance(input_file):
    """Print the abundance of every mononucleotide and of CG and GC dinucleotides in a genome.

    input_file -- a FASTA file to parse; every record is read into a dictionary
    with the id tag as the key and the residue sequence as the value

    Counting is case-insensitive. Each motif is counted inside every record
    separately and the counts are summed, so the last residue of one record and
    the first residue of the next never form an artificial dinucleotide.
    Percentages are relative to the total number of residues, rounded to two
    decimal places, and reported as 0.0 when the file holds no residues.
    Nothing is returned; the results are printed.
    """

    sequences = [sequence.upper() for sequence in parse_fasta_file(input_file).values()]
    # upper-cases every record once so that soft-masked (lower case) residues are counted too

    total_count = sum(len(sequence) for sequence in sequences)
    print("Total Length: ",total_count)

    for motif in ("A", "T", "C", "G", "CG", "GC"):

        motif_count = sum(sequence.count(motif) for sequence in sequences)

        if total_count:
            # scales before rounding so no float noise (e.g. 28.999999999999996) is printed
            motif_percent = round(motif_count / total_count * 100, 2)
        else:
            # prevents a division by zero error when there is no sequence data
            motif_percent = 0.0

        print(f"Number of {motif}'s: ",motif_count,",",motif_percent,"%")
        # prints the number and proportion of each mononucleotide and dinucleotide

In [56]:
def analyze_dinucleotide_ratio(parsed_seqs, species):
    """Build per-gene columns of CG dinucleotide counts, G+C counts and their ratio.

    parsed_seqs -- a parsed FASTA file: a dictionary with the id tag of each gene
    as the key and its residue sequence as the value
    species -- binomial name of the species being analyzed in string form, e.g. 'Thelohanellus_kitauei'

    Returns five lists of equal length, in this order: the species name repeated
    once per gene, the gene id tags, the number of "CG" dinucleotides per gene,
    the number of G plus C residues per gene, and the CG count divided by the
    G+C count (0 when a gene contains neither G nor C).
    Sequences are upper-cased before counting.
    """

    gene_ids = list(parsed_seqs)
    upper_seqs = [residues.upper() for residues in parsed_seqs.values()]

    # one species entry per gene so the column lines up with the others
    species_column = [species] * len(gene_ids)

    cg_counts = [residues.count("CG") for residues in upper_seqs]
    gc_counts = [residues.count("G") + residues.count("C") for residues in upper_seqs]

    # a gene without any G or C gets a ratio of 0 instead of a division by zero error
    dinuc_ratios = [
        cg / gc if gc else 0
        for cg, gc in zip(cg_counts, gc_counts)
    ]

    return species_column, gene_ids, cg_counts, gc_counts, dinuc_ratios

In [57]:
def create_gene_datatable(input_file, species, directory):
    """Build a pandas data table of per-gene dinucleotide statistics and save it as a csv file.

    input_file -- a FASTA file to parse; every gene is read into a dictionary
    with the id tag as the key and the residue sequence as the value
    species -- binomial name of the species being analyzed in string form, e.g. 'Thelohanellus_kitauei'
    directory -- the directory, in string form, in which the csv data table is saved

    The table is written to '<species>_dinucleotide_analysis.csv' inside
    directory with the default DataFrame.to_csv settings, and is also returned.
    """

    import pandas as pd
    import os.path

    column_names = (
        'Species',
        'Gene ID',
        'CG Dinucleotide Content',
        'GC Content',
        'CG to GC Ratio',
    )

    # analyze_dinucleotide_ratio returns its five lists in the same order as column_names
    columns = analyze_dinucleotide_ratio(parse_fasta_file(input_file), species)

    # pairs each column name with its list of values
    table = pd.DataFrame(dict(zip(column_names, columns)))

    output_path = os.path.join(f'{directory}', f'{species}_dinucleotide_analysis.csv')
    table.to_csv(output_path)

    return table