In [18]:
def parse_fasta_file(input_file):
    """Parses a FASTA file and compiles its records into a dictionary
    with the identity tag as the keys
    and the nucleotide or amino acid sequences as the values
    input_file -- path of the FASTA file to read
    returns -- a dictionary mapping each header line (without the leading ">")
    to its sequence, joined across all of the lines belonging to that record;
    lines that appear before the first header are ignored, so a file without
    any header yields an empty dictionary
    """

    parsed_seqs = {}

    curr_seq_id = None
    curr_seq = []

    # the with-block closes the file handle even if reading raises an error
    with open(input_file) as f:

        for line in f:

            line = line.strip()

            if line.startswith(">"):
                # a new header ends the previous record, so store it first
                if curr_seq_id is not None:
                    parsed_seqs[curr_seq_id] = ''.join(curr_seq)

                curr_seq_id = line[1:]
                curr_seq = []
                continue

            curr_seq.append(line)

    # stores the final record; skipped when no header was ever seen
    # so that a None key is never placed in the dictionary
    if curr_seq_id is not None:
        parsed_seqs[curr_seq_id] = ''.join(curr_seq)

    print('Parse Complete')

    return parsed_seqs

In [19]:
def _count_dinucleotide(sequence, dinucleotide):
    """counts every occurrence of a two-letter motif in a sequence, overlapping ones included"""

    if dinucleotide[0] != dinucleotide[1]:
        # a motif made of two different letters cannot overlap itself,
        # so the non-overlapping count from str.count is already exact
        return sequence.count(dinucleotide)

    count = 0
    position = sequence.find(dinucleotide)
    while position != -1:
        count += 1
        # advances by one letter only, so that "AAA" is counted as two AA's
        position = sequence.find(dinucleotide, position + 1)

    return count


def analyze_nucleotide_abundance(parsed_seqs, species):
    """computes the abundance of every mononucleotide and dinucleotide in a genome
    parsed_seqs -- a parsed FASTA file in which every gene has been placed in a dictionary
    with the id tag as the key and the residue sequence as the value
    species -- name of the species being analyzed, repeated in the first returned list
    returns -- four lists of equal length (species, nucleotide id, count, percentage)
    ordered A, T, C, G and then the sixteen dinucleotides from AA to GG
    percentages are relative to the total number of residues, rounded to two decimals,
    and are 0.0 when there are no residues at all
    """

    bases = "ATCG"
    dinucleotides = [first + second for first in bases for second in bases]

    sequences = [sequence.upper() for sequence in parsed_seqs.values()]
    # converts all nucleotide letters to upper case so that counting is case-insensitive

    total_count = sum(len(sequence) for sequence in sequences)

    nuc_id = list(bases) + dinucleotides

    nuc_count = [sum(sequence.count(base) for sequence in sequences)
                 for base in bases]
    # dinucleotides are counted within each sequence and then summed, so a pair made of
    # the last residue of one record and the first residue of the next is never counted
    nuc_count += [sum(_count_dinucleotide(sequence, dinucleotide) for sequence in sequences)
                  for dinucleotide in dinucleotides]

    if total_count == 0:
        # prevents division by zero errors when no residues were supplied
        nuc_percent = [0.0] * len(nuc_id)
    else:
        # scales to a percentage before rounding to avoid values like 28.570000000000004
        nuc_percent = [round(count / total_count * 100, 2) for count in nuc_count]

    species_column = [species] * len(nuc_id)

    return species_column, nuc_id, nuc_count, nuc_percent

In [20]:
def analyze_dinucleotide_ratio(parsed_seqs, species):
    """Measures, gene by gene, the CG dinucleotide count, the GC dinucleotide count,
    the combined number of G and C residues, and the CG to GC ratio
    parsed_seqs -- a parsed FASTA file in which every gene has been placed in a dictionary
    with the id tag as the key and the residue sequence as the value
    species -- binomial name of the species being analyzed in string form, e.g. 'Thelohanellus_kitauei'
    returns -- six lists with one entry per gene: species, gene id, CG count, GC count,
    G plus C residue count, and the CG to GC ratio rounded to four decimals
    """

    rows = []

    for gene_id, residues in parsed_seqs.items():

        upper_residues = residues.upper()
        # counting is done on the upper case form of each gene

        n_cg = upper_residues.count("CG")
        n_gc = upper_residues.count("GC")
        n_g_and_c = upper_residues.count("G") + upper_residues.count("C")

        # a gene without any GC dinucleotide gets a ratio of 0 instead of dividing by zero
        ratio = round(n_cg / n_gc, 4) if n_gc != 0 else 0

        rows.append((species, gene_id, n_cg, n_gc, n_g_and_c, ratio))

    if not rows:
        # nothing to transpose, so hand back six empty columns
        return [], [], [], [], [], []

    # turns the per-gene rows into one list per column
    species_column, gene_ids, cg_counts, gc_counts, gc_contents, dinuc_ratios = (
        list(column) for column in zip(*rows)
    )

    return species_column, gene_ids, cg_counts, gc_counts, gc_contents, dinuc_ratios

In [21]:
def create_gene_datatable(input_file, species, directory):
    """Builds a pandas dataframe from the lists returned by the analyze_dinucleotide_ratio function
    and writes it to a csv file named '<species>_dinucleotide_analysis.csv' inside directory
    input_file -- a FASTA file, which is handed to the parse_fasta_file function
    species -- binomial name of the species being analyzed in string form, e.g. 'Thelohanellus_kitauei'
    directory -- a directory in string form to which the csv data table will be saved, e.g. 'C:Downloads'
    returns -- the dataframe that was written to the csv file
    """

    import os.path
    import pandas as pd

    column_names = ('Species',
                    'Gene ID',
                    'CG Dinucleotide Counts',
                    'GC Dinucleotide Counts',
                    'GC Content',
                    'CG to GC Ratio')

    # parses the FASTA file, then computes one list of values per column
    gene_sequences = parse_fasta_file(input_file)
    columns = analyze_dinucleotide_ratio(gene_sequences, species)

    # pairs each column name with its list, in the order given above
    df = pd.DataFrame(dict(zip(column_names, columns)))

    # writes csv file to specified directory
    csv_name = f'{species}_dinucleotide_analysis.csv'
    df.to_csv(os.path.join(f'{directory}', csv_name))

    return df

In [24]:
def create_genome_datatable(input_file, species, directory):
    """Builds a pandas dataframe from the lists returned by the analyze_nucleotide_abundance function
    and writes it to a csv file named '<species>_nucleotide_proportions.csv' inside directory
    input_file -- a FASTA file, which is handed to the parse_fasta_file function
    species -- binomial name of the species being analyzed in string form, e.g. 'Thelohanellus_kitauei'
    directory -- a directory in string form to which the csv data table will be saved, e.g. 'C:Downloads'
    returns -- the dataframe that was written to the csv file
    """

    import os.path
    import pandas as pd

    column_names = ('Species',
                    'Nucleotides',
                    'Nucleotide Counts',
                    'Nucleotide Percentages')

    # parses the FASTA file, then computes one list of values per column
    genome_sequences = parse_fasta_file(input_file)
    columns = analyze_nucleotide_abundance(genome_sequences, species)

    # pairs each column name with its list, in the order given above
    df = pd.DataFrame(dict(zip(column_names, columns)))

    # writes csv file to specified directory
    csv_name = f'{species}_nucleotide_proportions.csv'
    df.to_csv(os.path.join(f'{directory}', csv_name))

    return df