In [None]:
import numpy as np
import pandas as pd
import itertools as it
import os
import math
import matplotlib.pyplot as plt

In [None]:
# Variables
# Base names (without extension) of the input files to process; each file is
# opened as in_prefix + name + in_ext.
# Whole-genome alternative: ["chr"+str(x) for x in range(1,22)] + ["chrMT", "chrX", "chrY"]
# NOTE(review): range(1,22) stops at chr21 -- use range(1,23) if chr22 is wanted.
files = ["mRNA"]
# Prefix for input paths. Alternative for the NCBI download:
# "ncbi_dataset/data/GCF_000001405.39/"
in_prefix = "input/mRNA/"
in_ext = ".FNA"  # extension appended to every input file name
# Prefix for everything that is written (summary text file and PDF plots)
out_prefix = "output/mRNA/"
out_ext = ".txt"  # extension of the summary text file
out_name = "out1"  # base name of the summary text file

# When True, the *_frame_out functions save the bar charts as PDF files
save_plots = True
# When True, the counting functions print a warning if a ">" header is reached
# in the middle of a codon
warnings = False

# Number of leading nucleotides skipped at the start of every sequence by
# count_letters: 1st (0), 2nd (1), or 3rd (2) reading frame, i.e. offset
reading_frame = 0

In [None]:
# Define character sets
othchars = {'N', '\n'}  # Expected characters that should be ignored
# Characters that should be read as part of the RNA
# (ambiguity codes such as 'B', 'K', 'M', 'S', 'R', 'W', 'Y' could be added here)
nucchars = {'T', 'A', 'C', 'G'}
corenucchars = ['T', 'A', 'C', 'G']  # Subset of characters of RNA that are valid; also the plotting order

# All triplets of nucchars. nucchars is a set, whose iteration order for strings
# can differ between kernel sessions, so it is sorted first to keep the codon
# order (and therefore the order of the written codon dictionaries) reproducible.
cdnchars = ["".join(triplet) for triplet in it.product(sorted(nucchars), repeat=3)]

# All triplets of corenucchars, in the order given by corenucchars
corecdnchars = ["".join(triplet) for triplet in it.product(corenucchars, repeat=3)]

In [None]:
# Codon dictionary
# Maps each of the 64 codons (written in the DNA alphabet, i.e. with T rather
# than U) to the name of the amino acid it encodes, or to "Stop".
# The entries are grouped by first base, in the order T, C, A, G.
codondict = {
    # --- first base T ---
    "TTT": "Phenyl-alanine",
    "TTC": "Phenyl-alanine",
    "TTA": "Leucine",
    "TTG": "Leucine",
    "TCT": "Serine",
    "TCC": "Serine",
    "TCA": "Serine",
    "TCG": "Serine",
    "TAT": "Tyrosine",
    "TAC": "Tyrosine",
    "TAA": "Stop",
    "TAG": "Stop",
    "TGT": "Cysteine",
    "TGC": "Cysteine",
    "TGA": "Stop",
    "TGG": "Tryptophan",
    # --- first base C ---
    "CTT": "Leucine",
    "CTC": "Leucine",
    "CTA": "Leucine",
    "CTG": "Leucine",
    "CCT": "Proline",
    "CCC": "Proline",
    "CCA": "Proline",
    "CCG": "Proline",
    "CAT": "Histidine",
    "CAC": "Histidine",
    "CAA": "Glutamine",
    "CAG": "Glutamine",
    "CGT": "Arginine",
    "CGC": "Arginine",
    "CGA": "Arginine",
    "CGG": "Arginine",
    # --- first base A ---
    "ATT": "Isoleucine",
    "ATC": "Isoleucine",
    "ATA": "Isoleucine",
    "ATG": "Methionine",
    "ACT": "Threonine",
    "ACC": "Threonine",
    "ACA": "Threonine",
    "ACG": "Threonine",
    "AAT": "Asparagine",
    "AAC": "Asparagine",
    "AAA": "Lysine",
    "AAG": "Lysine",
    "AGT": "Serine",
    "AGC": "Serine",
    "AGA": "Arginine",
    "AGG": "Arginine",
    # --- first base G ---
    "GTT": "Valine",
    "GTC": "Valine",
    "GTA": "Valine",
    "GTG": "Valine",
    "GCT": "Alanine",
    "GCC": "Alanine",
    "GCA": "Alanine",
    "GCG": "Alanine",
    "GAT": "Aspartic Acid",
    "GAC": "Aspartic Acid",
    "GAA": "Glutamic Acid",
    "GAG": "Glutamic Acid",
    "GGT": "Glycine",
    "GGC": "Glycine",
    "GGA": "Glycine",
    "GGG": "Glycine",
}
# Distinct names (amino acids plus "Stop") that the core codons translate to
aminonames = {codondict[codon] for codon in corecdnchars}

print(aminonames)

In [None]:
def make_dict(S):
    """Return a new dict that maps every distinct element of S to 0.

    Keys appear in the order in which they are first produced by iterating S.
    """
    return dict.fromkeys(S, 0)

In [None]:
def count_letters(strlist, nucchrst, cdnchrst, othchrst, frame=None, warn=None):
    """Count nucleotides, per-codon-position nucleotides and codons in one reading frame.

    The lines are scanned character by character. Codon state is carried over
    from one line to the next, and is reset whenever a ">" character is read
    (the rest of that line is skipped).

    Parameters
    ----------
    strlist : list of str
        Lines of the input file (must support len()).
    nucchrst : collection of str
        Characters that are counted as nucleotides.
    cdnchrst : iterable of str
        All codons that may occur, i.e. all triplets of nucchrst.
    othchrst : collection of str
        Characters that are skipped silently. Any character that is in neither
        nucchrst nor othchrst is skipped and reported with a message.
    frame : int, optional
        Number of nucleotides skipped at the start of every sequence.
        Defaults to the notebook-level variable reading_frame.
    warn : bool, optional
        Whether to print a warning when a ">" is read in the middle of a codon.
        Defaults to the notebook-level variable warnings.

    Returns
    -------
    (ds, dt, dc)
        ds maps nucleotide -> total count, dt is a list of three dicts mapping
        nucleotide -> count at codon position 0, 1 and 2, and dc maps
        codon -> count. Nucleotides of a trailing incomplete codon are included
        in ds and dt but not in dc.
    """
    # Only fall back to the notebook globals when the caller gave no value, so
    # the function can also be used without that hidden state
    if frame is None:
        frame = reading_frame
    if warn is None:
        warn = warnings

    # dictionaries for storing the counts
    ds = dict.fromkeys(nucchrst, 0)
    dt = [dict.fromkeys(nucchrst, 0) for _ in range(3)]
    dc = dict.fromkeys(cdnchrst, 0)

    # for progress messages: report roughly every 10% of the lines
    totlines = len(strlist)
    report_every = math.floor(totlines/10) + 1

    # state of the codon that is currently being read
    cnt = 0
    codon = ""

    # reading frame counters
    to_skip_start = frame
    # NOTE(review): number of leftover characters that is tolerated without a
    # warning at the end of a sequence; the formula looks like it assumes a
    # particular sequence length modulo 3 -- confirm the intent.
    to_skip_end = 2 - frame

    for linesdone, s in enumerate(strlist, start=1):
        for c in s:
            if c == ">":
                # Header line: skip it and restart the codon and the frame offset
                if warn and not (cnt == 0 or cnt == to_skip_end):
                    print("Warning: comment read before codon end, skipping", cnt, "characters")
                cnt = 0
                codon = ""
                to_skip_start = frame
                break

            if c not in nucchrst:
                # Only nucleotides are counted; report characters that are not expected
                if c not in othchrst:
                    print("Read unexpected character", c)
                continue

            if to_skip_start > 0:
                # Still inside the offset of the reading frame
                to_skip_start -= 1
                continue

            ds[c] += 1       # total count
            dt[cnt][c] += 1  # positional count

            # update current codon
            cnt += 1
            codon += c
            if cnt == 3:
                dc[codon] += 1
                cnt = 0
                codon = ""

        # progress messages
        if linesdone % report_every == 0:
            print("Completed", round(100*linesdone/totlines, 2), "percent")

    print("Completed")

    return ds, dt, dc

In [None]:
def count_letter_threeframe(strlist, nucchrst, cdnchrst, othchrst, warn=None):
    """Count nucleotides, per-position nucleotides and codons in all three reading frames.

    The lines are scanned once, character by character. For frame i the first
    i nucleotides of every sequence are skipped. Codon state is carried over
    from one line to the next, and is reset whenever a ">" character is read
    (the rest of that line is skipped).

    Parameters
    ----------
    strlist : list of str
        Lines of the input file (must support len()).
    nucchrst : collection of str
        Characters that are counted as nucleotides.
    cdnchrst : iterable of str
        All codons that may occur, i.e. all triplets of nucchrst.
    othchrst : collection of str
        Characters that are skipped silently. Any character that is in neither
        nucchrst nor othchrst is skipped and reported with a message.
    warn : bool, optional
        Whether to print a warning when a ">" is read in the middle of a codon.
        Defaults to the notebook-level variable warnings.

    Returns
    -------
    list of 3 tuples
        Entry i belongs to frame i and is (ds, dt, dc): ds maps
        nucleotide -> total count, dt is a list of three dicts mapping
        nucleotide -> count at codon position 0, 1 and 2, and dc maps
        codon -> count.
    """
    # Only fall back to the notebook global when the caller gave no value
    if warn is None:
        warn = warnings

    # dictionaries for storing the counts, one (ds, dt, dc) tuple per frame
    dicts = [(dict.fromkeys(nucchrst, 0),
              [dict.fromkeys(nucchrst, 0) for _ in range(3)],
              dict.fromkeys(cdnchrst, 0)
             ) for _ in range(3)]

    # for progress messages: report roughly every 10% of the lines
    totlines = len(strlist)
    report_every = math.floor(totlines/10) + 1

    # state of the codon that is currently being read, per frame
    cnt = [0] * 3
    codon = [""] * 3

    # reading frame counters: frame i skips its first i nucleotides
    to_skip_start = list(range(3))
    # NOTE(review): leftover length tolerated without a warning, per frame;
    # confirm that 2-i is the intended value.
    to_skip_end = [2 - i for i in range(3)]

    for linesdone, s in enumerate(strlist, start=1):
        for c in s:
            if c == ">":
                # Header line: skip it and restart the codons and the frame offsets
                if warn:
                    wrn = ""
                    for i in range(3):
                        if not (cnt[i] == 0 or cnt[i] == to_skip_end[i]):
                            wrn += "[Skipping "+str(cnt[i])+" characters in frame "+str(i)+"]"
                    if wrn != "":
                        print("Warning: comment read before codon end skipping ", wrn)

                cnt = [0] * 3
                codon = [""] * 3
                to_skip_start = list(range(3))
                break

            if c not in nucchrst:
                # Only nucleotides are counted; report characters that are not expected
                if c not in othchrst:
                    print("Read unexpected character", c)
                continue

            for i in range(3):
                if to_skip_start[i] > 0:
                    # Frame i is still inside its offset; the other frames go on
                    to_skip_start[i] -= 1
                    continue

                ds, dt, dc = dicts[i]
                ds[c] += 1          # total count
                dt[cnt[i]][c] += 1  # positional count

                # update current codon of this frame
                cnt[i] += 1
                codon[i] += c
                if cnt[i] == 3:
                    dc[codon[i]] += 1
                    cnt[i] = 0
                    codon[i] = ""

        # progress messages
        if linesdone % report_every == 0:
            print("Completed", round(100*linesdone/totlines, 2), "percent")

    print("Completed")

    return dicts

In [None]:
def replace_TU(s):
    """Return s with every 'T' replaced by 'U' (DNA spelling -> RNA spelling)."""
    return "".join('U' if ch == 'T' else ch for ch in s)

In [None]:
def make_bar_nucleo(S, dsum, dtup, title, sort=False, save=False):
    """Draw a grouped bar chart of nucleotide counts: total and per codon position.

    Parameters
    ----------
    S : iterable of str
        Nucleotides to show, in plotting order.
    dsum : dict
        Maps nucleotide -> total count.
    dtup : sequence of 3 dicts
        dtup[k] maps nucleotide -> count at codon position k.
    title : str
        Plot title; also used as the file name when saving.
    sort : bool
        If True, order the nucleotides by decreasing total count.
    save : bool
        If True, write the figure to out_prefix + title + ".pdf";
        otherwise show it.
    """
    X = list(S)
    if sort:
        # Sort by the total counts (the key used to reference an undefined name)
        X.sort(reverse=True, key=lambda nuc: dsum[nuc])

    width = 0.2
    x = np.arange(len(X))
    # One bar series per legend entry, placed side by side around each tick
    series = [
        ('Total', -3*width/2, [dsum[nuc] for nuc in X]),
        ('1st', -width/2, [dtup[0][nuc] for nuc in X]),
        ('2nd', width/2, [dtup[1][nuc] for nuc in X]),
        ('3rd', 3*width/2, [dtup[2][nuc] for nuc in X]),
    ]

    fig, ax = plt.subplots()
    for label, offset, heights in series:
        ax.bar(x + offset, heights, width, label=label)
    ax.set_xticks(x)
    # Tick labels use the RNA spelling (U instead of T)
    ax.set_xticklabels([replace_TU(nuc) for nuc in X])
    ax.legend()

    ax.set_title(title)
    ax.set_xlabel('Nucleotide')
    ax.set_ylabel('Frequency')
    if save:
        fig.savefig(out_prefix+title+".pdf")
    else:
        plt.show()
    plt.close(fig)

In [None]:
def make_bar_codon(S, d, title, sort=False, save=False):
    """Draw a bar chart with one bar per codon.

    S holds the codons to show, d maps codon -> count and title is the plot
    title (also used as file name). With sort=True the codons are ordered by
    decreasing count. With save=True the figure is written to
    out_prefix + title + ".pdf", otherwise it is shown.
    """
    if sort:
        codons = sorted(S, key=lambda cdn: d[cdn], reverse=True)
    else:
        codons = list(S)
    counts = [d[cdn] for cdn in codons]
    tick_positions = np.arange(len(codons))

    fig, ax = plt.subplots(figsize =(16, 9))
    # The four colours repeat along the axis
    ax.bar(codons, counts, width=0.5, color=["tab:blue", "tab:orange", "tab:green", "tab:red"])
    ax.set_xticks(tick_positions)
    # Tick labels use the RNA spelling (U instead of T)
    ax.set_xticklabels(map(replace_TU, codons))
    plt.title(title)
    plt.xlabel('Codon')
    plt.ylabel('Frequency')
    plt.xticks(rotation=90)
    if save:
        plt.savefig(out_prefix+title+".pdf")
    else:
        plt.show()
    plt.close()

In [None]:
def make_bar_amino(Sa, Sc, d, title, sort=False, save=False):
    """Draw a bar chart with one bar per amino acid (or "Stop").

    Parameters
    ----------
    Sa : iterable of str
        Amino-acid names to show; must contain codondict[c] for every c in Sc.
    Sc : iterable of str
        Codons whose counts are added up per amino acid.
    d : dict
        Maps codon -> count.
    title : str
        Plot title; also used as the file name when saving.
    sort : bool
        If True, order the amino acids by decreasing count.
    save : bool
        If True, write the figure to out_prefix + title + ".pdf";
        otherwise show it.
    """
    # A set has no reproducible order for strings, so alphabetise it to get the
    # same plot on every run; an ordered input keeps the caller's order.
    if isinstance(Sa, (set, frozenset)):
        X = sorted(Sa)
    else:
        X = list(Sa)

    # Add up the codon counts of every amino acid
    da = make_dict(X)
    for c in Sc:
        da[codondict[c]] += d[c]

    if sort:
        X.sort(reverse=True, key=lambda name: da[name])
    Y = [da[name] for name in X]

    fig, ax = plt.subplots(figsize =(16, 9))
    # The four colours repeat along the axis
    ax.bar(X, Y, width=0.5, color=["tab:blue", "tab:orange", "tab:green", "tab:red"])
    plt.title(title)
    plt.xlabel('Aminoacid')
    plt.ylabel('Frequency')
    plt.xticks(rotation=90)
    if save:
        plt.savefig(out_prefix+title+".pdf")
    else:
        plt.show()
    plt.close()

In [None]:
def single_frame_out() :
    """Count every configured input file in a single reading frame and record the results.

    For each name in files, in_prefix + name + in_ext is read and counted with
    count_letters. The total and the three positional nucleotide dictionaries
    are appended to the summary file out_prefix + out_name + out_ext, framed by
    "Summary Set Start" / "Summary Set End" markers. When save_plots is True
    the nucleotide, codon and amino-acid bar charts are saved as well.
    """
    out_path = out_prefix+out_name+out_ext

    # The summary file is re-opened for every write so that finished results
    # are on disk before the next (possibly long) count starts; `with` makes
    # sure the handle is closed even if something raises.
    with open(out_path, "a+") as f:
        f.write("--------------------------------\n")
        f.write("Summary Set Start\n")
        f.write("--------------------------------\n")

    for name in files:
        with open(in_prefix+name+in_ext, "r") as f:
            lines = f.readlines()
        ds, dt, dc = count_letters(lines, nucchars, cdnchars, othchars)
        print(ds)

        with open(out_path, "a+") as f:
            f.write("File "+name+"\n")
            f.write(str(ds)+"\n")
            f.write(str(dt[0])+"\n")
            f.write(str(dt[1])+"\n")
            f.write(str(dt[2])+"\n")
            f.write("----------------\n")

        if save_plots:
            make_bar_nucleo(corenucchars, ds, dt, name+"_nucleotides", save=True)
            make_bar_codon(corecdnchars, dc, name+"_codon", save=True)
            make_bar_codon(corecdnchars, dc, name+"_codon_sorted", sort=True, save=True)
            make_bar_amino(aminonames, corecdnchars, dc, name+"_amino", save=True)
            make_bar_amino(aminonames, corecdnchars, dc, name+"_amino_sorted", sort=True, save=True)

        print("Completed", name)

    with open(out_path, "a+") as f:
        f.write("--------------------------------\n")
        f.write("Summary Set End\n")
        f.write("--------------------------------\n\n\n\n")

In [None]:
def all_frame_out() :
    """Count every configured input file in all three reading frames and record the results.

    For each name in files, in_prefix + name + in_ext is read and counted with
    count_letter_threeframe. For every frame the total, the three positional
    and the codon dictionaries are appended to the summary file
    out_prefix + out_name + out_ext, framed by "Summary Set Start" /
    "Summary Set End" markers. When save_plots is True the nucleotide, codon
    and amino-acid bar charts of every frame are saved as well.
    """
    out_path = out_prefix+out_name+out_ext

    # The summary file is re-opened for every write so that finished results
    # are on disk before the next (possibly long) count starts; `with` makes
    # sure the handle is closed even if something raises.
    with open(out_path, "a+") as f:
        f.write("--------------------------------\n")
        f.write("Summary Set Start\n")
        f.write("--------------------------------\n")

    for name in files:
        with open(in_prefix+name+in_ext, "r") as f:
            lines = f.readlines()
        dicts = count_letter_threeframe(lines, nucchars, cdnchars, othchars)
        # Show the total nucleotide counts of every frame
        for i in range(3):
            print(dicts[i][0])

        with open(out_path, "a+") as f:
            f.write("File "+name+"\n")
            for i in range(3):
                # The frame index is an int and has to be converted before concatenation
                f.write("Frame "+str(i)+"\n")
                f.write(str(dicts[i][0])+"\n")
                f.write(str(dicts[i][1][0])+"\n")
                f.write(str(dicts[i][1][1])+"\n")
                f.write(str(dicts[i][1][2])+"\n")
                f.write(str(dicts[i][2])+"\n")
            f.write("----------------\n")

        if save_plots:
            for i in range(3):
                frame = str(i)
                make_bar_nucleo(corenucchars, dicts[i][0], dicts[i][1], name+"_nucleotides frame "+frame, save=True)
                make_bar_codon(corecdnchars, dicts[i][2], name+"_codon frame "+frame, save=True)
                make_bar_codon(corecdnchars, dicts[i][2], name+"_codon_sorted frame "+frame, sort=True, save=True)
                make_bar_amino(aminonames, corecdnchars, dicts[i][2], name+"_amino frame "+frame, save=True)
                make_bar_amino(aminonames, corecdnchars, dicts[i][2], name+"_amino_sorted frame "+frame, sort=True, save=True)

        print("Completed", name)

    with open(out_path, "a+") as f:
        f.write("--------------------------------\n")
        f.write("Summary Set End\n")
        f.write("--------------------------------\n\n\n\n")

In [None]:
# Read the mRNA input file and count it in all three reading frames.
# `with` closes the file even if reading fails.
with open(in_prefix+"mRNA"+in_ext, "r") as f:
    lines = f.readlines()
dicts = count_letter_threeframe(lines, nucchars, cdnchars, othchars)

In [None]:
# Preview of reading frame 0 on screen: nucleotide counts and codons by decreasing count
name = "mRNA"
make_bar_nucleo(corenucchars, dicts[0][0], dicts[0][1], name + "_nucleotides frame 0", save=False)
make_bar_codon(corecdnchars, dicts[0][2], name + "_codon frame 0", sort=True, save=False)

In [None]:
# Total nucleotide counts of each reading frame.
# (No single-frame `ds` is printed here: that name only exists inside
# single_frame_out, so referring to it fails on a fresh kernel.)
for i in range(3):
    print(dicts[i][0])

In [None]:
# Dump the complete (total, positional, codon) tuple of every reading frame
for frame_counts in dicts:
    print(frame_counts)

In [None]:
# Append the counts of all three reading frames to the summary file.
# `with` closes the file even if a write fails.
with open(out_prefix+out_name+out_ext, "a+") as f:
    f.write("File "+name+"\n")
    for i in range(3):
        totals, positional, codons = dicts[i]
        f.write("Frame "+str(i)+"\n")
        f.write(str(totals)+"\n")
        f.write(str(positional[0])+"\n")
        f.write(str(positional[1])+"\n")
        f.write(str(positional[2])+"\n")
        f.write(str(codons)+"\n")
    f.write("----------------\n")

# Save the nucleotide, codon and amino-acid charts of every frame as PDF
if save_plots:
    for i in range(3):
        frame = str(i)
        make_bar_nucleo(corenucchars, dicts[i][0], dicts[i][1], name+"_nucleotides frame "+frame, save=True)
        make_bar_codon(corecdnchars, dicts[i][2], name+"_codon frame "+frame, save=True)
        make_bar_codon(corecdnchars, dicts[i][2], name+"_codon_sorted frame "+frame, sort=True, save=True)
        make_bar_amino(aminonames, corecdnchars, dicts[i][2], name+"_amino frame "+frame, save=True)
        make_bar_amino(aminonames, corecdnchars, dicts[i][2], name+"_amino_sorted frame "+frame, sort=True, save=True)
print("Completed", name)
