# DNA To Music
#### Author: Frank Escalante
#### Updated: 08/10/2024

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as img
import cv2
import random
import os

# Functions

In [32]:
def DNA_nmbrs(bases, seq_length):
    nucleotide = ''
    base_combos = []
    count = 0
    while len(base_combos) != len(bases)**seq_length:
        while len(nucleotide) < seq_length and len(nucleotide) != seq_length:
            base = random.choice(bases)
            nucleotide += base
        if nucleotide not in base_combos:
            base_combos.append((count, nucleotide))
            nucleotide = ''
            count += 1
        if nucleotide in base_combos:
            nucleotide = ''
    return len(base_combos), base_combos
                
        
def fib_seq(N,seq=[]):
    if N == 0:
        seq = [0]
    elif N == 1:
        seq = [0,1]
    elif N > 1:
        seq = [0,1]
        for i in range(2,N):
            next = seq[len(seq)-1] + seq[len(seq)-2]
            seq.append(next)
    return seq
   
        
def mkDNA(length, alphabet):
    if alphabet == 'STD':
        bits = 'ATGC'
    else:
        bits = alphabet
        
    max_numb = len(bits)
    lst = []
    for i in range(0,max_numb):
        lst.append(i)
    
    DNA = ''
    for i in range(0,length):
        select = random.choice(lst)
        base = bits[select]
        DNA += base
    
    return DNA

def music_notes(alphabet):
    if alphabet == 'STD':
        alphabet = ['C','C#','D','D#','E','F','F#','G','G#','A','A#','B']
    if alphabet == 'Expanded':
        alphabet = ['R','C','C#','D','D#','E','E#','F','F#','G','G#','A','A#','B','B#','X']
    music = []
    for i in range(0, len(alphabet)):
        music.append((i,alphabet[i]))
    return music

def quartenary(nmbr):
    nmbr_dict = {0: "A", 1: "C", 2: "G", 3: "T"}
    memory = []
    nucleotides = []
    if nmbr <= 3:
            remainder = nmbr % 4
            memory.append(0)
            memory.append(0)
            memory.append(remainder)
    else:
        fours = nmbr // 4
        if fours <= 3:
            memory.append(0)
            memory.append(fours)
            memory.append(nmbr % 4)
        elif fours > 3:
            sixteens = nmbr // 16
            memory.append(sixteens)
            remainder = nmbr % 16
            if remainder == 0:
                memory.append(0)
                memory.append(0)
            elif remainder > 0:
                memory.append(round(remainder // 4))
                memory.append(remainder % 4)
    
    for i in memory:
        nucleotide = nmbr_dict[i]
        nucleotides.append(nucleotide)
    
    codon = ''
    for i in nucleotides:
        codon += i
        
    return codon, memory

def mltplr(numbr, points, multiplier):
    modulus = len(points)
    original = numbr % modulus
    new = (numbr * multiplier) % modulus
    if original == 0:
        original = modulus
    if new == 0:
        new = modulus
    for i in points:
        if i[0] == original:
            index1 = i[0]
            DNA_codon1 = i[1]
            music_note1 = i[2]
        if i[0] == new:
            index2 = i[0]
            DNA_codon2 = i[1]
            music_note2 = i[2]
    seq = ''
    for i in points:
        if i[0] == abs(index2 - index1) % modulus:
            seq = i[1]
    if seq == '':
        seq = points[modulus-1][1]
    return (seq,(music_note1,music_note2))   
    
def DNAmusic(seq, points):
    count = 0
    codon = ''
    number_seq = []
    music_seq = []
    for i in range(0,len(seq)):
        if i % 3 != 0:
            codon += seq[i]
            count += 1
        if i % 3 == 0:
            codon += seq[i]
            count = 0
            for entry in points:
                trans_seq = entry[1]
                if trans_seq == codon:
                    number_seq.append(entry[0])
                    music_seq.append(entry[2])
            codon = ''
    return number_seq, music_seq

def music_nlyzr(music,nmbr_seq):
    unique_notes = list(set(music))
    unique_nmbrs = list(set(nmbr_seq))
    notes = {}
    nmbrs = {}
    for i in music:
        if i != list(notes.keys()):
            notes[i] = music.count(i)
    for i in nmbr_seq:
        if i != list(nmbrs.keys()):
            nmbrs[i] = nmbr_seq.count(i)
    return nmbrs, notes 

def synDNA_mltplr(factor,points,input_seq):
    out_seq = [] 
    syn_DNA = ''
    music = ''
    for nmbr in input_seq:
        out_seq.append(factor*nmbr)
    for nmbr in out_seq:
        for point in points:
            if point[0] == nmbr % len(points) or nmbr % len(points) == 0:
                codon = point[1]
                note = point[2]
                syn_DNA += codon
                music += note + ','
                break
    music = music[:-1]
    return [music], [syn_DNA]

### Test Run

In [3]:
codons = []
for i in range(0,64):
    codons.append((i,quartenary(i)[0]))

notes = music_notes('Expanded')

cdn_nmbr_nte = []
count = 0
countr = 0
while count != len(codons):
    code = (count+1, codons[count][1], notes[countr][1])
    cdn_nmbr_nte.append(code)
    count += 1
    countr += 1
    if countr == 16:
         countr = 0
            
notes = []
DNAseq = ''
for i in range(0,64):
    output = mltplr(i,cdn_nmbr_nte,38)
    DNAseq += output[0]

    
sny_DNA_2x = synDNA_mltplr(2,cdn_nmbr_nte,[i for i in range(1,65)])

GFP = 'ATGCGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGA\
TGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACAAACGGAAAACTTACCCTTAAATTTATT\
TGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTGACTTATGGTGTTCAATGC\
TTTGCGAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTAC\
AGGAAAGAACTATATCTTTCAAAGATGACGGGACCTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGA\
TACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAA\
TTGGAATACAACTTTAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCTA\
ACTTCAAAATTAGACACAACGTTGAAGATGGAAGCGTTCAACTAGCAGACCATTATCAACAAAATACTCC\
AATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCACACAATCTGCCCTTTCGAAAGAT\
CCCAACGAAAAGAGAGACCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGG\
ATGAACTATACAAAAGGCCTGCAGCAAACGACGAAAACTACGCTGCATCAGTT' 

RFP = 'ATGGTGAGCAAGGGCGAGGAGGATAACATGGCCATCATCAAGGAGTTCATGCGCTTCAAGGTGCACATGG\
AGGGCTCCGTGAACGGCCACGAGTTCGAGATCGAGGGCGAGGGCGAGGGCCGCCCCTACGAGGGCACCCA\
GACCGCCAAGCTGAAGGTGACCAAGGGTGGCCCCCTGCCCTTCGCCTGGGACATCCTGTCCCCTCAGTTC\
ATGTACGGCTCCAAGGCCTACGTGAAGCACCCCGCCGACATCCCCGACTACTTGAAGCTGTCCTTCCCCG\
AGGGCTTCAAGTGGGAGCGCGTGATGAACTTCGAGGACGGCGGCGTGGTGACCGTGACCCAGGACTCCTC\
CCTGCAGGACGGCGAGTTCATCTACAAGGTGAAGCTGCGCGGCACCAACTTCCCCTCCGACGGCCCCGTA\
ATGCAGAAGAAGACCATGGGCTGGGAGGCCTCCTCCGAGCGGATGTACCCCGAGGACGGCGCCCTGAAGG\
GCGAGATCAAGCAGAGGCTGAAGCTGAAGGACGGCGGCCACTACGACGCTGAGGTCAAGACCACCTACAA\
GGCCAAGAAGCCCGTGCAGCTGCCCGGCGCCTACAACGTCAACATCAAGTTGGACATCACCTCCCACAAC\
GAGGACTACACCATCGTGGAACAGTACGAACGCGCCGAGGGCCGCCACTCCACCGGCGGCATGGACGAGC\
TGTACAAGTAA'

music_trans1 = DNAmusic(GFP,cdn_nmbr_nte)
GFP_analysis = music_nlyzr(music_trans1[1],music_trans1[0])

music_trans2 = DNAmusic(RFP,cdn_nmbr_nte)
RFP_analysis = music_nlyzr(music_trans2[1],music_trans2[0])


In [41]:
print("The gene for GFP is approximately: " + str(len(GFP)) +" nucleotides long\
 or: " + str(len(GFP)/3) + " Amino Acids Long")
print("The gene for RFP is approximately: " + str(len(RFP)) +" nucleotides long\
 or: " + str(len(RFP)/3) + " Amino Acids Long")

# Generate a random nucleotide sequence 168 kbp long
syn_DNA = mkDNA(16800,'STD')

music_trans_synDNA = DNAmusic(syn_DNA,cdn_nmbr_nte)
synDNA_analysis = music_nlyzr(music_trans_synDNA[1],music_trans_synDNA[0])
#print(music_trans_synDNA)
#print(synDNA_analysis)

Fibonacci_seq_DNA = synDNA_mltplr(2,cdn_nmbr_nte,fib_seq(64)[1:])

The gene for GFP is approximately: 753 nucleotides long or: 251.0 Amino Acids Long
The gene for RFP is approximately: 711 nucleotides long or: 237.0 Amino Acids Long


(['C,C,D,E,G,X,G,G,D,B,C,X,C,C,D,E,G,X,G,G,D,B,C,R,C,C,D,E,G,X,G,G,D,B,C,X,C,C,D,E,G,X,G,G,D,B,C,R,C,C,D,E,G,X,G,G,D,B,C,X,C,C,D'],
 ['AACAACAATACCAGCATTCGCGGCAATGTCTACCTTCACTACAATTCCTGCGTTGGCCGCAATCTCGACAAAGACGACAATGCCGGCATTTGCAGCAATATCCACCTTTACCACAATCCCCGCGTTAGCTGCAATTTCAACAAAAACAACAATACCAGCATTCGCGGCAATGTCTACCTTCACTACAAT'])

### Works Cited
1) GFP Sequence

“Aequorea Victoria Isolate SGFP-206 Green Fluorescent Protein (GFP) 
          Gen - Nucleotide - NCBI.” National Center for Biotechnology 
          Information, U.S. National Library of Medicine, 
          www.ncbi.nlm.nih.gov/nuccore/JX472995.1?report=fasta. 
          Accessed 10 Aug. 2024. 
 
 
2) RFP Sequence

“Synthetic Construct Monomeric Red Fluorescent Protein Gene, Complete 
          C - Nucleotide - NCBI.” National Center for Biotechnology 
          Information, U.S. National Library of Medicine, 
          www.ncbi.nlm.nih.gov/nuccore/AY678264.1?report=fasta. 
          Accessed 10 Aug. 2024. 