# Rosalind - Bioinformatics Stronghold

In [1]:
from itertools import product

import numpy as np
import pandas as pd

import requests

In [2]:
def open_FASTA_file(file_directory):
    file = open(file_directory)
    file_items = {}
    item = ""
    for line in file:
        if line[0] == ">":
            ID = line.strip("\n")
            file_items[ID] = ""
        else:
            file_items[ID] +=line.strip("\n")
        
    return file_items

## LCSM - Finding a Shared Motif

In [3]:
motifs = open_FASTA_file("sample")
motifs = list(motifs.values())

def shortest_motif(motifs):
    shortest_motif_length = min(list(map(len, motifs)))
    for motif in motifs:
        if len(motif) == shortest_motif_length:
            return motif       

def create_frames(reference, size):
    frames = []
    for frame in range(len(reference)):
        if frame + size <= len(reference):
            frames.append(reference[frame:frame+size])
    frames = list(set(frames))

    return frames

def find_longest_common_substring(motifs, size):
    if size > 0:
        reference_motif = shortest_motif(motifs=motifs)
        frames = create_frames(reference_motif, size)

        for frame in frames:
            is_substring = []
            for motif in motifs:
                is_substring.append(motif.find(frame))
            
            if -1 not in is_substring:
                return frame
        
        return find_longest_common_substring(motifs, size-1)
    else:
        return -1

#size = len(shortest_motif(motifs))
#print(find_longest_common_substring(motifs, size=size))


FileNotFoundError: [Errno 2] No such file or directory: 'sample'

## REVP - Locating Restriction Sites

In [None]:
def find_complimentary_DNA(template):
    forward = "ATGC"
    reverse = "TACG"
    table = template.maketrans(forward, reverse)

    return template.translate(table)

def is_palindrome(sequence):
    return sequence[::-1] == find_complimentary_DNA(sequence)

def find_palindromes(size):
    products = list(product("ATGC", repeat=size))
    products = ["".join(item) for item in products]
    palindromes = [DNA for DNA in products if DNA[::-1] == find_complimentary_DNA(DNA)]
    
    return palindromes

def create_palindromes (start_size, end_size):
    palindromes = []
    for size in range(start_size, end_size+1, 2):
        palindromes += find_palindromes(size)

    return palindromes

def save_palindromes_to_txt(palindromes):
    palindromes = ",".join(palindromes)
    file = open("palindromes.txt", "w")
    file.write(palindromes)
    file.close()

#save_palindromes_to_txt(create_palindromes(4, 12))

def load_palindromes(file_name):
    file = open(file_name + ".txt")
    file = file.read()
    palindromes = file.split(",")

    return palindromes

def find_restriction_sites(sequence, restriction_sites):
    for site in restriction_sites:
        location = sequence.find(site)
        while location != -1:
            print(location+1, len(site))
            location = sequence.find(site, location+1)

#palindromes = load_palindromes("palindromes")
#sequence = open_FASTA_file("rosalind_revp")
#sequence = list(sequence.values())[0]
#find_restriction_sites(sequence=sequence, restriction_sites=palindromes)




365 4
502 4
839 4
275 4
663 4
701 4
911 4
152 4
796 4
177 4
230 4
833 4
872 4
700 4
93 4
504 4
252 4
294 4
456 4
257 4
378 4
451 4
595 4
670 4
846 4
26 4
147 4
482 4
627 4
235 4
370 4
437 4
444 4
591 4
756 4
866 4
105 4
527 4
737 4
764 4
1 4
164 4
357 4
409 4
411 4
645 4
827 4
154 4
215 4
583 4
57 4
358 4
410 4
515 4
644 4
767 4
455 6
669 6
436 6
104 6
845 6
626 6
755 6
251 6
293 6
763 6
409 6
871 6
454 8
668 8
435 8
453 10
667 10
452 12


## PRTM - Calculating Protein Mass

In [None]:
mass_table_file = r"C:\Users\vente\OneDrive\Documents\Code\bioinformatics_challenges\databases\monoisotopic_mass_table_amino_acids.txt"
mass_table_file = list(map(lambda x: x.split(), open(mass_table_file).read().split("\n")))
mass_table = {}
for x in mass_table_file: mass_table[x[0]] = x[1]

amino_acid_file = r"C:\Users\vente\Downloads\rosalind_prtm.txt"
amino_acids = open(amino_acid_file).read().strip()

protein_mass = 0
for x in amino_acids:
    protein_mass+=float(mass_table[x])
protein_mass

103707.11785000052

## SPLC - RNA Splicing

In [60]:
file_directory = r"C:\Users\vente\Downloads\rosalind_splc.txt"

sequences = list(open_FASTA_file(file_directory).values())
DNA = sequences[0]
introns = sequences[1:]

for intron in introns:
    DNA = DNA.replace(intron, "")

codons = []
for base in range(len(DNA)):
    if (base+1) % 3 == 0:
        codons.append(DNA[base+1-3:base+1])


In [61]:
RNA_codons_file = r"C:\Users\vente\OneDrive\Documents\Code\bioinformatics_challenges\databases\RNA_codon_table.txt"
#RNA_codons = [[codon for codon in row.split("  ") if codon] for row in open(RNA_codons_file).read().split("\n")]
RNA_codons = [codon[:3].strip() for codon in open(RNA_codons_file).read().replace("   ", "\n").split("\n") if codon]
DNA_codons = [codon.replace("U", "T") for codon in RNA_codons]
amino_acids = [codon[4:].strip() for codon in open(RNA_codons_file).read().replace("   ", "\n").split("\n") if codon]
DNA_amino_acids = {}

for codon in range(len(DNA_codons)):
    DNA_amino_acids[DNA_codons[codon]] = amino_acids[codon]

protein = "".join([DNA_amino_acids[codon] for codon in codons])
protein[:protein.find("Stop")]

'MFSLRFIISVLLALSGVHRFARPHPEGTSLKLDRSSKSATARYERHRSSFPEDGSGFSDGEQRATLLYLCRYEMASPLQSRQYGTRVHSRVISNLFSNVLEARSRIRLLTCPLGSPTSPAFSYLVCIADQLRVRLVYLAFSGYILVTGAARFLGKIVVVNAKLTQRLVGV'

## TRAN - Transitions and Transversions

In [10]:
file_directory = r"C:\Users\vente\Downloads\rosalind_tran.txt"
file = open_FASTA_file(file_directory)

sequences = list(file.values())
sequence_1 = sequences[0]
sequence_2 = sequences[1]

purines = ["A", "G"]
pyrimidines = ["C", "T"]
transitions = 0
transversions = 0

for base in range(len(sequence_1)):
    if sequence_1[base] != sequence_2[base]:
        if (sequence_1[base] in purines and sequence_2[base] in purines) or (sequence_1[base] in pyrimidines and sequence_2[base] in pyrimidines):
            transitions+=1
        else:
            transversions+=1

print(round(transitions/transversions, 11))

2.09090909091


## SSEQ - Finding a Spliced Motif

In [13]:
file_directory = r"C:\Users\vente\Downloads\rosalind_sseq.txt"
sequences_s, sequences_t = tuple(open_FASTA_file(file_directory).values())
indeces = []
start = 0
for base in range(len(sequences_t)):
    index = sequences_s.find(sequences_t[base], start)
    indeces.append(index+1)
    start=index+1+1

print(" ".join(list(map(str, indeces))))

19 23 29 38 44 47 49 59 67 69 73 76 80 94 98 100 106 110 112 121 123 126 131 139 141 145 149 153 162 166 169 174 177 179 182 191 200 203 205 212 214 218 223 229 236 239 243 246 258 278 281 289 291 299 304 311 321 324 332 340 342 345 359 364 368 380 385 388 400 405


## MPRT - Finding a Protein Motif

In [16]:
file_directory = r"C:\Users\vente\Downloads\rosalind_mprt.txt"
proteins = open(file_directory).read().split()

motif_locations = []
for protein in proteins:
    if "_" in protein:
        accession = protein[:protein.find("_")]
    else:
        accession = protein
    
    fasta = requests.get(f"http://www.uniprot.org/uniprot/{accession}.fasta").text
    sequence = fasta[fasta.find("\n")+1:].replace("\n", "")
    
    indeces = []
    for AA in range(len(sequence)):
        try:
            if sequence[AA] == "N":
                if sequence[AA+1] == "P":
                    continue
                else:
                    allowed = ["S", "T"]
                    if sequence[AA+2] in allowed:
                        if sequence[AA+3] == "P":
                            continue
                        else:
                            indeces.append(str(AA+1))
        except:
            continue
    if indeces:
        print(protein + "\n" + " ".join(indeces))


Q8ER84
33
P01046_KNL1_BOVIN
47 87 168 169 197 204
P39873_RNBR_BOVIN
88
Q14ID0
49
P19835_BAL_HUMAN
207
P23185
71 77
P01588_EPO_HUMAN
51 65 110
O82484
104 108 546 742 765
P43541
129
P04141_CSF2_HUMAN
44 54
P00740_FA9_HUMAN
203 213


## LCSQ - Finding a Shared Spliced Motif

In [16]:
file_directory = r"C:\Users\vente\Downloads\rosalind_lcsq (2).txt"
sequences = list(open_FASTA_file(file_directory).values())
if len(sequences[0]) < len(sequences[1]):
    reference = sequences[0]
    query = sequences[1]
else:
    reference = sequences[1]
    query = sequences[0]

long_common = ""
for base1 in range(len(reference)):
    start = 0
    common = ""
    for base in range(base1, len(reference)):
        index = query.find(reference[base], start)
        if index == -1:
            continue
        else:
            common+=reference[base]
            start = index+1
    if len(common) > len(long_common): long_common = common

print(long_common)

ACGCAACAAACCCGTTATGACCACGCCCGGCCTAGAAGGCTTTCTTAGCACGGTACGCGGAAGCATTTTATCGACGTCTTAGCGTGTCTGTGGGGGGGAGAGTTGCTCAGAGATGTTTGGCAATATTAACATGCCGGAAACCTTTCGTGGGGAATTTGGAAGTCCCCCAAATGAACCATGCCGCGGAACTTTAGGAAGTTGGCAAGATTAAGTAAATGCAAGGCCTTGTCTCGATGCGGGAAACTGCAGATAGATGCGTCCTACCACACGTATCATGCTT
