In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [20]:
s = """A   71.03711
C   103.00919
D   115.02694
E   129.04259
F   147.06841
G   57.02146
H   137.05891
I   113.08406
K   128.09496
L   113.08406
M   131.04049
N   114.04293
P   97.05276
Q   128.05858
R   156.10111
S   87.03203
T   101.04768
V   99.06841
W   186.07931
Y   163.06333"""
proteinWeights = {x.split("   ")[0] : float(x.split("   ")[1]) for x in s.split("\n")}

In [21]:
def countCTAG(s):
    d = {'A' : 0,'C' : 0,'G' : 0,'T' : 0}
    for letter in s:
        d[letter] = d[letter] + 1
    return d.values()



In [22]:
def reverseComplement(s):
    d = {'A' : 'T','C' : 'G','G' : 'C','T' : 'A'}
    ans = ''
    for i in range(len(s)):
        ans += d[s[len(s)-i-1]]
    return ans

In [23]:
def mendelian(k,m,n):
    total = k + m + n
    chanceOfaa = (m/total)*((m-1)/(total-1))*0.25
    chanceOfaa += 2*0.5*(m/total)*((n)/(total-1))
    chanceOfaa += (n/total)*((n-1)/(total-1))
    return 1 - chanceOfaa

In [24]:
def motifs(s,t):
    last_idx = 0
    answers = []
    while True:
        idx = s.find(t, last_idx)
        if idx > -1:
            last_idx = idx + 1
            answers.append(idx + 1)
        else:
            return answers

In [25]:
def findPalindromes(s):
    ans = []
    for i in range(len(s)):
        for j in range(4,13):
            if i+j > len(s): break
            substring = s[i:i+j]
            if substring == reverseComplement(substring):
                ans.append([i+1, j])
    return ans

In [26]:
def GCContent(s):
    return (s.count('G') + s.count('C'))/len(s)

In [27]:
def hammingDistance(s1,s2):
    ans = 0
    for i in range(len(s1)):
        if s1[i] != s2[i]: ans += 1
    return ans

In [28]:
def longestSharedSubstring(l):
    l.sort(key=lambda x: len(x))

    substringLength = 2
    s = l[0]
    solution = ''
    for substringLength in range(2, len(s)):
        validLength = True

        for i in range(len(s)):
            if i + substringLength > len(s):
                validLength = False
                break
            substring = s[i:i+substringLength]
            validString = True
            for string in l:
                if string.find(substring) == -1: 
                    validString = False
                    break
            if validString:
                solution = substring
                break
        if not validLength:
            return solution 
    

In [29]:
def permutations(remaining, current):
    if len(remaining) == 0:
        print(current)
        return current
    for number in remaining:
        permutations([x for x in remaining if x != number], current + [number])
    

In [32]:
def proteinWeight(s):
    ans = 0
    for acid in s:
        ans += proteinWeights[acid]
    return ans 
proteinWeight("IKPGDHRFTERKNYKIHQRQPGRDWYWFCQTDMTEWLQKNRGYIRYFCWCEKAITVCCTFVSHGPIMAKWEGLTDRKVDCNSHNCKSSFYPVWEIMVQIKMYHEDIHNFGDECTICQHDMISLQTRTMAANLASQKEDLWELKEGACLFYRFGWRMPTDMAPVFNVTVCEIATEVFCWYIHTTITHEHPPQTGNHYDNYKGNQPSMGSKVKWNWNKYPQWAESQPKQRPANDQCWEQSGAPMPGCKLACHFNPQSRWRRIAWMDTFDMAWLDRVIEYTKIEWGAYPDHKDKYEFYLCVSNCNVMDCKYHAEAMIQFDATDDDWYSCNLPHFCCGSMEFKFHQTGHDPMTTLVRMTNWAFNWRENCSDWFECLLQSATCSLSACDYTWNYLGQASVGEITSDYTHSKYMTWFKLNGIYPIRIEVWGEAQNYHINQEHYSQDSCFLAYDFDLHRERTRVFNEMISTEYPKYDYDLFYPSPCRKQAEALHHIWVEKWPRADDTLFEWEGFVSRVGSRIPRVPTACYCYYSTRIDRKLPTFYMVDIAKPYHVLCERVRGEYKTKAASTSNDYYHGSESTTERTFIDRKMDAFLLSISNCFHQGRQRCRKPRDNVALDAQNTYGSDNKDMCEAPFVSMYIRMNPDHQNWGIEQPFSCGCAFPCALNYFSHNRPTSLLEFVITMYQRLRMMISYHTEAPSPYVHHRMCGTGPNGAVPKKPLTDWWDIRASFHHSADVRIDVMLNVCIGSRPSTGEPLAAPGCWYRNTPERHFCACGDAIANRFHIRDNQGNSRAGYNMWHRMEQGTINVQYASMHFVSFCAIEAVFCWFHDWGQSFMTGFVFYNPELGPAYKVPCSNMFHMMLCLLTDIMLSYNTPTQPWLEWGSRSDYWYDEDLVNDEIRCGEGTMQGTTDRDLLGWLCKEFFKTCCRSWVLTPAPAPRCKTKMTMPFCNPNHKEPWQTTKVSVDANESMTPWFNRWIANVHQVTIGPES")

115878.75272000064

In [10]:
def graphOverlap(dic, k=3): #dic is {"name" : "DNA string", ...}
    n = len(dic)
    strings = list(dic.values())
    names = list(dic.keys())
    print(strings)
    print(names)
    ans = []
    for i in range(n):
        for j in range(n):
            if i != j:
                if strings[i][:k] == strings[j][-k:]:
                    ans.append(names[j] + " " + names[i])
    return ans

graphOverlap({"Rosalind_0498" : "AAATAAA","Rosalind_2391":"AAATTTT","Rosalind_2323":"TTTTCCC","Rosalind_0442":"AAATCCC","Rosalind_5013":"GGGTGGG"})

['AAATAAA', 'AAATTTT', 'TTTTCCC', 'AAATCCC', 'GGGTGGG']
['Rosalind_0498', 'Rosalind_2391', 'Rosalind_2323', 'Rosalind_0442', 'Rosalind_5013']


['Rosalind_0498 Rosalind_2391',
 'Rosalind_2391 Rosalind_2323',
 'Rosalind_0498 Rosalind_0442']

In [None]:
def shortestSuperstring(dic):
    k = min([len(x) for x in list(dic.values())])
    edges = graphOverlap(dic, k)
    

In [21]:
with open('inpt.txt') as f:
    names = []
    l = []
    for i, line in enumerate(f.readlines()):
        if line[0] == '>':
            names.append(line.strip()[1:])
            if i!= 0: l.append(s)
            s = ''
        else:
            s += line.strip()
    l.append(s)
    dic = {names[x] : l[x] for x in range(len(l))}
    print(dic)
    
    with open('outpt.txt', 'w') as o:
        ans = graphOverlap(dic)
        for s in ans:
            o.write(s + "\n")
        

{'Rosalind_6900': 'TAACGATAGCCGCTAGGCATGTAGCCTGCCTACGAGATTTTGCCACTTGTGGATAAAGAGGGCTTCGTAAACGTTTCTGGCTGAC', 'Rosalind_2572': 'GGCAGGTTGCAAAACGTCTTCAATCGAGTCCAACAAAAGCTGATCTCGCCCTGGCGGGTTATCTATCAGCGTGTGAAGACGGTTCAACGC', 'Rosalind_6414': 'GCATATAATTTAAAATAAGCCTTGTATAACGGCTTGGCTTATCGTGCCGCCCCTTGCGATTAGCTTGTTGGTTTTCCGAGCAGATATCGCTCGTACTCCA', 'Rosalind_8587': 'TAATGTGGTGTGATCTCCGTAATCGATGAAGCGCTGTTATCCAACTCGTAATCTGTTCCACGAGATTCTGTAGCGTGTTATCCGGACGTTACCTGC', 'Rosalind_4151': 'ACCTACAAGATCTACCGCAAAAAGGCGTTTACTGGGGCAGCCCTTTCTAGGGACTAGTTGATATACGCCCGCGATGGGCAATA', 'Rosalind_6114': 'CCTATGACATTGGACATACCGGCAGTTTCCTACCTTACTCGCGTTGGTAACACACGTGATTCCAAAACTCGAGAAATGCCAGTGCATATGAAGGCCCCG', 'Rosalind_1642': 'AGAGAGTTTCGGCCATTGCTGAAAACAAAGACCGCAGTAGTCACCGGGGCGGCCCAACCGGCCGACCTAACTGTCCTCAAGCACATAC', 'Rosalind_2095': 'GCGCGACGCCGGATTAACCTTGTTGGCACATGGTCGCGATTCGTTGGTCAGGCTAGATGACGAAGCCAATTGTCAGGGACCT', 'Rosalind_8151': 'TCAAATCGGGCCACGTTTTTACTCCTTCGACGCGCGAAAATAACCCTCAGTGTTTACCGGCTGGCCCGTACCACCGCCACGAT', 'Ros