In [13]:
import pandas as pd
import numpy as np
# UNCOMMENT LINE BELOW IF SERVER DOESN'T HAVE BIO 
# !pip install biopython
from Bio import SeqIO
import re
import itertools
# from google_drive_downloader import GoogleDriveDownloader as gdd
import cProfile
from urllib.request import urlretrieve

In [14]:
# data import
# bio_sample_data_gdd_id = '1hqDeH_JgND_PY0DX_sFfUQcAuZXoQbAA'

# gdd.download_file_from_google_drive(file_id=bio_sample_data_gdd_id,
#                                     dest_path='./bio_sampledata.txt',
#                                     unzip=False)
urlretrieve("https://pastebin.com/raw/MZfiRmT6", './bio_sampledata.txt')

!head ./bio_sampledata.txt

>WP_132583182.1 polyhydroxybutyrate depolymerase [Rheinheimera sp. D18]
MRVIKLSALTLLSLFATTLNATPALSDLKLEQSITLSGLSSGAYMAGQYHVAFAEQVDGVAMLASGPVYCAQNSLGLALE
HCFNKDTSAPDILAIKQYIAAQRSAGKLAPLITLKDDKIWIFHGAKDATVQPKLATILYEQYKQWVTPENIVLINDKPFA
HTFPTDRPNLGSCERSEAPYLASCGYDASGSLLQHLLGKVKAKTTSTTGTLLEINQHQLAAAAKDTLAEIGYLYVPVSCA
AGEPCKLHVSFHGCKQNANSVGDAFVTGTDLNNYADTNNLVIFYPQTVASSINPFNPNACWDWWGYTGADYATKTGPQLQ
AVHQLVQALLP
>WP_068238279.1 polyhydroxybutyrate depolymerase [Rheinheimera sp. EpRS3]
MHKLKLSSFTLLSAVTLPLAAQQALPKLQLAEQITLSGLSSGAYMAGQYHLAFAEQVSGVAMLAGGPVYCAQNSLGLALE
HCFNKASSSPDMSAINQYLTAQRTAGKLAPVTALKDDKVWIFHGSKDTTVYPGLAGVLNQQYQQWVDAGNIALITDKAFS
HTFPTDNTALGSCDVSETPFLASCNYDAAGELLKHLLGSVKTKTATSSGQLMPFNQHQLAAAAKDTLAETGYLYIPQSCA


In [15]:
def read_fasta(filename):
  descriptor_line_pattern = re.compile(">([\w|\.]*) ([\w|\s|\:|\(|\)|\-]*) \[(.*)\]")

  with open(filename, 'r') as file:
    records = []

    id = None
    enzyme = None
    species = None
    genetic_data = ''

    for line in file.readlines():
      if line[0] == '>':
        if id is not None:
          records.append([id, enzyme, species, genetic_data])
        match = descriptor_line_pattern.match(line)
        if match is None:
          print('HELP', line)
          return
        id = match.group(1)
        enzyme = match.group(2)
        species = match.group(3)
        genetic_data = ''
      else:
        genetic_data += line.strip()
    records.append([id, enzyme, species, genetic_data])

  return pd.DataFrame(records, columns=['ID', 'Enzyme', 'Species', 'Genes'])

        
data = read_fasta('./bio_sampledata.txt')
data.head()

Unnamed: 0,ID,Enzyme,Species,Genes
0,WP_132583182.1,polyhydroxybutyrate depolymerase,Rheinheimera sp. D18,MRVIKLSALTLLSLFATTLNATPALSDLKLEQSITLSGLSSGAYMA...
1,WP_068238279.1,polyhydroxybutyrate depolymerase,Rheinheimera sp. EpRS3,MHKLKLSSFTLLSAVTLPLAAQQALPKLQLAEQITLSGLSSGAYMA...
2,PKM21193.1,polyhydroxybutyrate depolymerase,Gammaproteobacteria bacterium HGW-Gammaproteob...,MRTFKLFSFTLLSAVTLPLAAQQALPKLQLAEQITLSGLSSGAYMA...
3,WP_215397783.1,polyhydroxybutyrate depolymerase,Rheinheimera sp. GL-53,MPKLKLVSFTLLSVFALPLTAELTLPKLQLAEQITLSGLSSGAYMA...
4,MBJ92526.1,polyhydroxybutyrate depolymerase,Alteromonadaceae bacterium,MRFVTLTVLLLSAPSPFTLANNNSAISTLNLDETITVSGLSSGAYM...


In [16]:
data.iloc[0]['Genes']

'MRVIKLSALTLLSLFATTLNATPALSDLKLEQSITLSGLSSGAYMAGQYHVAFAEQVDGVAMLASGPVYCAQNSLGLALEHCFNKDTSAPDILAIKQYIAAQRSAGKLAPLITLKDDKIWIFHGAKDATVQPKLATILYEQYKQWVTPENIVLINDKPFAHTFPTDRPNLGSCERSEAPYLASCGYDASGSLLQHLLGKVKAKTTSTTGTLLEINQHQLAAAAKDTLAEIGYLYVPVSCAAGEPCKLHVSFHGCKQNANSVGDAFVTGTDLNNYADTNNLVIFYPQTVASSINPFNPNACWDWWGYTGADYATKTGPQLQAVHQLVQALLP'

In [17]:
# load scoring matrix
url = 'https://www.ncbi.nlm.nih.gov/Class/FieldGuide/BLOSUM62.txt'
blosum62 = pd.read_csv(url, skiprows=6, delim_whitespace=True, index_col=0)
blosum62['-'] = blosum62['*']
blosum62.head()

Unnamed: 0,A,R,N,D,C,Q,E,G,H,I,...,S,T,W,Y,V,B,Z,X,*,-
A,4,-1,-2,-2,0,-1,-1,0,-2,-1,...,1,0,-3,-2,0,-2,-1,0,-4,-4
R,-1,5,0,-2,-3,1,0,-2,0,-3,...,-1,-1,-3,-2,-3,-1,0,-1,-4,-4
N,-2,0,6,1,-3,0,0,0,1,-3,...,1,0,-4,-2,-3,3,0,-1,-4,-4
D,-2,-2,1,6,-3,0,2,-1,-1,-3,...,0,-1,-4,-3,-3,4,1,-1,-4,-4
C,0,-3,-3,-3,9,-3,-4,-3,-3,-1,...,-1,-1,-2,-2,-1,-3,-3,-2,-4,-4


In [18]:
def matrix(a, b, gap_cost=4):
    H = np.zeros((len(a) + 1, len(b) + 1), np.int)

    for i in range(1, len(a) + 1):
      for j in range(1, len(b) + 1):
        match = H[i - 1, j - 1] + blosum62.loc[a[i-1], b[j-1]] #Line causing an index error, split above to see which part is caising error
        delete = H[i - 1, j] - gap_cost
        insert = H[i, j - 1] - gap_cost
        H[i, j] = max(match, delete, insert, 0)
    return H

def traceback(H, b, b_='', old_i=0):
    # flip H to get index of **last** occurrence of H.max() with np.argmax()
    H_flip = np.flip(np.flip(H, 0), 1)
    i_, j_ = np.unravel_index(H_flip.argmax(), H_flip.shape)
    i, j = np.subtract(H.shape, (i_ + 1, j_ + 1))  # (i, j) are **last** indexes of H.max()
    if H[i, j] == 0:
        return b_, j
    b_ = b[j - 1] + '-' + b_ if old_i - i > 1 else b[j - 1] + b_
    return traceback(H[0:i, 0:j], b, b_, i)

def smith_waterman(a, b, gap_cost=4):
    # a, b = a.upper(), b.upper()
    H = matrix(a, b, gap_cost)
    b_, pos = traceback(H, b)
    return pos, pos + len(b_)

In [20]:
a = data.iloc[0]['Genes']
b = data.iloc[1]['Genes']
# matrix(a,b)
# print(smith_waterman(a, b))
# print(smith_waterman(b, a))
cProfile.run('smith_waterman(a, b)')
# cProfile.run('matrix(a, b)')



         4172680 function calls (4172390 primitive calls) in 1.655 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      582    0.000    0.000    0.003    0.000 <__array_function__ internals>:2(flip)
      291    0.000    0.000    0.001    0.000 <__array_function__ internals>:2(unravel_index)
        1    0.232    0.232    1.629    1.629 <ipython-input-18-eb751f97d257>:1(matrix)
    291/1    0.004    0.000    0.026    0.026 <ipython-input-18-eb751f97d257>:12(traceback)
        1    0.000    0.000    1.655    1.655 <ipython-input-18-eb751f97d257>:22(smith_waterman)
        1    0.000    0.000    1.655    1.655 <string>:1(<module>)
   109561    0.013    0.000    0.013    0.000 blocks.py:233(internal_values)
   219122    0.037    0.000    0.054    0.000 common.py:317(apply_if_callable)
   109561    0.018    0.000    0.018    0.000 contextlib.py:352(__init__)
   109561    0.009    0.000    0.009    0.000 contextlib.py:355(__ent

In [21]:
def get_counts(a, df):
  counts = [0] * len(a)
  for x in range(df.shape[0]):
    print(x)
    res = smith_waterman(a, df.iloc[x]["Genes"])
    for y in range(res[0], res[1] + 1):
      counts[y] += 1
  return counts

counts = get_counts(data.iloc[0]['Genes'], data.iloc[1:])
print(counts)

0
1
2
3
4
5
6


KeyboardInterrupt: 

In [None]:
smith_waterman(data.iloc[0]["Genes"], data.iloc[14]["Genes"]) #Causing and error, think it might have something to do with blosum62 indexing
#blosun indexing wrong

IndexError: ignored

In [None]:

# for reference in data