In [10]:
# https://www.geeksforgeeks.org/boyer-moore-algorithm-good-suffix-heuristic/?ref=header_search

import numpy as np
from line_profiler import LineProfiler
from pysr import PySRRegressor
import random
import os
import sys
import math
import pickle

### Boyer-Moore algorithm with the good-suffix heuristic, used to find a pattern in a given text string

In [11]:
# Python3 program for Boyer Moore Algorithm with 
# Good Suffix heuristic to find pattern in 
# given text string

# preprocessing for strong good suffix rule
def preprocess_strong_suffix(shift, bpos, pat, m):
	"""Fill `bpos` and part of `shift` for the strong good-suffix rule.

	Both `shift` and `bpos` are modified in place and must be indexable
	from 0 to m inclusive; `shift` entries that are still 0 are treated as
	"not yet assigned". Nothing is returned.

	shift -- table of shift distances (only zero entries are written)
	bpos  -- table of border start positions (every entry 0..m is written)
	pat   -- the pattern
	m     -- length of the pattern
	"""
	# Sentinel entry: position m is paired with m + 1.
	border = m + 1
	bpos[m] = border

	# Walk the pattern positions from m down to 1.
	for pos in range(m, 0, -1):
		# While the characters to the left of `pos` and `border` differ,
		# keep moving `border` to the right along the recorded borders.
		while border <= m and pat[pos - 1] != pat[border - 1]:
			# The preceding characters differ, so this is where skipping
			# stops: record the distance from pos to border, once only.
			if shift[border] == 0:
				shift[border] = border - pos

			# Continue with the next recorded border position.
			border = bpos[border]

		# The characters matched (or border ran past m): store the border
		# start for the position one step to the left.
		border -= 1
		bpos[pos - 1] = border

# Preprocessing for case 2
def preprocess_case2(shift, bpos, pat, m):
	"""Fill every still-zero entry of `shift` (indices 0..m) in place.

	Starts from the border position stored in bpos[0]; whenever the index
	reaches the current border position, the next border position is read
	from `bpos`. `pat` is accepted for signature symmetry with
	preprocess_strong_suffix but is not read. Nothing is returned.
	"""
	widest = bpos[0]
	idx = 0
	while idx <= m:
		# Only entries not assigned by the strong-suffix pass are written.
		if shift[idx] == 0:
			shift[idx] = widest

		# The suffix has become shorter than the current border: move on
		# to the next border position.
		if idx == widest:
			widest = bpos[widest]

		idx += 1

'''Search for a pattern in given text using 
Boyer Moore algorithm with Good suffix rule '''
def boyer_moore(text, pat): # n = len(text), m = len(pat)
	"""Scan `text` for `pat` using the good-suffix shift table.

	Builds the `bpos` and `shift` tables with preprocess_strong_suffix and
	preprocess_case2, then, for each alignment `s`, compares the pattern
	right-to-left against the text and advances `s` by the table entry for
	the mismatch position (or by shift[0] after a complete match).

	Match positions are neither stored nor printed, and there is no return
	statement, so the call evaluates to None.

	NOTE: elsewhere in this notebook the function is wrapped by
	LineProfiler and its per-line hit counts are summed, so adding,
	removing or merging statements here changes that measurement.
	"""

	# s is shift of the pattern with respect to text
	s = 0
	m = len(pat)
	n = len(text)

	# border positions, one entry per index 0..m (filled by preprocessing)
	bpos = [0] * (m + 1)

	# initialize all occurrence of shift to 0
	shift = [0] * (m + 1)

	# do preprocessing (both calls fill the tables in place)
	preprocess_strong_suffix(shift, bpos, pat, m)
	preprocess_case2(shift, bpos, pat, m)

	# s <= n - m keeps the whole pattern inside the text at every alignment
	while s <= n - m:
		# start comparing at the last character of the pattern
		j = m - 1
		
		''' Keep reducing index j of pattern while characters of 
			pattern and text are matching at this shift s'''
		while j >= 0 and pat[j] == text[s + j]:
			j -= 1
			
		''' If the pattern is present at the current shift, 
			then index j will become -1 after the above loop '''
		if j < 0:
			# full match at s: it is not recorded, the scan just moves on
			s += shift[0]
		else:
			
			'''pat[i] != pat[s+j] so shift the pattern 
			shift[j+1] times '''
			# mismatch at pattern index j: advance by the entry for j + 1
			s += shift[j + 1]

In [12]:
def generate_pat(n, path='dna.txt'):
  """Read a text file and cut a random substring of length `n` out of it.

  The start offset is drawn uniformly from every position at which a full
  `n`-character slice fits, so the returned pattern always has exactly `n`
  characters and always occurs in the returned text.

  n    -- requested pattern length; must satisfy 0 <= n <= len(file content)
  path -- file to read (defaults to 'dna.txt', the file used by this notebook)

  Returns a tuple (dna, pattern): the full file content and the substring.
  Raises ValueError when `n` is negative or longer than the file content.
  """
  with open(path, 'r') as f:
    dna = f.read()

  if n < 0 or n > len(dna):
    raise ValueError(
      'pattern length must be between 0 and %d, got %r' % (len(dna), n)
    )

  # Last valid start is len(dna) - n; anything later would truncate the slice.
  inicio = random.randint(0, len(dna) - n)
  pattern = dna[inicio: inicio + n]
  return dna, pattern

## Frequency Count Method

In [13]:
X_y = []
x1, x2, y = [], [], []

# One profiled run per pattern length, for lengths 50 through 54.
for n in range(50, 55):
  # A fresh profiler per run so the counts of one run do not carry over.
  lprofiler = LineProfiler()
  lp_wrapper = lprofiler(boyer_moore)

  text, pat = generate_pat(n)
  lp_wrapper(text, pat)

  stats = lprofiler.get_stats()

  # Flatten the per-function timing entries; element 0 of an entry is kept
  # as the line number and element 1 as the hit count.
  line_numbers = [entry[0] for timing in stats.timings.values() for entry in timing]
  hits = [entry[1] for timing in stats.timings.values() for entry in timing]

  # Features: n and 2*n; target: total number of line hits in this run.
  x1.append(n)
  x2.append(n*2)
  y.append(sum(hits))

X_reshaped = np.column_stack((x1, x2))
y_np = np.array(y)


In [14]:
resultados_com_menor_loss = []
repeat = 5
registros = []

# Binary-operator sets tried on every repetition, in the order in which
# their results are appended to `registros`.
binary_operator_sets = [
  ["*", "+"],  # first combination
  ["*"],       # second combination
  ["+"],       # third combination
]

original_stdout = sys.stdout

with open(os.devnull, 'w') as devnull:
  # Silence Python-level prints while fitting.
  # NOTE(review): the saved output of this cell still shows backend progress
  # text, so this redirect apparently does not cover everything - confirm.
  sys.stdout = devnull
  try:
    for i in range(repeat):
      for binary_operators in binary_operator_sets:
        reg = PySRRegressor(
          binary_operators=list(binary_operators),
          unary_operators=["log", "square", "cube"],
        )

        fit = reg.fit(X_reshaped, y)
        best_program = fit.get_best()

        # Store the fields of the best program as a plain list; later cells
        # read positions 0, 1, 2 and 4 of each record.
        registros.append(list(best_program))
  finally:
    # Always give stdout back, even if a fit raises; otherwise every later
    # cell in this kernel would print into the closed devnull handle.
    sys.stdout = original_stdout



[ Info: Started!
0.0%┣                                              ┫ 0/600 [00:00<00:-5, -0s/it]Expressions evaluated per second: [.....]. Head worker occupation: 0.0%         Press 'q' and then <enter> to stop execution early.                             Hall of Fame:                                                                   ---------------------------------------------------------------------------------------------------                                                             Complexity  Loss       Score     Equation                                       1           4.979e+11  1.594e+01  y = x₀                                        2           3.254e+11  4.253e-01  y = cube(x₀)                                  4           4.759e+10  9.612e-01  y = (square(x₁) * x₀)                         6           4.687e+10  7.557e-03  y = (square(x₁) * (x₀ + 0.2801))              7           3.495e+10  2.935e-01  y = ((square(x₁) * x₀) + cube(x₀))            8           3.356e+10  

In [15]:
# Work on per-record copies: the scoring step appends to each record, and a
# plain alias would make those appends land in `registros` itself.
registros_ = [list(registro) for registro in registros]

In [16]:
# Append a weight to every record: (loss * score) / complexity, with NaN
# replaced by 0 so that the record can still take part in the comparison.
for registro in registros_:
  complexity = registro[0]
  loss = registro[1]
  score = registro[2]
  w = (loss * score)/complexity
  registro.append(0 if math.isnan(w) else w)

# Pick the record with the largest weight. The weight is the element that was
# just appended, so it is read from the end of the record rather than from a
# fixed position that only holds for records of exactly six fields.
lista_melhor_valor = max(registros_, key=lambda x: x[-1])

## Save result

In [17]:
def salvar_dados(dados, key, arquivo):
  """Append `dados` to the list stored under `key` in a pickle file.

  The file holds a single dict. If `arquivo` does not exist, an empty dict
  is used as the starting point. If `key` is missing, or its current value
  is not a list, the value becomes the one-element list [dados]; otherwise
  `dados` is appended to the existing list. The dict is then written back.

  dados   -- the entry to store
  key     -- dict key the entry is filed under
  arquivo -- path of the pickle file to read and rewrite
  """
  if os.path.exists(arquivo):
    # NOTE: unpickling can execute arbitrary code; only point this at files
    # produced by this notebook.
    with open(arquivo, 'rb') as f:
      dados_exist = pickle.load(f)
  else:
    dados_exist = {}

  valor_original = dados_exist.get(key)
  # isinstance covers both "key missing" (None) and "stored value is not a
  # list" without an == comparison, which misbehaves for values whose
  # equality is element-wise (e.g. numpy arrays).
  if isinstance(valor_original, list):
    valor_original.append(dados)
  else:
    dados_exist[key] = [dados]

  with open(arquivo, 'wb') as f:
    pickle.dump(dados_exist, f)

caminho_arquivo = 'dados.pickle'

# Keep positions 0-2 (complexity, loss, score) and position 4 of the best
# record; the remaining positions are not stored.
novos_dados = [*lista_melhor_valor[0:3], lista_melhor_valor[4]]

salvar_dados(dados=novos_dados, key='boyer_moore_M_plus_N', arquivo=caminho_arquivo)

In [18]:
def carregar_dados(arquivo):
    """Open `arquivo` in binary mode and return the object unpickled from it."""
    # Load the data from the pickle file.
    with open(arquivo, 'rb') as fh:
        return pickle.load(fh)

caminho_arquivo = 'dados.pickle'
dados_carregados = carregar_dados(caminho_arquivo)

# Print every key followed by its entries as a small tree; the last entry of
# a key gets the closing branch glyph.
print("Conteúdo do arquivo pickle:")
for k, v in dados_carregados.items():
  print('\u25CF', k)
  for index, item in enumerate(v):
    print('└─' if index == len(v)-1 else '├─', item)
  print('==========================')

Conteúdo do arquivo pickle:
● binary_search_logx
├─ [6, 1.2914093, 1.37589542429269, log(x0**6) + 5.574131]
└─ [5, 1.5291592, 2.19479656667366, log(x0)**2 + 13.966276]
● boyer_moore_M_plus_N
├─ [9, 1729031400.0, 0.1972007458935316, 752018330.701125*(0.00603869219560025*x0 + 0.00603869219560025*x1 - 1)**4 + 601415.75]
└─ [5, 15754614000.0, 0.3055240605113849, 3621132.2 - 28185.828*x1]
