##### Copyright 2023 Google LLC

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Setup

### Install & import

In [1]:
!pip install -U -q google-generativeai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.2/164.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m718.3/718.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install -U -q PyPDF2

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/232.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Install the client library and import necessary modules.
import google.generativeai as genai

import os
from typing import Dict
from google.colab import drive
from io import BytesIO
import PyPDF2

import base64
import io
import json
import mimetypes
import pathlib
import pprint
import requests
import pandas as pd
import numpy as np

import PIL.Image
import IPython.display
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import Markdown

In [4]:
from google.colab import auth
auth.authenticate_user()

In [5]:
from googleapiclient.discovery import build

drive_service = build('drive', 'v3')
files = drive_service.files().list().execute()

In [6]:
from google.colab import userdata

API_KEY=userdata.get('key090524')

In [7]:
# Configure the client library by providing your API key.
genai.configure(api_key=API_KEY)

In [8]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive




---
⏰# **LONG PREP STEP STARTS HERE**: ⏰


The next cells will perform a code that takes some time. It will read each pdf, capture the info in the text and split the text into smaller chunks increasing the length of the dataframe. After this there will be also a step to embbed each of the phares, this is so far the longest step in the process. If you have already used the code you will have the whole dataframe with embeddings saved as csv (or any other format of your preference) so it is better to just load it from there.

In [None]:
def pdf_to_text_dict(folder_path: str) -> Dict[str, str]:
  """
  Parses all PDF files within a Google Drive folder and returns a dictionary.

  Args:
      folder_path: Path to the folder containing the PDF files within Google Drive.

  Returns:
      A dictionary where the key is the PDF file name and the value is the extracted text.
  """

  pdf_text_dict = {}

  for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):
      filepath = os.path.join(folder_path, filename)

      with open(filepath, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        num_pages = len(pdf_reader.pages)
        file_content = ""

        for page_num in range(num_pages):
          page = pdf_reader.pages[page_num]
          file_content += page.extract_text()

      pdf_text_dict[filename] = file_content

  return pdf_text_dict

# Replace with the actual path to your Google Drive folder
folder_path = '/gdrive/MyDrive/Colab Notebooks/bulas'

pdf_dict = pdf_to_text_dict(folder_path)

In [None]:
old_keys = list(pdf_dict.keys())
new_keys = ['paracetamol','acido acetilsalicilico','metamizol','diclofenaco','amoxicilina']
for k in range(len(old_keys)):
  pdf_dict[new_keys[k]] = pdf_dict[old_keys[k]]
  del pdf_dict[old_keys[k]]

In [None]:
def clean_text(text):
  """Removes extra lines, line breaks, and spaces from text."""
  text = text.replace('\n', ' ')  # Replace line breaks with spaces
  text = ' '.join(text.split())   # Remove extra spaces
  return text

pdf_dict_cleaned = {filename: clean_text(text) for filename, text in pdf_dict.items()}

In [None]:
df_bulas = pd.DataFrame.from_dict(pdf_dict_cleaned, orient = 'index', columns = ['texto'])
df_bulas = df_bulas.reset_index()
df_bulas = df_bulas.rename(columns = {'index':'medicamento'})
df_bulas.head()

Unnamed: 0,medicamento,texto
0,paracetamol,"Indicações Em adultos, para a redução da febre..."
1,acido acetilsalicilico,anti-inflamatórios/anti-reumáticos e a outros ...
2,metamizol,dipirona monoidratada Brainfarma Indústria Quí...
3,diclofenaco,Blau Farmacêutica S/A. DICLOFENACO SÓDICO Blau...
4,amoxicilina,amoxicilina tri-hidratada Bula para profission...


In [None]:
import re

# Function to split text into phrases
def split_into_phrases(text):
    return re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)

# Function to generate a key name from a phrase
def generate_key_name(phrase, max_words=5):
    words = phrase.split()
    return ' '.join(words[:max_words])

# List to store new rows
new_rows = []

# Process each row in the DataFrame
for idx, row in df_bulas.iterrows():
    phrases = split_into_phrases(row['texto'])
    for phrase in phrases:
        if phrase:  # Ensure phrase is not empty
            key_name = generate_key_name(phrase)
            new_rows.append({'medicamento': row['medicamento'] + '_' + key_name, 'texto': phrase})

# Create a new DataFrame with the new rows
df_bulas_split = pd.DataFrame(new_rows)

In [None]:
df_bulas_split.head(20)

Unnamed: 0,medicamento,texto
0,"paracetamol_Indicações Em adultos, para a","Indicações Em adultos, para a redução da febre..."
1,paracetamol_Contra-indicações O paracetamol nã...,Contra-indicações O paracetamol não deve ser a...
2,paracetamol_Advertências Não use outro medicam...,Advertências Não use outro medicamento que con...
3,paracetamol_Precauções Embora o paracetamol possa,Precauções Embora o paracetamol possa ser util...
4,paracetamol_A administração deve ser feita,A administração deve ser feita por períodos cu...
5,paracetamol_O paracetamol não deve ser,O paracetamol não deve ser administrado por ma...
6,paracetamol_Usuários crônicos de bebidas alcoó...,Usuários crônicos de bebidas alcoólicas podem ...
7,paracetamol_O paracetamol pode causar dano,O paracetamol pode causar dano hepático.
8,paracetamol_Reações adversas Podem ocorrer alg...,Reações adversas Podem ocorrer algumas reações...
9,paracetamol_Caso ocorra uma rara reação,"Caso ocorra uma rara reação de sensibilidade, ..."


In [None]:
model = 'models/embedding-001'

In [None]:
def embedding_create(title, text):
  return genai.embed_content(model = model,content = text, title = title, task_type ='RETRIEVAL_DOCUMENT')['embedding']

In [None]:
from google.api_core.exceptions import BadRequest
import google.generativeai as palm

def embed_text_with_title(text, title, chunk_size=500):
  """Embeds text in chunks, incorporating the title."""
  text = f"{title}. {text}" # Prepend title to the text
  words = text.split()
  embeddings = []
  for i in range(0, len(words), chunk_size):
    chunk = ' '.join(words[i:i + chunk_size])
    try:
      response = genai.embed_content(model="models/embedding-001", content=chunk, task_type ='RETRIEVAL_DOCUMENT')
      embeddings.append(response['embedding'])
    except BadRequest as e:
      print(f"Error embedding chunk: {e}")
      return None

  embeddings = [emb for emb in embeddings]
  #embeddings = embeddings.flatten()
  # Calculate max_length based on the length of the embeddings list
  max_length = max([len(emb) for emb in embeddings])
  # Pad the embeddings with zeros to the same length
  padded_embeddings = [np.pad(emb, (0, max_length - len(emb)), 'constant') for emb in embeddings]

  #new_embeddings = []
  #for emb in padded_embeddings:
  #  for i in emb:
  #    new_embeddings.append(i)


  return padded_embeddings


# **After running the bellow step for the first time the next code box will save it as CSV. Skip the bellow cell if you have already ran it and have the CSV. The bellow cell takes me up to 20 min to conclude running.**

In [None]:
df_bulas_split['embeddings'] = df_bulas_split.apply(lambda row: embed_text_with_title(row['texto'], row['medicamento']), axis=1)
df_bulas_split.head()

NameError: name 'df_bulas_split' is not defined

In [None]:
df_bulas_split.to_csv('/gdrive/MyDrive/Colab Notebooks/bulas/df_bulas_split.csv', index=False)

**If you already ran the code above you can continue from here and read the csv file directly to save time**



---
✅ # **LONG PREP STEP ENDS HERE** ✅



In [9]:
#/gdrive/MyDrive/Colab Notebooks/PJT chatbot alura.ipynb
df_bulas_split = pd.read_csv('/gdrive/MyDrive/Colab Notebooks/bulas/df_bulas_split.csv')

Bellow is a test to find a medicine with the normal string finding from pandas

In [10]:
df_bulas_split[df_bulas_split['medicamento'].str.contains('acetil',case=False, na=False)]

Unnamed: 0,medicamento,texto,embeddings
28,paracetamol_A acetilcisteína a 20 %,A acetilcisteína a 20 % deve ser administrada ...,"[array([ 2.87145230e-02, -3.59287750e-02, -6.8..."
29,"paracetamol_A acetilcisteína a 20%, deve","A acetilcisteína a 20%, deve ser dada após dil...","[array([ 1.98771400e-02, -3.66042330e-02, -6.5..."
30,paracetamol_Além da administração da acetilcis...,"Além da administração da acetilcisteína a 20%,...","[array([ 4.22642500e-02, -2.51915730e-02, -6.6..."
252,acido acetilsalicilico_anti-inflamatórios/anti...,anti-inflamatórios/anti-reumáticos e a outros ...,"[array([ 2.54086200e-03, -4.10816800e-02, -5.5..."
253,acido acetilsalicilico_derivados cumarínicos o...,derivados cumarínicos ou heparina – exceto ter...,"[array([ 0.02998589, -0.04108284, -0.06465814,..."
...,...,...,...
663,acido acetilsalicilico_Precauções e advertênci...,Precauções e advertências: Somente após rigoro...,"[array([ 0.00122238, -0.06297091, -0.05041644,..."
664,acido acetilsalicilico_Embalagem _____________...,Embalagem ___________________________ _____/__...,"[array([ 0.01366628, -0.02074629, -0.05128136,..."
665,acido acetilsalicilico_Marketing _____________...,Marketing ______________________________ _____...,"[array([ 1.25781390e-02, -3.53270060e-02, -4.4..."
666,acido acetilsalicilico_Galênico_______________...,Galênico_____________________________ _____/__...,"[array([ 1.98976730e-02, -5.08812550e-02, -7.2..."


In [11]:
import ast
#As from CSV everything was converted to string some changes need to be done in the dataframe
df_bulas_split['embeddings'] = df_bulas_split['embeddings'].str.replace('\n', '')
df_bulas_split['embeddings'] = df_bulas_split['embeddings'].str.replace('[array([', '')
df_bulas_split['embeddings'] = df_bulas_split['embeddings'].str.replace('])', '')
df_bulas_split['embeddings'] = df_bulas_split['embeddings'].str.replace('[', '')

In [12]:
df_bulas_split['embeddings'][1]

' 2.47939000e-02, -3.63911800e-02, -7.94218100e-02,  9.13854000e-03,        1.01060930e-01,  5.40322530e-03, -1.73252770e-02, -1.41867640e-02,       -1.91202900e-02,  6.16487000e-02, -2.17315870e-02,  1.28175090e-02,       -8.60075200e-04,  6.81002700e-04, -2.56608840e-02, -5.45868280e-02,       -1.12370600e-03,  1.66438180e-02, -4.76897370e-03, -2.66084100e-02,        8.93574800e-03,  2.19835020e-02, -1.69324630e-02,  1.73874940e-02,        1.74775700e-03, -2.21421510e-02, -2.92698420e-05, -6.56498900e-02,       -2.20014550e-02, -3.24690090e-03, -2.55385940e-02,  7.09511700e-02,       -4.30718440e-02,  6.62153170e-03, -2.19775850e-02, -3.39481050e-02,       -1.59400100e-02,  1.12673080e-02, -4.25842100e-02,  2.21631970e-02,        2.88319480e-02, -5.80330000e-03, -2.76516080e-02,  3.58850960e-02,       -2.51906100e-02,  1.21689040e-02, -5.58341780e-02,  4.31890650e-03,       -3.67837330e-03, -6.95855700e-02,  8.65179800e-03,  2.13448820e-02,        8.03641160e-02, -4.38472440e-02,  8.

In [19]:
copy = df_bulas_split['embeddings'].str.split(',',expand = True)
copy.columns

RangeIndex(start=0, stop=768, step=1)

In [20]:
for c in range(0,768):
  copy[c] = copy[c].str.replace(']','')
  copy[c] = copy[c].astype(float)

In [21]:
copy.values.to_list()

From csv it is converting to text when reading so this needs to be fixed further the line so the code works. Should be list format or something like that

In [None]:
from google.api_core.exceptions import BadRequest
import google.generativeai as palm
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import ast

def embed_text_with_title(text, title, chunk_size=500):
  """Embeds text in chunks, incorporating the title."""
  text = f"{title}. {text}" # Prepend title to the text
  words = text.split()
  embeddings = []
  for i in range(0, len(words), chunk_size):
    chunk = ' '.join(words[i:i + chunk_size])
    try:
      response = palm.embed_content(model="models/embedding-001", content=chunk, task_type ='RETRIEVAL_DOCUMENT')
      embeddings.append(response['embedding'])
    except BadRequest as e:
      print(f"Error embedding chunk: {e}")
      return None

  # Flatten the list of embeddings into a single 1D array
  embeddings = np.concatenate(embeddings)
  return embeddings

def get_relevant_phrases(prompt, df_bulas):
  """
  Retrieves the most relevant phrases from the "texto" column based on a prompt.

  Args:
      prompt: A string containing the prompt.
      df_bulas: A pandas DataFrame containing the "medicamento" and "texto" columns.

  Returns:
      A list of the most relevant phrases.
  """

  # Extract the medicamento name from the prompt
  medicamento = prompt.split()[0]

  # Get the embeddings for the prompt and the medicamento text
  prompt_embeddings = embed_text_with_title(prompt, medicamento).reshape(1, -1) # Reshape to 2D
  #print(prompt_embeddings)
  # Extract embeddings and handle potential list of lists
  medicamento_embeddings = df_bulas[df_bulas['medicamento'].str.contains(medicamento, case = False, na = False)]['embeddings'].values
  #medicamento_embeddings = np.array([ast.literal_eval(emb) for emb in medicamento_embeddings])

  if isinstance(medicamento_embeddings[0], list):
    medicamento_embeddings = np.array([np.concatenate(emb) for emb in medicamento_embeddings])
  else:
    medicamento_embeddings = np.stack(medicamento_embeddings, axis=0) # Stack embeddings into 2D array

  # Calculate the cosine similarity between the prompt and each phrase in the medicamento text

  similarities = cosine_similarity(prompt_embeddings, medicamento_embeddings)

  # Sort the phrases by their similarity scores and return the top 5
  sorted_indices = np.argsort(similarities.flatten())[::-1][:5]

  relevant_phrases = []
  for index in sorted_indices:
    relevant_phrases.append(df_bulas.iloc[index]['texto'])

  return relevant_phrases

In [None]:
# Example usage
prompt = "paracetamol adverso"
relevant_phrases = get_relevant_phrases(prompt, df_bulas_split)

# Print the most relevant phrases
for phrase in relevant_phrases:
  print(phrase)

ValueError: could not convert string to float: '[array([ 9.45394800e-03, -4.64319800e-02, -7.72205800e-02,  1.57492440e-02,        9.76961100e-02,  2.03925240e-02, -2.54433420e-02, -1.03905440e-02,       -2.00317870e-02,  9.35516300e-02, -1.25398360e-02,  2.14342160e-02,       -2.06985230e-02,  1.67553310e-02, -4.84288450e-02, -5.15675300e-02,       -1.21420820e-02,  7.48475600e-04, -2.12934670e-02, -1.69793130e-02,       -2.51965530e-03,  6.76740300e-03, -2.25207950e-02,  4.35570100e-02,        2.09687100e-04, -2.29196720e-02,  7.18679000e-03, -6.64589700e-02,       -2.66536370e-04,  1.23224620e-02, -2.73203960e-02,  5.86651820e-02,       -2.35326220e-02, -1.44412610e-02,  1.09296470e-03, -3.15908400e-02,       -1.97403880e-02, -5.55518830e-03, -2.99533480e-02,  3.19716400e-02,        1.00455220e-02, -1.82258260e-02, -1.56531500e-02,  6.28074260e-02,       -1.74021100e-02,  7.80431500e-03, -5.09146040e-02,  2.10228730e-02,        2.14409920e-03, -6.34366100e-02,  1.37802550e-02,  1.39095880e-02,        1.03022665e-01, -2.91840490e-02, -3.86537840e-03, -7.67853600e-02,        5.49683200e-02, -5.74589630e-02, -7.52493370e-03,  2.06006700e-02,       -6.01097000e-02,  2.34704040e-02,  2.48975290e-02,  4.80185670e-02,       -1.93026500e-02, -5.28922830e-02, -1.61866270e-02, -2.58060580e-02,        4.19996300e-02,  2.09060670e-02,  2.82573120e-02, -2.24245230e-02,        3.72991820e-02, -9.71531400e-03, -1.23408950e-02, -9.41657400e-02,       -2.24753640e-02,  4.79500800e-02,  1.51923420e-02,  1.32367500e-02,       -2.36839650e-02, -6.15813600e-02, -4.30495900e-02, -2.79040150e-02,       -2.78833790e-02,  3.21075200e-02, -4.92324600e-02, -9.23258200e-03,        1.86289270e-02,  3.55442300e-03, -1.39727820e-02, -3.57812300e-02,        3.26475540e-02, -5.21086200e-02,  6.20749800e-04,  9.32378600e-02,       -5.10877400e-02, -8.03275900e-03,  5.79551530e-02, -2.38281770e-03,        2.33849860e-02, -5.93551600e-03, -4.72573000e-02, -7.95002500e-04,       -9.45523800e-03,  4.41380800e-02, -5.09975540e-04,  5.19229300e-02,       -1.18709770e-03,  4.97158170e-02, -6.89275860e-02,  1.79752710e-03,       -4.37014550e-03,  9.01713100e-03,  6.09515530e-02, -1.47079120e-02,        1.46672850e-03,  4.35750700e-02,  3.43015200e-02,  4.77056240e-02,        2.61084680e-02,  1.17059830e-02,  7.76867300e-02, -4.18079400e-02,       -2.33785040e-02, -4.29227250e-03,  8.09270900e-03,  1.35463100e-02,        1.35566965e-02,  1.54405450e-02, -1.33795240e-02, -8.64781260e-04,       -2.31328350e-02, -1.32757280e-02,  7.05248300e-02,  1.12164390e-01,        4.32852660e-02,  2.02036180e-02,  1.69637280e-02,  6.83995400e-02,        3.81462020e-03,  5.12446130e-02,  3.52464540e-02, -9.13513600e-03,        2.48159400e-02,  7.57233100e-02, -4.55204550e-02, -1.21577470e-02,        7.99000200e-02, -5.20423700e-03,  2.12658010e-02,  2.81273130e-03,       -4.25282050e-02, -2.36900260e-02,  8.78719900e-02,  2.21457240e-02,       -6.84675430e-03,  3.40808150e-02,  1.79850470e-03,  3.22034000e-02,        4.86270400e-02,  1.00852830e-02,  1.09125840e-03, -6.29392640e-03,        1.80074070e-02, -6.59352770e-03,  2.13981460e-02, -8.15924700e-03,        2.37716290e-02,  1.47929890e-02, -4.16517900e-03,  2.26778910e-02,       -4.51647940e-02, -2.24374960e-02,  2.04104870e-02, -4.71116680e-02,        6.23845230e-02,  3.59922460e-02, -9.56904900e-02, -1.99370400e-02,       -4.88847770e-03, -1.09463430e-02,  2.08849400e-02,  3.83975250e-02,       -1.11320770e-02, -8.37238800e-02,  6.10615050e-02, -3.11677480e-03,       -6.22490980e-02, -8.16737000e-03,  1.00790410e-02, -2.44168020e-02,       -2.39645190e-02, -2.99299800e-02, -7.65830300e-02,  2.66812690e-02,        1.20218340e-02,  3.18566450e-02,  2.05217800e-02, -6.37073300e-02,       -1.94218360e-03,  4.30895130e-02, -9.00820400e-03, -2.40777920e-02,        7.76727360e-03,  1.12968520e-03,  6.16633260e-02, -5.11166040e-02,       -5.22063260e-02,  5.48645300e-02, -5.73299680e-02,  6.26827800e-02,        1.48536110e-02, -6.72508100e-03,  1.86686310e-02,  5.43314270e-03,       -1.02159730e-02, -5.54607340e-03,  1.94334430e-02, -4.19669260e-02,       -7.22638200e-02,  1.21803260e-02, -4.64608000e-02, -8.87007100e-03,       -5.27269240e-02,  3.95373200e-02, -1.38476760e-02, -1.84463560e-02,        2.34143570e-02, -2.79934780e-02,  8.67353100e-03,  8.42579300e-02,        4.81273460e-02, -2.99717670e-02,  5.23328040e-02, -1.33361730e-02,       -5.13559100e-03,  1.78169480e-02,  7.15168640e-02,  6.40352400e-02,       -3.37173500e-02,  4.07963840e-02,  1.05673750e-02,  1.88370320e-03,       -2.74505080e-02, -6.12267700e-02, -3.26548300e-02,  8.17061200e-02,        2.30757600e-02,  3.93417400e-02,  9.22622300e-03,  1.94543860e-03,       -1.83227700e-02, -3.51873380e-02, -6.30832000e-02,  5.39375900e-02,       -3.31421300e-02,  1.71395060e-02, -1.35857790e-02,  2.14036480e-03,        4.86388240e-02, -1.27671160e-02,  1.45741250e-02, -1.36193600e-02,       -1.82739050e-02, -1.32052440e-02,  2.54672240e-02, -7.45373140e-02,        2.92791970e-02,  1.37775390e-02, -5.90558440e-03, -3.05468750e-03,        5.28384150e-02,  9.07266200e-03,  2.92992240e-03, -1.80998560e-02,       -1.91560570e-02,  2.94793380e-02,  4.56235630e-02, -5.11092660e-02,       -5.09571700e-03,  1.83975580e-02,  2.08997260e-02, -1.94787630e-02,        7.23847230e-03, -3.71761000e-02, -5.61541430e-02, -5.18127940e-02,        2.40335280e-02, -2.43918750e-02,  5.54875500e-03, -4.69954650e-02,        6.07364170e-04, -2.36240550e-02, -9.74644500e-03, -1.51802760e-03,        8.40121000e-03,  6.15929000e-02,  3.57837300e-02, -3.37395560e-02,        3.47197350e-02, -2.45941910e-02,  2.78237740e-02, -6.45907800e-02,        7.72703760e-03,  3.54603100e-02,  6.54926100e-03, -1.62314120e-02,        5.82544830e-03,  3.19411070e-03,  8.75731500e-03,  9.49617850e-03,       -5.05910140e-02, -1.51596820e-03,  4.26025870e-02,  4.03367300e-02,       -4.63846000e-02,  4.04627700e-02, -1.81078840e-02,  3.65690660e-02,        2.50273800e-02,  9.42358600e-02, -4.21402800e-03,  2.24808180e-02,        3.66033030e-03,  4.76080700e-02, -7.38538100e-03,  4.90493850e-02,       -3.25147030e-02,  2.09870330e-03, -1.79197770e-02,  4.56210970e-03,       -4.68586980e-02,  1.74211820e-03, -6.66620070e-03,  1.36084870e-03,       -8.96809250e-02,  6.48258070e-03, -2.30614990e-02, -8.74906300e-03,        1.47561050e-02,  2.93234030e-02, -3.78705040e-02, -3.11906020e-02,        2.46772710e-02,  5.00753670e-02, -1.56084250e-02,  1.96364600e-02,        6.71651440e-02,  4.60581000e-02,  4.38353700e-04,  3.45285460e-02,        1.35642410e-02, -3.36330870e-03, -9.66471700e-04, -2.49258470e-02,        4.12232700e-02, -7.90196300e-04,  5.20066400e-02, -4.58879140e-02,       -8.34665600e-04,  4.22701000e-03,  7.92121460e-04,  9.78368700e-03,        3.43210060e-02,  4.23992500e-03, -2.36135160e-03,  2.48323570e-04,       -2.07475500e-02,  6.81269300e-02,  2.33778670e-02, -7.30155700e-03,        3.74792300e-02, -1.70292690e-02,  2.36205400e-02, -1.20402760e-02,        1.41879550e-02, -3.20741830e-02, -8.26827100e-03,  1.08924220e-02,       -2.23310460e-02, -3.76668900e-02,  5.26196920e-02,  2.79532690e-02,       -3.13159260e-02, -2.75369510e-02,  3.09584160e-02,  2.26594770e-02,        2.06959840e-04,  2.80829090e-02, -5.81434150e-02, -4.02387840e-03,        7.37915300e-02, -6.26753640e-03, -2.96419560e-02, -3.08647290e-02,       -3.59763350e-02, -2.61878560e-02,  6.27411250e-03,  3.16202450e-02,       -3.99614450e-03, -3.71754580e-02, -3.31213770e-02, -1.02348480e-02,       -6.81014060e-02,  5.74489100e-03, -1.87651980e-02, -9.00740300e-03,       -5.67305130e-02, -9.02188800e-03, -4.14697500e-02, -1.14859620e-02,        2.49601210e-02, -6.74066500e-02, -3.22533660e-02, -9.52265100e-03,        1.02759550e-03, -1.36129870e-02, -2.47228540e-02,  2.89544760e-02,       -3.95407680e-02, -1.23490810e-02,  3.51787430e-02,  9.99065300e-03,       -4.44723220e-02, -7.62418600e-02,  1.71761870e-02, -6.89873200e-03,        1.93266890e-02,  4.54876700e-02,  3.64166350e-02,  2.68053300e-03,        5.25515860e-02, -1.98032820e-02, -1.03823310e-02, -5.63642530e-02,       -2.77505980e-03,  9.02965200e-04,  1.43792280e-02, -1.73353070e-02,        4.36907000e-02,  1.88926460e-02,  3.43338030e-02,  3.91301330e-02,       -3.27673140e-02, -7.91131100e-02, -1.68070200e-02, -1.12362060e-02,        9.64044300e-03, -6.45694600e-02,  5.01357880e-02, -5.12874760e-02,       -4.26317860e-02, -2.83593010e-02,  1.46465780e-02, -2.11730730e-02,       -2.45967820e-02,  5.92059340e-02, -1.33468390e-02, -3.14495270e-03,        1.40579900e-02, -4.88225560e-02,  2.02358590e-02, -6.98802600e-02,        5.52059450e-02,  1.26813330e-02,  1.05999710e-02,  1.96122470e-02,       -4.11130530e-02,  4.53070330e-02,  5.18875500e-02, -4.73033700e-02,        3.26319400e-02,  1.49477930e-03, -8.69523100e-03, -6.07958670e-03,       -6.41794200e-02,  3.93458900e-02, -4.41661850e-02, -4.59763440e-02,       -4.12452640e-03, -5.99024700e-03,  4.41849340e-02,  1.07778040e-02,       -9.51913700e-03,  3.03108560e-02, -2.68203220e-02,  7.50247340e-03,        7.75934570e-03,  9.80238200e-03, -2.03775190e-02, -3.90123800e-02,       -3.55051460e-02, -5.74608100e-02, -3.10909790e-02,  1.68003640e-02,       -3.05744970e-02,  2.05705100e-03,  2.48398040e-02,  9.23045600e-03,       -5.28432430e-03,  2.57933600e-02,  9.83326000e-03, -7.93087300e-03,        4.36003660e-02, -4.01888300e-02,  2.66718830e-02,  2.77064070e-02,       -2.61271470e-02,  3.00172130e-03,  1.59390270e-02, -9.41844800e-03,       -1.29220930e-02,  4.04926660e-04,  2.50956570e-02, -1.63106500e-02,        5.59074400e-02,  3.62189840e-03, -2.15798360e-02,  9.54506100e-03,        2.33739740e-02, -2.22920890e-02, -1.21971935e-01,  1.73168700e-02,        7.73619500e-02, -5.59244680e-02,  1.94329730e-02,  3.72086430e-02,       -6.80082440e-02,  4.31584860e-02, -1.57800470e-03,  4.36750720e-02,       -8.63733800e-02, -1.60202740e-02,  1.88255140e-02,  2.35377270e-02,        1.10626170e-02,  5.88682060e-03,  4.45590240e-02, -1.95874050e-02,        2.42880730e-02,  2.54720730e-03,  6.11966300e-02,  6.26474800e-02,       -4.05177100e-02,  3.42573230e-02,  1.95517430e-02, -7.32775500e-02,       -2.49297240e-03, -1.36881390e-02,  2.74295770e-02, -1.03324670e-02,        2.65757370e-02, -2.60337390e-02,  4.30887450e-02, -7.48481830e-03,        2.43870900e-02,  1.71940030e-02,  2.74906720e-02, -4.37474000e-02,       -3.91320850e-02,  1.42134540e-02, -2.63546690e-02,  2.69544680e-02,        5.61067800e-02,  4.81121500e-02, -2.96483860e-02, -2.58814520e-02,        5.79178700e-02, -9.89420800e-03,  4.56649550e-02, -3.28433900e-03,        1.67039240e-02,  3.95462100e-02,  2.14719740e-02, -6.37003400e-02,       -2.66238000e-02,  1.02032630e-02,  2.12811000e-03, -5.71047850e-02,        2.05585570e-02, -6.33585100e-03, -1.86317810e-03,  7.81106000e-03,       -3.87276630e-02,  8.79107200e-03,  6.24058430e-02,  6.14571600e-02,        5.71413900e-03,  7.60847400e-03, -1.84361500e-02,  5.53739070e-02,       -3.62302960e-02,  1.05834540e-02, -5.78341070e-03,  1.20459970e-02,        9.50309000e-03, -1.42104510e-02, -2.99810470e-02, -1.58800280e-02,        2.07338460e-02,  9.40005950e-03,  2.52988130e-02,  2.02691980e-02,        6.60261800e-02,  5.35472300e-02,  6.25767930e-03, -3.06735370e-02,       -4.93501400e-02, -2.23620490e-04, -2.32450630e-02, -2.07010250e-02,       -2.41773070e-02, -1.17010560e-02, -7.21746500e-02, -3.31285400e-02,        5.76883100e-02, -2.38328330e-03, -3.45729480e-02, -2.40453430e-02,       -3.05334690e-03,  3.93328700e-02,  2.16782600e-02, -1.74124100e-02,        3.14338300e-02,  1.93767880e-02, -1.06861810e-02, -5.22887470e-03,        7.96935000e-02,  2.08175280e-02,  2.05826480e-02,  5.56135250e-02,        1.49814880e-02,  7.44883640e-02, -2.88914350e-02, -2.20702760e-02,       -6.03627380e-02,  1.39886770e-02,  2.27518550e-02, -4.73365600e-02,       -6.20422000e-02,  2.78252500e-02,  2.91811250e-02, -1.06260440e-02,       -3.88939050e-02,  7.23233100e-02, -9.70572200e-03, -9.37499250e-02,       -5.35324200e-02, -2.28058210e-02,  5.53222600e-03, -2.94612660e-02,        2.32358850e-02,  2.55540820e-05,  1.67129600e-02, -2.04319080e-02,       -9.03540200e-03, -4.37062000e-02, -1.96239930e-03, -7.74035370e-03,       -5.46701070e-02, -2.48881100e-02,  1.05187750e-02,  8.82879600e-03,       -5.68876830e-02,  4.79670880e-04, -3.44654600e-02, -4.20375880e-02,       -4.61200500e-02,  4.82559280e-02, -4.61239780e-02, -1.19639740e-02,        5.91715870e-02, -7.33308300e-03,  4.66606000e-02, -7.94580400e-03,       -3.27115730e-02,  2.04209930e-02,  5.27811240e-03,  2.01847570e-02,       -1.21435330e-02, -4.45008800e-03, -2.73583420e-02, -2.59911940e-03,       -4.54128800e-02,  2.97254750e-02,  9.38501000e-03,  5.59614900e-03,       -2.91149410e-02, -6.15044300e-02, -5.49988170e-03,  3.32888470e-03,       -1.53430100e-02,  4.07973570e-02,  3.15042400e-02,  2.40394030e-02,       -3.57675200e-02, -1.07625990e-03, -9.29615700e-03,  8.98888100e-03,       -2.74379980e-02, -3.87686300e-02, -3.20417320e-03,  1.20103690e-02,        2.43684170e-02, -1.95057380e-03,  2.64644300e-02, -1.01975270e-02,        6.72860740e-02,  4.81403320e-02,  3.81600800e-02, -2.45346370e-02,       -6.25688900e-02,  9.60534700e-02,  3.71663640e-02, -6.42557140e-02,       -3.14631700e-03,  5.84471340e-02, -1.34705970e-02,  4.70788370e-02,        5.02894740e-02,  5.39153740e-02,  2.26332660e-02, -3.70753440e-02,       -4.11147770e-02,  1.51048070e-02, -7.13385500e-03, -3.19205930e-02,       -4.06424000e-02,  5.55597700e-03,  5.21813330e-02, -8.48465500e-03,       -2.25903540e-02,  3.82092060e-03, -3.58822640e-02,  1.00056574e-01,        3.28542850e-02,  4.50118220e-02,  4.74967960e-04, -7.62226500e-02,       -3.87060050e-02, -2.98240150e-02,  1.74230520e-02,  4.58784480e-02,        1.34706170e-02,  1.74041140e-03, -4.08027600e-02, -2.66632380e-02,        2.85868120e-02, -5.80240000e-02, -2.42392720e-02, -1.88374690e-02,        3.54447920e-02,  8.03050600e-02,  8.67325500e-02, -2.25485800e-02,       -2.01712590e-02, -2.41170080e-02,  3.87762860e-02, -1.23361135e-02,       -2.48058270e-02,  1.67453400e-02,  1.97002600e-03,  3.26625070e-03,        2.85321770e-02,  1.10476315e-02, -5.67726500e-02,  4.24600200e-02])]'

In [None]:
#max_length = max([len(emb) for emb in df_bulas['embeddings']])
#df_bulas['embeddings'] = [np.pad(emb, (0, max_length - len(emb)), 'constant') for emb in df_bulas['embeddings']]
from keras.preprocessing.sequence import pad_sequences

In [None]:
def generate_query(query, base, model):
  embedding_query = genai.embed_content(model = model,content = query, task_type ='RETRIEVAL_QUERY')
  #print(embedding_query)
  #print(f"Shape of df_bulas['embeddings'][0]: {df_bulas['embeddings'].shape}")
  #print(f"Shape of embedding_query['embeddings']: {len(embedding_query['embedding'])}")
  max_length = max([len(emb) for emb in df_bulas_split['embeddings']])
  df_embeddings = pad_sequences(df_bulas_split['embeddings'], maxlen=max_length, padding='post')
  scalar_product = np.dot(np.stack(df_embeddings), embedding_query["embedding"])
  index = np.argmax(scalar_product)
  return df_bulas_split.iloc[index]['texto']

In [None]:
#query = 'how to change gears in the car'
#generate_query(query,df,model)
prompt = input("Prompt: ")
while prompt.upper().strip() != "PARAR":
  response = generate_query(prompt,df_bulas,model)
  print("Reposta: ","\n",response,"\n")
  prompt = input("Prompt: ")

Prompt: diclofenaco dose


ValueError: invalid literal for int() with base 10: '[array([ 9.45394800e-03, -4.64319800e-02, -7.72205800e-02,  1.57492440e-02,\n        9.76961100e-02,  2.03925240e-02, -2.54433420e-02, -1.03905440e-02,\n       -2.00317870e-02,  9.35516300e-02, -1.25