In [4]:
!pip -qq install -U langchain-ollama

# Check Google Colab

In [1]:
# Detect Google Colab by attempting the Colab-only import. Only ImportError is
# treated as "not running in Colab"; a bare `except:` would also swallow
# KeyboardInterrupt and Drive-mount failures and silently select local paths.
try:
  import google.colab
  from google.colab import drive
except ImportError:
  IN_COLAB = False
else:
  # Mount Google Drive; a failure here is surfaced instead of being hidden.
  drive.mount('/content/drive')
  IN_COLAB = True
print(f"am I in Colab? {IN_COLAB}")

am I in Colab? False


# Imports

In [37]:
import os
import json
import pandas as pd
import re
from tqdm import tqdm
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import TokenTextSplitter


from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain_ollama import OllamaLLM
from langchain_ollama import ChatOllama
from langchain_openai import ChatOpenAI

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableLambda

# Model
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.embeddings import Embeddings

# Elasticsearch
from elasticsearch import Elasticsearch

# Settings

In [6]:
# Runtime-dependent roots: the project base directory and the backups location.
if IN_COLAB:
  BASE_DIR, BACKUPS_DATA_DIR = ".", "/content/drive/MyDrive/Colab Notebooks/Dataclub/llm/data"
else:
  BASE_DIR, BACKUPS_DATA_DIR = "..", "../backups"

# Raw data
RAW_DATA_DIR = f"{BASE_DIR}/data/raw"
RAW_DOCS_DATA_DIR = f"{RAW_DATA_DIR}/documents"
RAW_INFO_DATA_DIR = f"{RAW_DATA_DIR}/info"

# Preprocessed data
PROCESSED_DATA_DIR = f"{BASE_DIR}/data/processed"
PROCESSED_DOCS_DATA_DIR = f"{PROCESSED_DATA_DIR}/documents"

# Indexing data
INDEXING_DATA_DIR = f"{BASE_DIR}/data/indexing"
INDEXING_DOCS_DATA_DIR = f"{INDEXING_DATA_DIR}/documents"

# Test data (optimization results and ground truth)
TEST_DATA_DIR = f"{BASE_DIR}/data/test"
TEST_OPTIMIZATION_DATA_DIR = f"{TEST_DATA_DIR}/optimization"
GROUND_TRUTH_DATA_DIR = f"{TEST_DATA_DIR}/ground_truth"
GROUND_TRUTH_DOCS_DATA_DIR = f"{GROUND_TRUTH_DATA_DIR}/documents"
GROUND_TRUTH_GEN_DATA_DIR = f"{GROUND_TRUTH_DATA_DIR}/generated"

# Prompt templates shipped with the package
PROMPTS_CONFIG_DIR = f"{BASE_DIR}/cooking_recipe_assistant/config/prompts"

# Raw info files
PLAYLIST_INFO_PATH = f"{RAW_INFO_DATA_DIR}/playlist_info.pkl"
VIDEO_PLAYLIST_MAP_PATH = f"{RAW_INFO_DATA_DIR}/video_playlist_map.pkl"

# Ground-truth file
GROUND_TRUTH_PATH = f"{GROUND_TRUTH_DATA_DIR}/ground-truth-retrieval.csv"

# Optimization result files
REST_OPT_ES_BM25_PATH = f"{TEST_OPTIMIZATION_DATA_DIR}/res-opt-es-bm25.json"
REST_OPT_ES_HYBRID_PATH = f"{TEST_OPTIMIZATION_DATA_DIR}/res-opt-es-hybrid.json"
REST_OPT_ES_HYBRID_RRF_PATH = f"{TEST_OPTIMIZATION_DATA_DIR}/res-opt-es-hybrid-rrf.json"

# Report a missing raw-data directory before it gets created below.
if not os.path.exists(RAW_DATA_DIR):
  print("Not exists dir: ", RAW_DATA_DIR)

# Create the working directories (no error if they already exist).
for _required_dir in (
  RAW_DATA_DIR,
  PROCESSED_DATA_DIR,
  TEST_DATA_DIR,
  TEST_OPTIMIZATION_DATA_DIR,
  GROUND_TRUTH_DATA_DIR,
  BACKUPS_DATA_DIR,
):
  os.makedirs(_required_dir, exist_ok=True)

In [4]:
# Enable autoreload (mode 2) so edits to the project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
import os
import sys

# Add BASE_DIR only if it is not already in sys.path
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

# The project package can now be imported
import cooking_recipe_assistant
from cooking_recipe_assistant.commons.utils import (
    read_pickle, 
    read_document, 
    read_text,
    save_pickle, 
    save_document,
    save_text
)

from cooking_recipe_assistant.evaluation.retrievers import evaluate
from cooking_recipe_assistant.evaluation.optimization import run_hyperopt
from cooking_recipe_assistant.rags.retrievers.es_bm25 import es_bm25_query
from cooking_recipe_assistant.rags.retrievers.es_hybrid import es_hybrid_query
from cooking_recipe_assistant.rags.retrievers.es_hybrid_rrf import es_hybrid_rrf_query

In [5]:
# Elasticsearch connection settings
ES_URL = "http://localhost:9200"  # host URL handed to the Elasticsearch client
INDEX_NAME = "cooking-recipes"  # index used by the retriever and the index checks

In [7]:
# Load the per-recipe entry template and print it so its placeholders are visible.
ENTRY_TEMPLATE = read_text(f"{PROMPTS_CONFIG_DIR}/en_entry_template.txt")
print(ENTRY_TEMPLATE)

meals: {meals}
title: {title}
ingredients: {ingredients}
summary: {summary}
instructions: {text}
tips: {tips}


In [8]:
# Elasticsearch client shared by the retriever and the index checks.
ES_CLIENT = Elasticsearch(hosts=[ES_URL])

In [9]:
# Set TOKENIZERS_PARALLELISM to "false" before the embedding model is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Sentence-transformers model name used to build EMBEDDINGS.
MINILM_EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
# Alternative model, currently disabled:
#NMNET_EMBEDDING_MODEL_NAME = 'all-mpnet-base-v2'
# NOTE(review): BATCH_SIZE is not referenced in the visible cells — confirm it is still needed.
BATCH_SIZE = 100

In [10]:
# Embedding model wrapper built from the MiniLM model name.
EMBEDDINGS = HuggingFaceEmbeddings(model_name=MINILM_EMBEDDING_MODEL_NAME)
# Alternative embedding model, currently disabled:
#EMBEDDINGS = HuggingFaceEmbeddings(model_name=NMNET_EMBEDDING_MODEL_NAME)

In [11]:


# LLM settings: Ollama endpoint and the model names used to build the chat model
# NOTE(review): OLLAMA_URL and PLAYLIST_TITLE are not referenced in the visible cells — confirm they are still needed.
OLLAMA_URL = 'http://localhost:11434'
PLAYLIST_TITLE = 'Imperial Stout'
OPENAI_MODEL_NAME = 'gpt-4o-mini'
OLLAMA_MODEL_NAME = 'llama3'
# Earlier direct instantiations, kept disabled:
#llm = OpenAI(model="gpt-4")
#llm = ChatOllama(model=OLLAMA_MODEL_NAME)

In [12]:
# List the prompt template files available in the prompts config directory.
!ls -lh "{PROMPTS_CONFIG_DIR}"

total 68K
-rw-rw-r-- 1 aztleclan aztleclan  109 oct 27 01:30 en_entry_template.txt
-rw-rw-r-- 1 aztleclan aztleclan 1,4K oct 18 17:24 en_prompt_template_blocks.txt
-rw-rw-r-- 1 aztleclan aztleclan  831 oct 26 14:11 en_prompt_template_eval_rag_v1.txt
-rw-rw-r-- 1 aztleclan aztleclan  643 oct 25 17:50 en_prompt_template_eval_rag_v2.txt
-rw-rw-r-- 1 aztleclan aztleclan 1,6K oct 18 20:04 en_prompt_template_extractions.txt
-rw-rw-r-- 1 aztleclan aztleclan 1,6K oct 25 15:44 en_prompt_template_ground_truth_v1.txt
-rw-rw-r-- 1 aztleclan aztleclan 1,5K oct 25 15:47 en_prompt_template_ground_truth_v2.txt
-rw-rw-r-- 1 aztleclan aztleclan 1,2K oct 23 18:16 en_prompt_template_questions.txt
-rw-rw-r-- 1 aztleclan aztleclan  206 oct 24 16:33 en_prompt_template_rag_v1.txt
-rw-rw-r-- 1 aztleclan aztleclan  206 oct 26 15:25 en_prompt_template_rag_v2.txt
-rw-rw-r-- 1 aztleclan aztleclan  346 oct 25 02:00 en_prompt_template_system_assistent.txt
-rw-rw-r-- 1 aztleclan aztleclan 1,5K oct 18 17:24 es_prompt_

In [15]:
# Load the RAG prompt template (v1) that is later passed to create_chain.
BASIC_COOKING_RECIPE_TEMPLATE = read_text(f"{PROMPTS_CONFIG_DIR}/en_prompt_template_rag_v1.txt")

In [16]:
# Print the prompt so its {context} and {question} placeholders can be checked.
print(BASIC_COOKING_RECIPE_TEMPLATE)

You are a cooking recipe asistente. Answer the QUESTION based on the CONTEXT of our recipe database. 
Use only the data in the CONTEXT when answering the QUESTION.

CONTEXT: 
{context}

QUESTION: {question}


# Check Elasticsearch

In [17]:
# Query the cluster info (also confirms the server is reachable) and pretty-print it.
info_es = ES_CLIENT.info()
print(json.dumps(info_es.body, indent=4))

{
    "name": "3cfc0904bf39",
    "cluster_name": "docker-cluster",
    "cluster_uuid": "wg43N1DqSdy9g9z_pOLIDQ",
    "version": {
        "number": "8.4.3",
        "build_flavor": "default",
        "build_type": "docker",
        "build_hash": "42f05b9372a9a4a470db3b52817899b99a76ee73",
        "build_date": "2022-10-04T07:17:24.662462378Z",
        "build_snapshot": false,
        "lucene_version": "9.3.0",
        "minimum_wire_compatibility_version": "7.17.0",
        "minimum_index_compatibility_version": "7.0.0"
    },
    "tagline": "You Know, for Search"
}


In [18]:
# Print the index definition (aliases, mappings, ...) when the index exists.
# NOTE(review): the printed mapping shows a float field `summart_vector` next to
# `summary_vector` — looks like a misspelled field was indexed; verify the indexing code.
if ES_CLIENT.indices.exists(index=INDEX_NAME):
    info_indice = ES_CLIENT.indices.get(index=INDEX_NAME)
    print(json.dumps(info_indice.body, indent=4))

{
    "cooking-recipes": {
        "aliases": {},
        "mappings": {
            "properties": {
                "chunk_number": {
                    "type": "integer"
                },
                "doc_id": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "id": {
                    "type": "text"
                },
                "ingredients": {
                    "type": "keyword"
                },
                "meals": {
                    "type": "keyword"
                },
                "summart_vector": {
                    "type": "float"
                },
                "summary": {
                    "type": "text"
                },
                "summary_vector": {
                    "type": "dense_vector",
        

In [19]:
# Print the index settings when the index exists.
if ES_CLIENT.indices.exists(index=INDEX_NAME):
    settings = ES_CLIENT.indices.get_settings(index=INDEX_NAME)
    print(json.dumps(settings.body, indent=4))

{
    "cooking-recipes": {
        "settings": {
            "index": {
                "routing": {
                    "allocation": {
                        "include": {
                            "_tier_preference": "data_content"
                        }
                    }
                },
                "number_of_shards": "1",
                "provided_name": "cooking-recipes",
                "creation_date": "1729698947472",
                "number_of_replicas": "0",
                "uuid": "xRPfDileSMin8UmdVwEoOQ",
                "version": {
                    "created": "8040399"
                }
            }
        }
    }
}


In [20]:
# Print how many documents the index holds when the index exists.
if ES_CLIENT.indices.exists(index=INDEX_NAME):
    count = ES_CLIENT.count(index=INDEX_NAME)['count']
    print(f"Count={count}")

Count=232


# Check Embedding

In [21]:
# Smoke-test the embedding model on a single sentence.
text = "LangChain is a framework for developing applications powered by language models."


embedding_vector = EMBEDDINGS.embed_query(text)
# Show the container type and vector length, then the first ten components.
print(type(embedding_vector), len(embedding_vector))
print(embedding_vector[:10])

<class 'list'> 384
[-0.03306527063250542, -0.04929625988006592, 0.0011788202682510018, -0.052408862859010696, -0.037587061524391174, 0.025819718837738037, -0.03928518667817116, 0.05620156601071358, 0.0902889296412468, -0.052350059151649475]


# Simple RAG

In [33]:
# Load environment variables from a .env file.
# NOTE(review): presumably this provides OPENAI_API_KEY for ChatOpenAI — confirm.
from dotenv import load_dotenv
load_dotenv()

True

In [35]:
#os.environ["OPENAI_API_KEY"]

In [22]:
def es_retriever(
    query: str
) -> str:
    """Retrieve recipes for ``query`` and return them as one context string.

    Runs ``es_bm25_query`` against ``INDEX_NAME`` through ``ES_CLIENT`` and
    formats the returned documents with ``build_recipe_context``.

    Args:
        query: The user question to search for.

    Returns:
        The formatted context text built from the search results.
    """
    hits = es_bm25_query(
        es_client=ES_CLIENT,
        index_name=INDEX_NAME,
        query=query,
    )
    return build_recipe_context(hits)

In [23]:
# Build the prompt context from the search results
def build_recipe_context(search_results):
    """Render every search result with ``ENTRY_TEMPLATE`` and join them.

    Args:
        search_results: Iterable of mappings; each one is expanded as keyword
            arguments into ``ENTRY_TEMPLATE.format``.

    Returns:
        The rendered entries separated by a blank line, with leading and
        trailing whitespace stripped (an empty string when there are no results).
    """
    rendered_entries = [ENTRY_TEMPLATE.format(**doc) for doc in search_results]
    return "\n\n".join(rendered_entries).strip()

In [24]:
def build_llm(
    model_name:str
):
    """Instantiate the chat model that matches ``model_name``.

    Args:
        model_name: Model identifier. Names starting with ``"llama"`` are built
            with ``ChatOllama``; names starting with ``"gpt"`` with ``ChatOpenAI``.

    Returns:
        The chat model instance for the requested model.

    Raises:
        ValueError: If ``model_name`` starts with neither supported prefix.
    """
    if model_name.startswith("llama"):
        return ChatOllama(model=model_name)
    if model_name.startswith("gpt"):
        return ChatOpenAI(model_name=model_name)
    # ValueError subclasses Exception, so callers catching Exception keep working.
    raise ValueError(f"Not found model name: {model_name}")

In [25]:


def create_chain(model_name:str, template:str):
    """Assemble the question-answering pipeline for one model and one prompt.

    The returned runnable is invoked with the question itself. The question is
    handed to ``es_retriever`` to produce ``context`` and is passed through
    unchanged as ``question``; both values fill ``template``, the resulting
    prompt goes to the model from ``build_llm``, and ``StrOutputParser``
    post-processes the model output.

    Args:
        model_name: Model identifier forwarded to ``build_llm``.
        template: Prompt template text using ``{context}`` and ``{question}``.

    Returns:
        The composed runnable.
    """
    chat_prompt = ChatPromptTemplate.from_template(template=template)
    chat_model = build_llm(model_name)

    # Fan the incoming question out into the two variables the prompt expects.
    prompt_inputs = {"context": es_retriever, "question": RunnablePassthrough()}

    return prompt_inputs | chat_prompt | chat_model | StrOutputParser()

In [26]:
# RAG function with LangChain
def rag(query, model_name=None, template=None):
    """Answer ``query`` with retrieval-augmented generation.

    The previous body called ``search``, ``build_context`` and ``chain``, none
    of which is defined in this notebook, so it failed with NameError on a
    fresh kernel. It now relies only on helpers defined in the notebook.

    Args:
        query: The user question.
        model_name: Model identifier for ``build_llm``; defaults to
            ``OPENAI_MODEL_NAME`` when omitted.
        template: Prompt template with ``{context}`` and ``{question}``;
            defaults to ``BASIC_COOKING_RECIPE_TEMPLATE`` when omitted.

    Returns:
        The model answer as produced by ``StrOutputParser``.
    """
    # Defaults are resolved at call time so later changes to the constants apply.
    if model_name is None:
        model_name = OPENAI_MODEL_NAME
    if template is None:
        template = BASIC_COOKING_RECIPE_TEMPLATE

    # Retrieve and format the context explicitly, then fill the prompt with it.
    context = es_retriever(query)
    prompt = ChatPromptTemplate.from_template(template=template)
    chain = prompt | build_llm(model_name) | StrOutputParser()

    return chain.invoke({
        "question": query,
        "context": context
    })

In [38]:

# Load the retrieval ground truth; its `question` and `id` columns are read by the cells that pick a test query.
df_ground_truth = pd.read_csv(GROUND_TRUTH_PATH)

In [39]:
# Pick one ground-truth row by position and show its question and document id.
row_id = 30
query = df_ground_truth["question"].iloc[row_id]
doc_id = df_ground_truth["id"].iloc[row_id]
print(f"Query : {query}")
print(f"Doc_id: {doc_id}")

Query : What is the best way to cook the potatoes for the Potato and Ground Meat Pie, given that I need to leave the skin on?
Doc_id: lBqnLPKRLuQ@000


In [44]:
# Build the QA chain with the OpenAI model; the Ollama variant is kept disabled.
#qa_ollama_chain = create_chain(OLLAMA_MODEL_NAME, BASIC_COOKING_RECIPE_TEMPLATE)
qa_chain = create_chain(OPENAI_MODEL_NAME, BASIC_COOKING_RECIPE_TEMPLATE)

In [45]:
# Run the chain on the selected question; the chain performs retrieval itself.
prompt_answers = qa_chain.invoke(query)

In [46]:
# Show the generated answer.
print(prompt_answers)

For the Potato and Ground Meat Pie, the best way to cook the potatoes while leaving the skin on is to start by washing the potatoes thoroughly. Then, place them in a pot of water and bring it to a boil over high heat. Once boiling, reduce the heat to medium and cook for about 30 minutes, or until the potatoes are tender. To check for doneness, pierce them with a fork; if it goes in easily, they are ready. After cooking, drain the potatoes and let them cool before peeling.


In [48]:
# Pick another ground-truth row by position and show its question and document id.
row_id = 20
query = df_ground_truth["question"].iloc[row_id]
doc_id = df_ground_truth["id"].iloc[row_id]
print(f"Query : {query}")
print(f"Doc_id: {doc_id}")

Query : What are the exact cooking times for the Delicious Cabracho Cake for Christmas, specifically for the fish and onion preparation?
Doc_id: vJ55fsr81yw@000


In [49]:
# Run the chain on the newly selected question.
# NOTE(review): the disabled call below references `rag_bm25` and `llm`, which are not defined in the visible cells.
#results = rag_bm25(query, llm, template=BASIC_COOKING_RECIPE_TEMPLATE)
q20_answer = qa_chain.invoke(query)

In [50]:
# Show the generated answer.
print(q20_answer)

For the Delicious Cabracho Cake for Christmas, the exact cooking times for the fish and onion preparation are as follows:

- Cook the cabracho fish over medium heat for about **one and a half minutes on each side**.
- Cook the onion over medium heat for about **10 to 15 minutes**.
