In [None]:
import langchain
import chromadb
import unstructured
import pypdf
import fitz  
import os
import numpy

In [None]:
import fitz  
import os

pdf_files = [' file6.pdf', ' file7.pdf']

output_folder = "extracted_images"

os.makedirs(output_folder, exist_ok=True)
image_count = 0

for pdf_path in pdf_files:
    doc = fitz.open(pdf_path)
    print(f" Processing {pdf_path}...")

    for page_num in range(len(doc)):
        page = doc[page_num]
        images = page.get_images(full=True)

        for img_index, img in enumerate(images):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            file_name = os.path.basename(pdf_path).replace(".pdf", "")
            image_filename = f"{output_folder}/{file_name}_page{page_num+1}_img{img_index+1}.{image_ext}"

            with open(image_filename, "wb") as f:
                f.write(image_bytes)
            image_count += 1

print(f"\n Extracted {image_count} image(s) from file6.pdf and file7.pdf into {output_folder}/")


In [None]:
from langchain_community.document_loaders import PyPDFLoader
import os

pdf_folder = "/Users/apple/Documents/GitHub/Marbet_Challenge/rag_bot"
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith(".pdf")]

output_folder = os.path.join(pdf_folder, "extracted_texts")
os.makedirs(output_folder, exist_ok=True)

for filename in pdf_files:
    pdf_path = os.path.join(pdf_folder, filename)
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()

    combined_text = "\n\n".join([doc.page_content.strip() for doc in docs])

    txt_filename = os.path.splitext(filename)[0] + ".txt"
    output_path = os.path.join(output_folder, txt_filename)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(combined_text)

    print(f" Clean text saved: {txt_filename}")


In [None]:
folder_path = "/Users/apple/Documents/GitHub/Marbet_Challenge/rag_bot/extracted_texts"

file_names = [
    "guest_WiFi_access.txt",
    "A-Z_Listing.txt",
    "Checklist.txt",
    "SPA_services.txt",
    "application_process.txt",
    "entry_Canada.txt"
]

output_path = os.path.join(folder_path, "combined_txt.txt")

with open(output_path, "w", encoding="utf-8") as outfile:
    for fname in file_names:
        file_path = os.path.join(folder_path, fname)
        with open(file_path, "r", encoding="utf-8") as infile:
            outfile.write(f"\n\n--- {fname.replace('.txt','')} ---\n\n")  
            outfile.write(infile.read())
            outfile.write("\n")

print("All files merged into combined_txt.txt")

In [2]:
from langchain_ollama import ChatOllama
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain


In [20]:
model = ChatOllama(
    base_url="http://194.171.191.226:3061",
    model="llama3.2",
    temperature=0.7,
    num_predict=512,
    repeat_penalty=1.2,
)

messages = [
    ("system",
     "You are the AI core of **Marbet Bot**, a Python service that "
     "analyzes sports odds and generates betting tips. "
     "Always:\n"
     " • explain in short, clear paragraphs\n"
     " • use bullet lists for steps or data outputs\n"
     " • speak as if you know Marbet Bot’s codebase and architecture\n"
     " • ask for missing details about data sources or endpoints\n")
]

ai_msg = model.invoke(messages)
print(ai_msg)

content='' additional_kwargs={} response_metadata={'model': 'llama3.2', 'created_at': '2025-04-25T08:33:50.295362666Z', 'done': True, 'done_reason': 'stop', 'total_duration': 64298691, 'load_duration': 49209994, 'prompt_eval_count': 85, 'prompt_eval_duration': 12861000, 'eval_count': 1, 'eval_duration': 22000, 'model_name': 'llama3.2'} id='run-ff8f6906-0fe6-4b65-bfeb-96c130ea1424-0' usage_metadata={'input_tokens': 85, 'output_tokens': 1, 'total_tokens': 86}


In [21]:
filepath = "/Users/apple/Documents/GitHub/Marbet_Challenge/rag_bot/combined_txt.txt"
loader = TextLoader(filepath)
docs = loader.load()

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
content = splitter.split_documents(docs)


In [22]:
embedding = OllamaEmbeddings(model="mxbai-embed-large:latest")
vector_store = DocArrayInMemorySearch.from_documents(documents=content, embedding=embedding)
retriever = vector_store.as_retriever()


In [23]:
template = """
You are an assistant for guests attending a Marbet event.

Only use the provided context to answer the question. Be concise and to the point.
Do not guess or make up information. 
If the answer is not in the context, say:
"I don't know. Please contact +49 791 49380 100 or info@marbet.com. Our website: www.marbet.com — the team will be happy to help you."

Context:
{context}

Question: {question}
Answer (max 2-3 sentences):
"""

prompt = PromptTemplate(input_variables=["context", "question"], template=template)
chain = LLMChain(llm=model, prompt=prompt)


In [24]:
questions = [
    "how to alert the medical team?",
    "where is library?",
    "where is the gym?",
    "where is the pool?"
]

for q in questions:
    retrieved_docs = retriever.invoke(q)
    context = "\n".join([doc.page_content for doc in retrieved_docs])

    if not retrieved_docs or context.strip() == "":
        print(f"Question: {q}\nAnswer: I don't know. Please contact +49 791 49380 100 or info@marbet.com. Website: www.marbet.com\n")
    else:
        response = chain.invoke({"context": context, "question": q})
        print(f"Question: {q}\nAnswer: {response}\n")


Question: how to alert the medical team?
Answer: {'context': 'A\n\nAlarm  \nMedical emergency: To alert a medical team, there is an alarm button on the phone in your suite.  \nGeneral Alarm: As soon as the general alarm sounds, please follow the instructions and announcements on board.\nImportant notes\n\nThe general emergency signal consists of 7 short and one long tone, which sounds over the ship\'s loudspeaker system. As soon as you hear this signal, even if the yacht is in port, please return to your cabin immediately. Please put on your lifejacket and warm clothing (Personal Survival Kit) and go to your assembly point, which is also indicated on your door.\nFirst aid  \nIf you need first aid or other medical assistance, please contact Guest Services, the medical center on deck 3 or one of the crew members.\n\nF\n\nBinoculars  \nAll suites are equipped with binoculars for use during the cruise equipped. Please bring these to return to your suite at the end of the journey.\nSee [tra