<a href="https://colab.research.google.com/github/Diangelion/tourism-recommender/blob/main/tourism_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install -qU deep-translator

In [None]:
# import pandas as pd
# from deep_translator import GoogleTranslator

# # Initialize the translator
# translator = GoogleTranslator(source='indonesian', target='english')

# # Read the paragraphs file
# paragraphs = pd.read_csv("/content/drive/MyDrive/LLM/Rekomendasi Tempat Wisata di Indonesia/Dataset/paragraphs_id.csv")

# # List that collects the translated paragraphs
# translated_list = []

# # Iterate over the paragraphs and translate each one
# for _, row in paragraphs.iterrows():
#     paragraph = row["Paragraph"]
#     try:
#         translated_paragraph = translator.translate(text=paragraph)
#     except Exception as e:
#         translated_paragraph = f"[TRANSLATION ERROR] {e}"
#     translated_list.append(translated_paragraph)

# # Build a new DataFrame from the translated paragraphs
# translated_df = pd.DataFrame({'Paragraph': translated_list})

# # Save to a CSV file
# translated_df.to_csv("/content/drive/MyDrive/LLM/Rekomendasi Tempat Wisata di Indonesia/Dataset/paragraphs_en.csv", index=False)

# START

In [None]:
!pip install -qU \
    langchain-community \
    langchain-chroma \
    langchain-huggingface \
    hf_xet langchain-ollama

In [None]:
#=======================
# Libraries
#=======================
import os
import shutil
import pandas as pd
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.schema import Document

#=======================
# Configuration
#=======================
# Central settings read by the data-loading, splitting, and vector-store code
# in this file.
CONFIG = {
    # CSV file whose rows are read by load_data().
    "data_path": "/content/drive/MyDrive/LLM/tourism-recommender/dataset/paragraphs_en.csv",
    # Directory the Chroma vector store is persisted to and reloaded from.
    "chroma_path": "/content/drive/MyDrive/LLM/tourism-recommender/dataset/tourism_chroma_db",
    # Passed to CharacterTextSplitter as chunk_size / chunk_overlap.
    "chunk_size": 500,
    "chunk_overlap": 200,
    # Forwarded to HuggingFaceEmbeddings as model_name / model_kwargs /
    # encode_kwargs respectively.
    "embedding_model_name": "sentence-transformers/all-mpnet-base-v2",
    "embedding_model_kwargs": {"device": "cpu"},
    "embedding_encode_kwargs": {"normalize_embeddings": False},
    # When True, the directory at "chroma_path" is deleted before the vector
    # store is initialized, which forces it to be rebuilt from the documents.
    "RESET_DB": False
}

#=======================
# Reset DB
#=======================
# Opt-in wipe of the persisted Chroma directory. ignore_errors=True means a
# missing directory (or any other deletion failure) is silently ignored.
if CONFIG["RESET_DB"]:
  shutil.rmtree(CONFIG["chroma_path"], ignore_errors=True)

#=======================
# Data Loading
#=======================
def load_data(path):
    """Read the paragraph CSV at *path* and return its rows as one string.

    The file is read without a header row into a single ``text`` column.
    Every cell is kept as text, missing cells are dropped, and the remaining
    rows are joined with newline characters.

    Args:
        path: Location of the CSV file (anything ``pandas.read_csv`` accepts).

    Returns:
        A single string containing every non-missing row, one per line.
    """
    # dtype=str keeps purely numeric rows as text; without it an all-numeric
    # column is parsed as integers and str.join raises TypeError.
    # NOTE(review): header=None treats the first line as data. If the CSV was
    # written with a header line (the disabled translation cell writes a
    # "Paragraph" header), that header text becomes part of the corpus --
    # confirm this is intended.
    df = pd.read_csv(path, header=None, names=['text'], dtype=str)
    # Missing cells come back as float NaN, which str.join cannot handle, so
    # they are removed before joining.
    return '\n'.join(df['text'].dropna().tolist())

#=======================
# Text Processing
#=======================
def split_text(text):
    """Split *text* into overlapping chunks wrapped as LangChain documents.

    The text is cut on single spaces by ``CharacterTextSplitter`` using the
    ``chunk_size`` and ``chunk_overlap`` values from ``CONFIG``. Each chunk
    gets the ID ``chunk-<position>`` and metadata recording that ID and its
    zero-based position.

    Args:
        text: The full corpus as one string.

    Returns:
        A ``(documents, ids)`` tuple of equal-length lists, in chunk order.
    """
    pieces = CharacterTextSplitter(
        separator=" ",
        chunk_size=CONFIG['chunk_size'],
        chunk_overlap=CONFIG['chunk_overlap'],
    ).split_text(text)

    # IDs are derived purely from the chunk position.
    ids = [f"chunk-{position}" for position in range(len(pieces))]
    documents = [
        Document(
            page_content=piece,
            metadata={"chunk_id": piece_id, "chunk_index": position},
        )
        for position, (piece_id, piece) in enumerate(zip(ids, pieces))
    ]
    return documents, ids

#=======================
# Vector Store Setup
#=======================
def initialize_vector_store(docs, ids, k=10):
    """Open or build the persisted Chroma store and return a retriever over it.

    If the directory at ``CONFIG["chroma_path"]`` exists, the store is opened
    from it; otherwise it is built from *docs* and persisted there. A
    directory that exists but holds no vectors (for example after an
    interrupted first run) is populated from *docs* instead of being used
    empty, because an empty store makes every retrieval return nothing.

    Args:
        docs: Documents to index when the store has to be built or filled.
        ids: One ID per entry of *docs*, in the same order.
        k: Number of documents the retriever returns per query.

    Returns:
        A retriever created with ``as_retriever(search_kwargs={"k": k})``.
        Note that this is a retriever, not the vector store itself.
    """
    embeddings = HuggingFaceEmbeddings(
        model_name=CONFIG["embedding_model_name"],
        model_kwargs=CONFIG["embedding_model_kwargs"],
        encode_kwargs=CONFIG["embedding_encode_kwargs"],
        show_progress=True
    )
    store_dir = CONFIG["chroma_path"]

    if os.path.isdir(store_dir):
        vector_store = Chroma(
            persist_directory=store_dir,
            embedding_function=embeddings
        )
        # Fetching at most one record is a cheap emptiness probe.
        has_vectors = bool(vector_store.get(limit=1)["ids"])
        if has_vectors or not docs:
            print(f"Vector store loaded from {store_dir}")
        else:
            # The directory exists but nothing was indexed: fill it now.
            vector_store.add_documents(documents=docs, ids=ids)
            print(f"Vector store saved to {store_dir}")
    else:
        vector_store = Chroma.from_documents(
            ids=ids,
            documents=docs,
            embedding=embeddings,
            persist_directory=store_dir
        )
        print(f"Vector store saved to {store_dir}")

    return vector_store.as_retriever(search_kwargs={"k": k})

def format_docs_with_metadata(docs):
    """Render documents as numbered, metadata-tagged sections of text.

    Each document becomes a header line
    ``Document <n> (ID: <chunk_id>, Index: <chunk_index>):`` followed by its
    page content, numbered from 1. Missing metadata keys fall back to
    ``"Unknown ID"`` / ``"Unknown Index"``. Sections are separated by one
    blank line.

    Args:
        docs: Iterable of objects exposing ``metadata`` (a dict) and
            ``page_content``.

    Returns:
        The formatted sections joined into one string; empty when *docs* is
        empty.
    """
    def render(position, doc):
        # Build the header from the chunk metadata, then append the content.
        chunk_id = doc.metadata.get("chunk_id", "Unknown ID")
        chunk_index = doc.metadata.get("chunk_index", "Unknown Index")
        return (
            f"Document {position} (ID: {chunk_id}, Index: {chunk_index}):"
            f"\n{doc.page_content}"
        )

    return "\n\n".join(
        render(position, doc) for position, doc in enumerate(docs, start=1)
    )

#=======================
# LLM Setup
#=======================
# Tokenizer/model pairs that have already been loaded, keyed by model name.
_LLM_CACHE = {}


def _load_llm(model_name):
    """Return the (tokenizer, model) pair for *model_name*, loading it once."""
    if model_name not in _LLM_CACHE:
        # Imported lazily so the heavy dependency is only needed on first use.
        from transformers import AutoModelForCausalLM, AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype="auto",
            device_map="auto"
        )
        _LLM_CACHE[model_name] = (tokenizer, model)
    return _LLM_CACHE[model_name]


def initialize_llm(message: str) -> str:
    """Generate a reply to *message* with the Qwen3-1.7B chat model.

    Despite its name, this function runs a full generation. The tokenizer
    and model are loaded on the first call and reused afterwards, so asking
    several questions does not reload the weights each time.

    Args:
        message: Text sent to the model as a single user turn.

    Returns:
        The decoded reply, with special tokens and surrounding newlines
        removed. Anything up to and including the last ``</think>`` token is
        discarded.
    """
    tokenizer, model = _load_llm("Qwen/Qwen3-1.7B")

    # Prepare the model input as a one-turn chat with thinking mode disabled.
    messages = [{"role": "user", "content": message}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=32768
    )
    # Keep only the newly generated tokens (drop the echoed prompt).
    output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

    # Locate the position just past the last 151668 (</think>) token; when it
    # is absent the whole output is the answer.
    try:
        index = len(output_ids) - output_ids[::-1].index(151668)
    except ValueError:
        index = 0
    return tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

#=======================
# RAG Pipeline
#=======================
def create_rag_pipeline(retriever):
  """Assemble the retrieval-augmented question-answering chain.

  The chain takes a dict with a "question" key, retrieves and formats
  context for it, fills the chat prompt, flattens the two prompt messages
  into one string, sends that string to initialize_llm, and parses the
  result with StrOutputParser.

  Args:
      retriever: Object whose invoke(question) returns the documents to use
          as context.

  Returns:
      The composed runnable chain.
  """
  system_template = (
      """
      You are an expert travel assistant for Indonesia.
      Use this context to answer questions:
      {context}
      """
  )

  def fetch_context(inputs):
    # Retrieve documents for the question and render them as prompt context.
    return format_docs_with_metadata(retriever.invoke(inputs["question"]))

  def pick_question(inputs):
    return inputs["question"]

  def flatten_prompt(prompt_value):
    # Collapse the system and human messages into one plain-text prompt.
    system_message = prompt_value.messages[0]
    human_message = prompt_value.messages[1]
    return system_message.content + "\nQuestion: " + human_message.content

  def run_llm(prompt_text):
    return initialize_llm(prompt_text)

  prompt = ChatPromptTemplate.from_messages([
      ("system", system_template),
      ("human", "{question}")
  ])

  prompt_inputs = {"context": fetch_context, "question": pick_question}
  retrieval_chain = (
      prompt_inputs
      | prompt
      | flatten_prompt
      | run_llm
      | StrOutputParser()
  )
  return retrieval_chain

#=======================
# Main Execution
#=======================
# Script entry point: build the pipeline, answer one hard-coded question, and
# print the result. Everything, including the farewell message, stays inside
# the guard so importing this file produces no output.
if __name__ == "__main__":
    raw_text = load_data(CONFIG["data_path"])
    docs, ids = split_text(raw_text)
    # NOTE: despite the name, initialize_vector_store returns a retriever.
    vector_store = initialize_vector_store(docs, ids)
    qa_chain = create_rag_pipeline(vector_store)

    # The "Type 'exit'" hint refers to the interactive loop that is currently
    # disabled below.
    print("Welcome to Indonesia Travel Assistant! Type 'exit' to end.")
    user_input = "Bring me the best and cheapest place to visit in Surabaya related to history! Tell me the reason too"
    try:
        result = qa_chain.invoke({ "question": user_input })
        print(f"\n💡 Answer: {result}")
    except Exception as e:
        # Top-level boundary: report the failure instead of crashing the cell.
        print(f"\n🚨 Error: {e}")

    # while True:
    #   user_input = input("\nYour question: ").strip()
    #   if user_input.lower() in ['exit', 'quit']:
    #     break

    #   try:
    #     result = qa_chain.invoke({ "question": user_input })
    #     print(f"\n💡 Answer: {result}")
    #   except Exception as e:
    #     print(f"\n🚨 Error: {str(e)}")

    print("\nThank you for using the travel assistant!")