Ref: https://python.langchain.com/docs/tutorials/rag/

### Load model

In [None]:
from dotenv import load_dotenv

# Pull settings from a local .env file into the environment before the model
# client is created (presumably the Hugging Face API token lives there — confirm).
load_dotenv()

# https://python.langchain.com/docs/integrations/chat/huggingface/#huggingfacepipeline
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

# Text-generation endpoint for the Llama 3.2 3B instruct model.
# The commented-out options are alternative generation settings kept for reference.
hf_endpoint = HuggingFaceEndpoint(
    task="text-generation",
    repo_id="meta-llama/Llama-3.2-3B-Instruct",
    temperature=0,
    # max_new_tokens=512,
    # do_sample=False,
    # repetition_penalty=1.03,
)

# `llm` is the chat-style wrapper around the endpoint; this is the object the
# rest of the notebook pipes prompts into.
llm = ChatHuggingFace(llm=hf_endpoint)


### Data loading

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# https://python.langchain.com/docs/integrations/document_loaders/pypdfloader/#add-a-custom-pages_delimiter-to-identify-where-are-ends-of-pages-in-single-mode
# Source PDF and the marker text inserted between pages when the file is
# loaded in "single" mode (see the documentation link above).
jung_pdf_path = "./assets/data/C.-G.-Jung-Collected-Works-Volume-9i_-The-Archetypes-of-the-Collective-Unconscious.pdf"
page_end_marker = "\n-------THIS MARKS THE END OF THIS PAGE-------\n"

loader = PyPDFLoader(
    jung_pdf_path,
    pages_delimiter=page_end_marker,
    mode="single",
)
docs = loader.load()

# Quick sanity check on how many Document objects came back.
# NOTE(review): "single" mode is expected to yield exactly one Document, but
# the assertion below was left disabled by the author — confirm before enabling.
print(len(docs))
# assert len(docs) == 1

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# Column names handed to the CSV reader, in file order.
# NOTE(review): when explicit fieldnames are supplied, csv.DictReader treats the
# first row of the file as data. If this CSV has a header row, that header is
# loaded as element 0 — possibly why element 1 is printed below. Also confirm
# that this list matches the file's actual columns one-to-one.
mental_health_columns = [
    "Gender",
    "Country",
    "Occupation",
    "self_employed",
    "family_history",
    "treatment",
    "Days_Indoors",
    "Growing_Stress",
    "Changes_Habits",
    "Mental_Health_History",
    "Mood_Swings",
    "Coping_Struggles",
    "Work_Interest",
    "Social_Weakness",
    "mental_health_interview",
    "care_options",
]

loader = CSVLoader(
    file_path="assets/data/Mental Health Dataset.csv",
    csv_args={"delimiter": ",", "fieldnames": mental_health_columns},
)
mental_health_data = loader.load()

# Show the number of loaded rows and one sample row.
print(len(mental_health_data))
print(mental_health_data[1])

In [None]:
import json

# file_path = "assets/data/jstorsample.jsonl"

# dream_json = {}
# with open(file_path, "r", encoding="utf-8") as f:
#     for i, line in enumerate(f):
#         entry = json.loads(line.strip())
#         # if "dream_description" in entry:
#         # content = f"Dream: {entry['dream_description']}"
#         # docs.append(entry)
#         dream_json[str(i)] = entry

# dump_path = "assets/data/jsonl_parsed.json"
# with open(dump_path, "w") as f:
#     json.dump(dream_json, f)

from langchain_community.document_loaders import JSONLoader

# Loader options for the JSON Lines sample:
#   - json_lines=True: the file holds one JSON value per line.
#   - jq_schema=".": the jq identity filter, i.e. take each record whole.
#   - text_content=False: records need not be plain strings (presumably the
#     loader serialises them to text for page_content — confirm).
jsonl_loader_kwargs = dict(
    file_path="assets/data/jstorsample.jsonl",
    json_lines=True,
    jq_schema=".",
    text_content=False,
    is_content_key_jq_parsable=False,
)

loader = JSONLoader(**jsonl_loader_kwargs)
dream_docs = loader.load()

# Number of records loaded from the file.
print(len(dream_docs))

### Text splitting, embedding and indexing

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# One splitter shared by every corpus: 1000-character chunks that overlap by
# 200 characters, with each chunk recording its start offset in the source
# document's metadata.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=200,
    chunk_size=1000,
)

# Chunk the Jung PDF and the dream records with the same settings.
# csv_splits = text_splitter.split_documents(mental_health_data)
pdf_splits, jsonl_splits = (
    text_splitter.split_documents(corpus) for corpus in (docs, dream_docs)
)



In [None]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings # local
from langchain_core.vectorstores import InMemoryVectorStore
import torch

# Run the sentence-transformer on the GPU when one is available, otherwise fall
# back to the CPU so the notebook still runs on machines without CUDA (the
# previous hard-coded "cuda" made this cell fail there).
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

# Local embedding model shared by every vector store in this notebook.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={"device": embedding_device},
)

In [None]:
# Create In-Memory Vector Stores
# One independent store per corpus; both use the same embedding model.
# mental_health_store = InMemoryVectorStore(embeddings)
jung_store, dream_store = (
    InMemoryVectorStore(embeddings),  # chunks of the Jung PDF
    InMemoryVectorStore(embeddings),  # chunks of the dream records
)


In [None]:
# Add embeddings to the stores
# Each store is recreated empty right before it is filled, so re-running this
# cell replaces the index instead of appending a second copy of every chunk
# (duplicate chunks would otherwise crowd the similarity-search results).
# mental_health_store.add_documents(documents=csv_splits)
# print("mental health done")
jung_store = InMemoryVectorStore(embeddings)
jung_store.add_documents(documents=pdf_splits)
print("jung book done")

dream_store = InMemoryVectorStore(embeddings)
dream_store.add_documents(documents=jsonl_splits)
print("jsonl done")

### Information retrieval and generation

In [None]:
from langchain_core.prompts import ChatPromptTemplate
import textwrap

def interpret_dream_rag(dream_text: str, k: int = 4):
    """Interpret a dream using passages retrieved from the Jung and dream stores.

    The dream text is used as the similarity-search query against both
    ``jung_store`` and ``dream_store``. The retrieved passages are joined into
    one context string, placed into the prompt, and sent to ``llm``.

    Args:
        dream_text: The user's description of the dream; also the search query.
        k: Number of passages to retrieve from *each* store. Defaults to 4,
            the vector store's own default, so existing calls behave as before.

    Returns:
        The message produced by the chat model; its ``.content`` attribute
        holds the generated interpretation text.
    """
    # Retrieve context from all sources (Jung passages first, then dream records).
    # mental_health_results = mental_health_retriever.get_relevant_documents(dream_text)
    retrieved = [
        *jung_store.similarity_search(dream_text, k=k),
        *dream_store.similarity_search(dream_text, k=k),
    ]

    # Blank line between passages so they stay visually separate in the prompt.
    context = "\n\n".join(doc.page_content for doc in retrieved)

    prompt = ChatPromptTemplate.from_template(
        """User's Dream: {dream_text}

    Context from Jungian Archetypes, and Dream History:
    {context}

    Generate a deep and meaningful dream interpretation based on these insights.
    Answer:"""
    )

    # Pipe the filled-in prompt straight into the chat model.
    chain = prompt | llm
    return chain.invoke({"dream_text": dream_text, "context": context})

# Run one sample dream through the pipeline and wrap the reply at 120 columns
# so it is readable in the cell output.
sample_interpretation = interpret_dream_rag("I dreamt of falling off of a bridge")
print(textwrap.fill(sample_interpretation.content, width=120))
