In [1]:
import requests
import json
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.document_loaders import PyPDFLoader
import chromadb
from chromadb.config import Settings
import os


api_key = os.getenv("HUGGING_FACE_API_KEY")

In [2]:
# from langchain.embeddings import HuggingFaceHubEmbeddings
# from langchain.document_loaders import PyPDFLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter


# def embed_pdf(pdf_path, api):
#     # 1. Load PDF  LangChain DocumentLoader
#     loader = PyPDFLoader(pdf_path)
#     documents = loader.load()

#     # 2. Konfigurasi HuggingFace Embeddings
#     hf_embeddings = HuggingFaceHubEmbeddings(
#         # Ganti dengan token API Anda
#         huggingfacehub_api_token=api_key
#     )

#     # 3. splittingText
#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

#     # 4. Generate embeddings
#     all_embeddings = []
#     for doc in documents:
#         chunks = text_splitter.split_text(doc.page_content)
#         embeddings = [hf_embeddings.embed_query(chunk) for chunk in chunks]
#         all_embeddings.extend(embeddings)
    
#     texts = [chunk for doc in documents for chunk in text_splitter.split_text(doc.page_content)]

#     print(
#         f"Berhasil memproses {len(documents)} halaman dari PDF dan menghasilkan {len(all_embeddings)} embeddings.")
#     return texts, all_embeddings


# pdf_path = "./pdf/what-is-generative-ai.pdf"
# texts, embeddings = embed_pdf(pdf_path, api_key)

In [3]:
from langchain.embeddings import HuggingFaceHubEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


def embed_pdf(pdf_path, api_key):
    """
    Membaca PDF, membagi teks menjadi chunk, dan melakukan embedding menggunakan HuggingFaceHubEmbeddings.
    """
    # 1. Load PDF dengan LangChain DocumentLoader
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()

    # Gabungkan teks dari semua halaman
    combined_text = " ".join([doc.page_content for doc in documents])

    # 2. Text Splitter untuk membagi teks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_text(combined_text)

    # 3. HuggingFaceHub Embeddings
    hf_embeddings = HuggingFaceHubEmbeddings(
        huggingfacehub_api_token=api_key  # Ganti dengan token API Anda
    )

    # 4. Generate embeddings
    embeddings = [hf_embeddings.embed_query(chunk) for chunk in chunks]

    print(f"Berhasil memproses PDF dan menghasilkan {len(chunks)} embeddings.")
    return chunks, embeddings


# Contoh penggunaan
pdf_path = "./pdf/what-is-generative-ai.pdf"
texts, embeddings = embed_pdf(pdf_path, api_key)

  hf_embeddings = HuggingFaceHubEmbeddings(


Berhasil memproses PDF dan menghasilkan 31 embeddings.


In [4]:
embeddings

[[0.012533633969724178,
  0.019134879112243652,
  -0.037956640124320984,
  -0.058241475373506546,
  -0.06099347397685051,
  0.011050425469875336,
  0.03045693226158619,
  0.03245551884174347,
  0.028125585988163948,
  0.0003992808051407337,
  0.04311973601579666,
  0.01330845057964325,
  -0.026705743744969368,
  0.061459414660930634,
  0.04119594395160675,
  -0.07090868055820465,
  0.023656977340579033,
  -0.002657208824530244,
  -0.0020522919949144125,
  -0.021724088117480278,
  -0.03979373350739479,
  0.01669144071638584,
  -0.01146348100155592,
  0.05043932422995567,
  -0.07954012602567673,
  -0.07778148353099823,
  0.006548003293573856,
  -0.013856688514351845,
  -0.017036130651831627,
  -0.010513318702578545,
  -0.023823672905564308,
  -0.005813556257635355,
  0.00024434959050267935,
  0.0516364760696888,
  2.0401178062456893e-06,
  -0.05325966700911522,
  0.004281196277588606,
  0.008701316080987453,
  -0.011868957430124283,
  -0.021700004115700722,
  0.06256937235593796,
  0.043

In [6]:
from langchain.vectorstores import Chroma
from langchain.schema import Document
from chromadb.config import Settings


def store_vector(texts, embeddings, collection_name="generative_ai"):
    """
    Menyimpan teks dan embedding ke ChromaDB menggunakan LangChain.
    """
    # Inisialisasi ChromaDB Client
    settings = Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory="./chromadb"  # Lokasi penyimpanan database
    )
    vector_store = Chroma(
        collection_name=collection_name,
        embedding_function=None,  # Embedding sudah dihitung sebelumnya
        persist_directory="./chromadb",
        client_settings=settings
    )

    # Tambahkan teks dan embeddings ke ChromaDB
    vector_store.add_texts(
        texts=texts,
        metadatas=[{"chunk_id": f"chunk-{i}"} for i in range(len(texts))],
        ids=[f"chunk-{i}" for i in range(len(texts))],
        embeddings=embeddings
    )

    # Simpan perubahan ke disk
    vector_store.persist()
    print(f"Data berhasil disimpan ke koleksi: {collection_name}")


# Contoh penggunaan
store_vector(texts, embeddings, collection_name="generative_ai")

ValueError: [91mYou are using a deprecated configuration of Chroma.

[94mIf you do not have data you wish to migrate, you only need to change how you construct
your Chroma client. Please see the "New Clients" section of https://docs.trychroma.com/deployment/migration.
________________________________________________________________________________________________

If you do have data you wish to migrate, we have a migration tool you can use in order to
migrate your data to the new Chroma architecture.
Please `pip install chroma-migrate` and run `chroma-migrate` to migrate your data and then
change how you construct your Chroma client.

See https://docs.trychroma.com/deployment/migration for more information or join our discord at https://discord.gg/8g5FESbj for help![0m