## Initial setup

In [83]:
import dotenv

from llmware.configs import LLMWareConfig, ChromaDBConfig
from llmware.models import ModelCatalog
from llmware.library import Library
from llmware.retrieval import Query
from llmware.prompts import Prompt
from llmware.embeddings import EmbeddingChromaDB

In [67]:
# Load environment variables
dotenv.load_dotenv()

# Set default home directory (may cause some issues)
# https://github.com/llmware-ai/llmware/issues/919
# LLMWareConfig().set_home('data')

# Set log verbosity
LLMWareConfig().set_config('debug_mode', 2)

In [68]:
# Query (context) for searching knowledge on our documents
# The most relevant results (similarity) will be passed to RAG prompt
# The context usually should be a keyword/topic, but question may also work
input_query = 'fufufafa'
# The actual prompt question/input
input_prompt = 'Is Gibran behind fufufafa?'

## Prepare library

And add documents to that library

In [86]:
# # Download data
# r = requests.get(url)
# with open('data/files.zip', 'wb') as f:
#     f.write(r.content)

# # Extract data
# with zipfile.ZipFile('data/files.zip', 'r') as f:
#     f.extractall('data/files')

In [69]:
# Set database engine to save library data
# If using SQLite, db will be saved to "accounts" folder
LLMWareConfig().set_active_db('sqlite')

try:
    # Delete library if already exist to ensure fresh start
    Library().delete_library(
        library_name = 'fufufafa',
        account_name = 'dhika',
        confirm_delete = True
    )
except: pass

lib = Library().create_new_library(
    library_name = 'fufufafa',
    account_name = 'dhika'
)

# Folder containing files to add to library
lib_folder = 'data/files'

# There is also "add_website" function
# But can cause rate limit and IP blacklist
lib.add_files(
    # Files will be copied to "uploads" folder once added to library
    input_folder_path = lib_folder,
    # Min characters per embedding block
    # Usually separated per X sentences
    chunk_size = 400,
    # Max characters per embedding block
    max_chunk_size = 600
)

[37mINFO: update:  Duplicate files (skipped): 0[39m
[37mINFO: update:  Total uploaded: 10[39m
[37mINFO: Parser - parse_text file - processing - 1.txt[39m
[37mINFO: Parser - parse_text file - processing - 10.txt[39m
[37mINFO: Parser - parse_text file - processing - 2.txt[39m
[37mINFO: Parser - parse_text file - processing - 3.txt[39m
[37mINFO: Parser - parse_text file - processing - 4.txt[39m
[37mINFO: Parser - parse_text file - processing - 5.txt[39m
[37mINFO: Parser - parse_text file - processing - 6.txt[39m
[37mINFO: Parser - parse_text file - processing - 7.txt[39m
[37mINFO: Parser - parse_text file - processing - 8.txt[39m
[37mINFO: Parser - parse_text file - processing - 9.txt[39m


{'docs_added': 10,
 'blocks_added': 41,
 'images_added': 0,
 'pages_added': 10,
 'tables_added': 0,
 'rejected_files': []}

## Prepare embedding model

And also the database engine to store the vector

In [70]:
# ModelCatalog().list_embedding_models()

In [71]:
# Database engine to save vector embedding
# If using ChromaDB, db will be saved to "accounts" folder
LLMWareConfig().set_vector_db('chromadb')

# Model will be downloaded from Hugging Face if not exist yet
# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
lib.install_new_embedding(
    embedding_model_name = 'mini-lm-sbert',
    batch_size = 100
)

print('Vector database path:', ChromaDBConfig().get_config("persistent_path"))
# Get embedding status and our documents
lib.get_library_card()

[37mINFO: update: embedding_handler - ChromaDB - Embeddings Created: 41 of 41[39m
[37mINFO: update: EmbeddingHandler - ChromaDB - embedding_summary - {'embeddings_created': 41, 'embedded_blocks': 41, 'embedding_dims': 384, 'time_stamp': '2024-10-14_113127'}[39m


Vector database path: C:\Users\Dhika\llmware_data\accounts\


{'_id': 1,
 'library_name': 'fufufafa',
 'embedding': [{'embedding_status': 'yes',
   'embedding_model': 'mini-lm-sbert',
   'embedding_db': 'chromadb',
   'embedding_dims': 384,
   'embedded_blocks': 41,
   'time_stamp': '2024-10-14_113127'}],
 'knowledge_graph': 'no',
 'unique_doc_id': 10,
 'documents': 10,
 'blocks': 41,
 'images': 0,
 'pages': 10,
 'tables': 0,
 'account_name': 'dhika'}

In [72]:
# Query with our library as source
query = Query(lib)

# Query (search) something on library
# The knowledge will be passed to our prompt later
results = query.semantic_query(
    query = input_query,
    result_count = 20
)

for result in results:
    print('Text:', repr(result['text']))
    # Chunk size (sliced text from long document)
    # Each chunk will be saved to a unique block id
    # Chunk can also be passed to RAG prompt as knowledge later
    print('Text chunk/character size:', len(result['text']))
    # Block id on the database
    print('Block (chunk) id:',result['block_ID'])
    print('File source:',result['file_source'])
    print('Page number:', result['page_num'])
    print('Vector distance:', result['distance'], '\n')

Text: ' The comment history of this account, allegedly belonging to President Joko "Jokowi" Widodo\'s eldest son, dated back to before the 2014 election and extended to 2019 when the rivalry between Prabowo and Jokowi peaked.\n\nAn X user has even compiled a collection of Fufufafa\'s digital footprints. In this thread, the netizen claimed that Gibran allegedly used the usernames Raka Gnarly and Fufufafa on Kaskus.\n\nMany netizens shared several screenshots of uploads made by the Fufufafa account in the thread.'
Text chunk/character size: 503
Block (chunk) id: 1
File source: 1.txt
Page number: 1
Vector distance: 0.2632857859134674 

Text: '\nA former government buzzer and two close associates of the Solo family—a reference to Jokowi’s family—who were familiar with the operation of the Fufufafa account, stated that the account was operated by a team working under the account’s owner. The team received content orders from the owner, who sometimes operated the account themselves.\n\nMeanw

In [73]:
# Get the top-N query results only
# Ordered from the lowest vector distance
# Fewer results = faster response
top_n = 5
results_top = sorted(results, key = lambda x: x['distance'], reverse = True)
results_top = results_top[:top_n]

for result in results_top:
    print('Text:', repr(result['text']))
    print('Vector distance:', result['distance'], '\n')

Text: ' Before and during his time as Mayor of Solo, Tempo had exchanged messages with Gibran via the same phone number.\n\nDuring a visit to Solo, Central Java, on September 10, 2024, Gibran refused to comment on the Fufufafa account controversy. "I don\'t know, ask the account owner. Why ask me?" Several sources close to the Presidential Palace revealed that Gibran had repeatedly denied that the account was his.'
Vector distance: 0.5310319066047668 

Text: " She emphasized the need to verify the account's ownership.\n\n“It's not certain yet. I don't know if it belongs to him or not. Make sure of it first,” said Grace at the Presidential Palace complex in Jakarta on Tuesday, September 3, 2024, after a meeting on Govtech with President Jokowi and staffers."
Vector distance: 0.4948948919773102 

Text: "\nDuring the meeting, Prabowo discussed improving relations between the two countries across various sectors, particularly in defense. The Emir of Qatar also expressed his desire for the 

### Misc embedding behind-the-scene
Simplified process of the `semantic_query` function

For education purpose only, the cell below can be deleted and it won't affect anything

In [74]:
# query.embedding_model = llmware.models.HFEmbeddingModel
# query.embedding_model.tokenizer = BertTokenizerFast
# query.embedding_model.model = BertModel with BertEmbeddings and BertEncoder

# Bert tokenizer using our embedding model
tokenized_text = query.embedding_model.tokenizer(list(input_query))
# Create embedding vector using 
embedded_vector = query.embedding_model.embedding(input_query)

print(f'Input text (length: {len(input_query)}):', repr(input_query))
print(f'Tokenized text (length: {len(tokenized_text["input_ids"])}):', tokenized_text['input_ids'])
print('Embedded vector shape:', repr(embedded_vector.shape))

EmbeddingChromaDB(lib, query.embedding_model).search_index(
    embedded_vector[0]
)

Input text (length: 8): 'fufufafa'
Tokenized text (length: 8): [[101, 1042, 102], [101, 1057, 102], [101, 1042, 102], [101, 1057, 102], [101, 1042, 102], [101, 1037, 102], [101, 1042, 102], [101, 1037, 102]]
Embedded vector shape: (1, 384)


[({'_id': 2,
   'block_ID': 1,
   'doc_ID': 1,
   'content_type': 'text',
   'file_type': 'txt',
   'master_index': 1,
   'master_index2': 0,
   'coords_x': 0,
   'coords_y': 0,
   'coords_cx': 0,
   'coords_cy': 0,
   'author_or_speaker': '',
   'added_to_collection': '2024-10-14_113114',
   'file_source': '1.txt',
   'table': '',
   'modified_date': '',
   'created_date': '',
   'creator_tool': '',
   'external_files': '',
   'text': ' The comment history of this account, allegedly belonging to President Joko "Jokowi" Widodo\'s eldest son, dated back to before the 2014 election and extended to 2019 when the rivalry between Prabowo and Jokowi peaked.\n\nAn X user has even compiled a collection of Fufufafa\'s digital footprints. In this thread, the netizen claimed that Gibran allegedly used the usernames Raka Gnarly and Fufufafa on Kaskus.\n\nMany netizens shared several screenshots of uploads made by the Fufufafa account in the thread.',
   'header_text': '',
   'text_search': ' The c

## Prepare model

In [75]:
# ModelCatalog().list_generative_local_models()

In [80]:
# Auto-download model first if necessary
# https://huggingface.co/collections/llmware
model_name = 'llmware/bling-tiny-llama-v0'

# Do not load model again if already declared
# Prevent out of memory if the cell is executed multiple times
if 'prompter' not in globals():
    # Temperature 0 = more deterministic, temperature 1 = more creative
    prompter = Prompt().load_model(model_name, temperature = 0.3)

# Forget previous source material (if already added)
# In case we want to ask different topic
prompter.clear_source_materials()

prompter.add_source_query_results(results_top)
responses = prompter.prompt_with_source(
    prompt = input_prompt,
    prompt_name = 'default'
)

responses

[{'llm_response': 'No, Gibran is not behind the Fufufafa account.',
  'prompt': 'Is Gibran behind fufufafa?',
  'evidence': ' Before and during his time as Mayor of Solo, Tempo had exchanged messages with Gibran via the same phone number.\n\nDuring a visit to Solo, Central Java, on September 10, 2024, Gibran refused to comment on the Fufufafa account controversy. "I don\'t know, ask the account owner. Why ask me?" Several sources close to the Presidential Palace revealed that Gibran had repeatedly denied that the account was his.\n She emphasized the need to verify the account\'s ownership.\n\n“It\'s not certain yet. I don\'t know if it belongs to him or not. Make sure of it first,” said Grace at the Presidential Palace complex in Jakarta on Tuesday, September 3, 2024, after a meeting on Govtech with President Jokowi and staffers.\n\nDuring the meeting, Prabowo discussed improving relations between the two countries across various sectors, particularly in defense. The Emir of Qatar als

In [81]:
for resp in responses:
    print('Prompt:', repr(resp['prompt']))
    print('Response:', repr(resp['llm_response']))
    print('Evidence/knowledge:', repr(resp['evidence']))
    print('Processing time (seconds):', repr(resp['usage']['processing_time']), '\n')

Prompt: 'Is Gibran behind fufufafa?'
Response: 'No, Gibran is not behind the Fufufafa account.'
Evidence/knowledge: ' Before and during his time as Mayor of Solo, Tempo had exchanged messages with Gibran via the same phone number.\n\nDuring a visit to Solo, Central Java, on September 10, 2024, Gibran refused to comment on the Fufufafa account controversy. "I don\'t know, ask the account owner. Why ask me?" Several sources close to the Presidential Palace revealed that Gibran had repeatedly denied that the account was his.\n She emphasized the need to verify the account\'s ownership.\n\n“It\'s not certain yet. I don\'t know if it belongs to him or not. Make sure of it first,” said Grace at the Presidential Palace complex in Jakarta on Tuesday, September 3, 2024, after a meeting on Govtech with President Jokowi and staffers.\n\nDuring the meeting, Prabowo discussed improving relations between the two countries across various sectors, particularly in defense. The Emir of Qatar also expres

## TODO

References:

- https://github.com/llmware-ai/llmware/tree/main/examples/Embedding (Migrate database)
- https://github.com/llmware-ai/llmware/tree/main/examples/Use_Cases (REST API)
- https://github.com/llmware-ai/llmware/tree/main/examples/UI (Streamlit UI)