This notebook follows the [LlamaIndex](https://docs.llamaindex.ai/en/stable/examples/low_level/oss_ingestion_retrieval/) tutorial, using only free and open-source packages that can be executed on Colab without any other registration.

## installation  

### Llamaindex

In [None]:
%pip install llama-index-readers-file pymupdf
%pip install llama-index-vector-stores-postgres
%pip install llama-index-embeddings-huggingface
%pip install llama-index-llms-llama-cpp
%pip install llama-index-llms-gemini
%pip install llama-index
%pip install 'google-generativeai==0.3.1'
## install openai if you want to use semantic embedding
# %pip install llama-index-embeddings-openai



### pgvector & postgresql
Open-source vector similarity search for Postgres<br/>
[Github](https://github.com/pgvector/pgvector) 

In [None]:
# Postgres client libraries: psycopg2 (sync) and asyncpg (async), plus
# SQLAlchemy with its asyncio extra and greenlet.
%pip install psycopg2-binary asyncpg "sqlalchemy[asyncio]" greenlet

In [None]:

!git clone https://github.com/pgvector/pgvector.git

!apt-get install -y postgresql-server-dev-14
!apt-get install -y make gcc

%cd pgvector
!make && make install


## Getting started

## parameters

In [None]:
import sys

# Decide where the JSON documents live, depending on the runtime.
in_colab = 'google.colab' in sys.modules
if in_colab:
    # On Colab the documents are read from Google Drive, so mount it first.
    from google.colab import drive
    drive.mount('/content/drive')

# Point folder_path at your own folder (Drive path on Colab, local path otherwise).
folder_path = (
    '/content/drive/MyDrive/your_json_folder'
    if in_colab
    else 'path/to/your/json/folder'
)

# Alternative for Colab: upload the files by hand instead of mounting Drive.
# if in_colab:
#     from google.colab import files
#     uploaded = files.upload()
#     # check if the folder exists
#     import os
#     os.makedirs(folder_path, exist_ok=True)
#     # save file
#     for filename, content in uploaded.items():
#         with open(os.path.join(folder_path, filename), 'wb') as f:
#             f.write(content)


In [None]:
import os

# Gemini API key. Read from the GEMINI_API_KEY environment variable when it is
# set, so the key does not have to be saved inside the notebook; otherwise
# replace the placeholder below.
GEMINI_API = os.environ.get("GEMINI_API_KEY", "<YOUR_GEMINI_API>")
# Embedding back-end handed to embed_model_factory: "Gemini" or "HuggingFace".
embedding_model_type = "Gemini"
# Chunk size passed to SentenceSplitter when splitting section text.
chunk_size = 256


### setup database

In [None]:
# Refresh the apt package index, then install the PostgreSQL server and its
# contrib modules.
!apt-get update
!apt-get install -y postgresql postgresql-contrib
# Start the PostgreSQL service. The exit status returned by os.system is not
# checked; it is only shown as the cell's output.
import os
os.system('service postgresql start')


### Sentence Transformer (for document embedding)

In [None]:
def embed_model_factory(model_source="HuggingFace", model_name=None, model_path=None, **kwargs):
    '''
    Build a LlamaIndex embedding model for the requested back-end.

    Args:
        model_source: "HuggingFace" or "Gemini".
        model_name: Model identifier. Defaults to "BAAI/bge-small-en" for
            HuggingFace and "models/embedding-001" for Gemini.
        model_path: Accepted for interface compatibility; currently unused.
        **kwargs: Only ``api_key`` is read, and only by the Gemini branch. When
            it is absent the notebook-level ``GEMINI_API`` value is used.
            Any other keyword is ignored.

    Returns:
        The constructed embedding model instance.

    Raises:
        ValueError: if ``model_source`` is not one of the supported back-ends.
    '''
    if model_source == "HuggingFace":
        # Imported lazily so only the selected back-end's package is required.
        from llama_index.embeddings.huggingface import HuggingFaceEmbedding
        if model_name is None:
            model_name = "BAAI/bge-small-en"
        embed_model = HuggingFaceEmbedding(model_name=model_name)
    elif model_source == "Gemini":
        from llama_index.embeddings.gemini import GeminiEmbedding
        if model_name is None:
            model_name = "models/embedding-001"
        # Look the global up only when no explicit key was supplied.
        api_key = kwargs["api_key"] if "api_key" in kwargs else GEMINI_API
        # Pass the resolved name through; a caller-supplied model_name used to
        # be silently replaced by the default.
        embed_model = GeminiEmbedding(api_key=api_key, model_name=model_name)
    else:
        raise ValueError(f"Unsupported embedding model source: {model_source}")
    return embed_model



In [None]:
def llm_model_factory(model_source="OpenAI", model_name=None, **kwargs):
    """
    Build a LlamaIndex LLM for the requested provider.

    Args:
        model_source: "OpenAI", "Anthropic" or "Gemini".
        model_name: Provider model identifier. Defaults to "gpt-3.5-turbo" for
            OpenAI and "claude-v1" for Anthropic; for Gemini the library's own
            default is used when no name is given.
        **kwargs: Extra keyword arguments forwarded to the LLM constructor. An
            ``api_key`` entry, when present, takes precedence over the
            notebook globals / environment variables described below.

    Returns:
        The constructed LLM instance.

    Raises:
        ValueError: if ``model_source`` is not supported.
    """
    import os

    def _resolve_api_key(*names):
        """Return the explicit ``api_key`` kwarg, else the first of ``names``
        found as a notebook global or an environment variable, else None."""
        if "api_key" in kwargs:
            return kwargs.pop("api_key")
        for name in names:
            value = globals().get(name) or os.environ.get(name)
            if value:
                return value
        return None

    if model_source == "OpenAI":
        # Namespaced integration package (llama-index-llms-openai).
        from llama_index.llms.openai import OpenAI
        if model_name is None:
            model_name = "gpt-3.5-turbo"
        api_key = _resolve_api_key("OPENAI_API_KEY")
        llm = OpenAI(model=model_name, api_key=api_key, **kwargs)
    elif model_source == "Anthropic":
        # Needs the llama-index-llms-anthropic package to be installed.
        from llama_index.llms.anthropic import Anthropic
        if model_name is None:
            model_name = "claude-v1"
        api_key = _resolve_api_key("ANTHROPIC_API_KEY")
        llm = Anthropic(model=model_name, api_key=api_key, **kwargs)
    elif model_source == "Gemini":
        from llama_index.llms.gemini import Gemini
        api_key = _resolve_api_key("GEMINI_API", "GEMINI_API_KEY", "GOOGLE_API_KEY")
        if model_name is not None:
            # NOTE(review): the keyword naming the model differs between
            # releases of llama-index-llms-gemini (`model` vs `model_name`) —
            # confirm against the installed version.
            kwargs.setdefault("model", model_name)
        llm = Gemini(api_key=api_key, **kwargs)
    else:
        raise ValueError(f"Unsupported model source: {model_source}")
    return llm

## Other adjustments
[Order of evidence](https://arxiv.org/pdf/2305.13300)

# Get started

Reference [llamaindex gemini](https://docs.llamaindex.ai/en/stable/examples/llm/gemini/)

In [None]:
# import
from llama_index.llms.gemini import Gemini
from llama_index.core.llms import ChatMessage

# One client configured with the notebook's API key, shared by both checks.
# The chat check previously built `Gemini()` without the key, unlike the
# completion check.
gemini_llm = Gemini(api_key=GEMINI_API)

# Test complete
response = gemini_llm.complete("Hello world!")
print(response)

# Test chat
messages = [
    ChatMessage(role="user", content="Hello friend!"),
    ChatMessage(role="assistant", content="Yarr what is shakin' matey?"),
    ChatMessage(
        role="user", content="Help me decide what to have for dinner."
    ),
]
resp = gemini_llm.chat(messages)
print(resp)

In [None]:
import os

pg_hba_path = "/etc/postgresql/14/main/pg_hba.conf"

# Rules whose authentication method is switched to `trust`, keyed by the
# prefix that identifies the existing line.
trust_rules = {
    "local   all             postgres":
        "local   all             postgres                                trust\n",
    "host    all             all             127.0.0.1/32":
        "host    all             all             127.0.0.1/32            trust\n",
    "host    all             all             ::1/128":
        "host    all             all             ::1/128                 trust\n",
}
# Extra rule granting the `jerry` role access from any IPv4 address.
jerry_rule = "host    all             jerry            0.0.0.0/0               trust\n"

with open(pg_hba_path, "r") as file:
    lines = file.readlines()

# True when an earlier run already appended the rule. Comparing the
# whitespace-split fields keeps re-runs from adding duplicate entries.
jerry_host_added = any(line.split() == jerry_rule.split() for line in lines)

### be careful when you adjust permission on your own computer
# adjust the permission for database
new_lines = []
for line in lines:
    for prefix, replacement in trust_rules.items():
        if line.startswith(prefix):
            new_lines.append(replacement)
            break
    else:
        new_lines.append(line)

if not jerry_host_added:
    # Make sure the appended rule starts on its own line.
    if new_lines and not new_lines[-1].endswith("\n"):
        new_lines[-1] += "\n"
    new_lines.append(jerry_rule)
    jerry_host_added = True

# The full content is computed before the file is opened for writing, so an
# error above cannot leave a truncated pg_hba.conf behind.
with open(pg_hba_path, "w") as file:
    file.writelines(new_lines)

In [None]:
# Restart the PostgreSQL service — presumably so the pg_hba.conf edits made
# above are picked up. The exit status from os.system is not checked.
os.system('service postgresql restart')

In [None]:
import psycopg2

db_name = "vector_db"
host = "localhost"
password = "password"
port = "5432"
user = "jerry"


def _connect_as_notebook_user():
    """Connect to the maintenance database `postgres` as the notebook's role."""
    return psycopg2.connect(
        dbname="postgres",
        host=host,
        password=password,
        port=port,
        user=user,
    )


try:
    conn = _connect_as_notebook_user()
except psycopg2.OperationalError:
    # A fresh PostgreSQL install only has the `postgres` role, so the first
    # attempt fails until `user` exists. Create it through the superuser
    # account (this relies on the `trust` rules written to pg_hba.conf), then
    # connect again.
    admin_conn = psycopg2.connect(dbname="postgres", host=host, port=port, user="postgres")
    admin_conn.autocommit = True
    try:
        with admin_conn.cursor() as c:
            c.execute("SELECT 1 FROM pg_roles WHERE rolname = %s", (user,))
            if c.fetchone() is None:
                # SUPERUSER lets the role create the database below.
                c.execute(
                    f'CREATE ROLE "{user}" WITH LOGIN SUPERUSER PASSWORD %s',
                    (password,),
                )
    finally:
        admin_conn.close()
    conn = _connect_as_notebook_user()

# DROP/CREATE DATABASE cannot run inside a transaction block.
conn.autocommit = True

# Start from an empty database on every run.
with conn.cursor() as c:
    c.execute(f"DROP DATABASE IF EXISTS {db_name}")
    c.execute(f"CREATE DATABASE {db_name}")

In [None]:
from sqlalchemy import make_url
from llama_index.vector_stores.postgres import PGVectorStore

# Output dimensionality of the default model for each embedding source:
# BAAI/bge-small-en yields 384-d vectors, Gemini models/embedding-001 yields
# 768-d vectors. The table's vector column must match the embeddings that get
# inserted, so derive it from the selected source instead of hard-coding 384.
embed_dim_by_source = {"HuggingFace": 384, "Gemini": 768}

vector_store = PGVectorStore.from_params(
    database=db_name,
    host=host,
    password=password,
    port=port,
    user=user,
    table_name="lawGPT",
    embed_dim=embed_dim_by_source[embedding_model_type],
)

## Chunking

In [None]:
import os
import json
from llama_index.core.schema import TextNode
from llama_index.core.node_parser import SentenceSplitter

text_parser = SentenceSplitter(chunk_size=chunk_size)
embed_model = embed_model_factory(model_source=embedding_model_type)


# Nodes collected from every JSON document in folder_path.
nodes = []

# Walk the JSON documents in sorted order so the node order is reproducible
# (os.listdir returns entries in arbitrary order).
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.json'):
        continue
    file_path = os.path.join(folder_path, filename)

    # Read as UTF-8 explicitly instead of depending on the platform's default
    # encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Each document carries a list of sections with `section`, `content` and
    # `link` keys.
    for section_data in data['sections']:
        section = section_data['section']
        content = section_data['content']
        link = section_data['link']

        # Split the section text into chunks.
        text_chunks = text_parser.split_text(content)
        if not text_chunks:
            continue

        # Embed all chunks of the section in one batched call rather than one
        # call per chunk.
        embeddings = embed_model.get_text_embedding_batch(text_chunks)

        # One node per chunk, carrying its embedding and provenance metadata.
        for text_chunk, embedding in zip(text_chunks, embeddings):
            nodes.append(
                TextNode(
                    text=text_chunk,
                    embedding=embedding,
                    metadata={
                        "section": section,
                        "link": link,
                        "filename": filename,
                    },
                )
            )




In [None]:
# LLM used by the query engine to write the final answer.
# It is built directly with the notebook's Gemini key: calling
# llm_model_factory() with no arguments selects the OpenAI branch, which needs
# an OPENAI_API_KEY that this notebook never defines.
from llama_index.llms.gemini import Gemini

llm = Gemini(api_key=GEMINI_API)

In [None]:
from llama_index.core import StorageContext, VectorStoreIndex

# Attach the Postgres vector store through a StorageContext. Passing
# `vector_store=` straight to VectorStoreIndex is not a recognised parameter,
# so the nodes would not be written to Postgres.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
    embed_model=embed_model,
)

# Create the query engine
query_engine = index.as_query_engine(llm=llm)

# Example query
response = query_engine.query("What type of visa can I have if I just got my german university degree? How long is it?")
print(response)