<a href="https://colab.research.google.com/github/DylanCTY/TextAnalytics_LearningSpace/blob/main/IB9CW0_5504008_pending.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Chunking

In [3]:
!pip install transformers




In [7]:
import re
import os
import uuid
from transformers import AutoTokenizer

def document_chunker(file_path,
                     model_name,
                     paragraph_separator='\n\n',
                     chunk_size=256,
                     separator=' ',
                     secondary_chunking_regex=r'\S+?[\.,;!?]',
                     chunk_overlap=0):

    tokenizer = AutoTokenizer.from_pretrained(model_name)  # Load tokenizer for the specified model
    documents = {}  # Initialize dictionary to store results

    base = os.path.basename(file_path)
    sku = os.path.splitext(base)[0]
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Generate a unique identifier for the document
        doc_id = str(uuid.uuid4())

        # Process the file using the existing chunking logic
        paragraphs = re.split(paragraph_separator, text)
        all_chunks = {}
        for paragraph in paragraphs:
            words = paragraph.split(separator)
            current_chunk = ""
            chunks = []

            for word in words:
                new_chunk = current_chunk + (separator if current_chunk else '') + word
                if len(tokenizer.tokenize(new_chunk)) <= chunk_size:
                    current_chunk = new_chunk
                else:
                    if current_chunk:
                        chunks.append(current_chunk)
                    current_chunk = word

            if current_chunk:
                chunks.append(current_chunk)

            refined_chunks = []
            for chunk in chunks:
                if len(tokenizer.tokenize(chunk)) > chunk_size:
                    sub_chunks = re.split(secondary_chunking_regex, chunk)
                    sub_chunk_accum = ""
                    for sub_chunk in sub_chunks:
                        if sub_chunk_accum and len(tokenizer.tokenize(sub_chunk_accum + sub_chunk + ' ')) > chunk_size:
                            refined_chunks.append(sub_chunk_accum.strip())
                            sub_chunk_accum = sub_chunk
                        else:
                            sub_chunk_accum += (sub_chunk + ' ')
                    if sub_chunk_accum:
                        refined_chunks.append(sub_chunk_accum.strip())
                else:
                    refined_chunks.append(chunk)

            final_chunks = []
            if chunk_overlap > 0 and len(refined_chunks) > 1:
                for i in range(len(refined_chunks) - 1):
                    final_chunks.append(refined_chunks[i])
                    overlap_start = max(0, len(refined_chunks[i]) - chunk_overlap)
                    overlap_end = min(chunk_overlap, len(refined_chunks[i+1]))
                    overlap_chunk = refined_chunks[i][overlap_start:] + ' ' + refined_chunks[i+1][:overlap_end]
                    final_chunks.append(overlap_chunk)
                final_chunks.append(refined_chunks[-1])
            else:
                final_chunks = refined_chunks

            # Assign a UUID for each chunk and structure it with text and metadata
            for chunk in final_chunks:
                chunk_id = str(uuid.uuid4())
                all_chunks[chunk_id] = {"text": chunk, "metadata": {"file_name":sku}}  # Initialize metadata as dict

        # Map the document UUID to its chunk dictionary
        documents[doc_id] = all_chunks

    return documents

# Assuming the file is uploaded in Colab and the path is specified as '/content/GOTbook.txt'
file_path = '/content/1_A_Game_of_Thrones.txt'
model_name = 'BAAI/bge-small-en-v1.5'  # Replace with your specific model if different

# Call the function
docs = document_chunker(file_path, model_name, chunk_size=256)

# Print the length and the first chunk of the document
keys = list(docs.keys())
print(len(docs))
print(docs[keys[0]])


0


IndexError: list index out of range

In [None]:
from transformers import AutoModel, AutoTokenizer
import torch

model_name = 'BAAI/bge-small-en-v1.5'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

tokenizer.save_pretrained("model/tokenizer")
model.save_pretrained("model/embedding")

In [None]:
def compute_embeddings(text):
    tokenizer = AutoTokenizer.from_pretrained("model/tokenizer")
    model = AutoModel.from_pretrained("model/embedding")

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Generate the embeddings
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1).squeeze()

    return embeddings.tolist()

In [None]:
def create_vector_store(doc_store):
    vector_store = {}
    for doc_id, chunks in doc_store.items():
        doc_vectors = {}
        for chunk_id, chunk_dict in chunks.items():
            # Generate an embedding for each chunk of text
            doc_vectors[chunk_id] = compute_embeddings(chunk_dict.get("text"))
        # Store the document's chunk embeddings mapped by their chunk UUIDs
        vector_store[doc_id] = doc_vectors
    return vector_store


In [None]:
vec_store = create_vector_store(docs)

In [None]:
def compute_matches(vector_store, query_str, top_k):
    """
    This function takes in a vector store dictionary, a query string, and an int 'top_k'.
    It computes embeddings for the query string and then calculates the cosine similarity against every chunk embedding in the dictionary.
    The top_k matches are returned based on the highest similarity scores.
    """
    # Get the embedding for the query string
    query_str_embedding = np.array(compute_embeddings(query_str))
    scores = {}

    # Calculate the cosine similarity between the query embedding and each chunk's embedding
    for doc_id, chunks in vector_store.items():
        for chunk_id, chunk_embedding in chunks.items():
            chunk_embedding_array = np.array(chunk_embedding)
            # Normalize embeddings to unit vectors for cosine similarity calculation
            norm_query = np.linalg.norm(query_str_embedding)
            norm_chunk = np.linalg.norm(chunk_embedding_array)
            if norm_query == 0 or norm_chunk == 0:
                # Avoid division by zero
                score = 0
            else:
                score = np.dot(chunk_embedding_array, query_str_embedding) / (norm_query * norm_chunk)

            # Store the score along with a reference to both the document and the chunk
            scores[(doc_id, chunk_id)] = score

    # Sort scores and return the top_k results
    sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:top_k]
    top_results = [(doc_id, chunk_id, score) for ((doc_id, chunk_id), score) in sorted_scores]

    return top_results

In [None]:
import numpy as np
# Example usage
matches = compute_matches(vector_store=vec_store, query_str="Who is Jon Snow's mother?", top_k=3)

# Print the top matches
for match in matches:
    print(f"Document ID: {match[0]}, Chunk ID: {match[1]}, Similarity Score: {match[2]}")

In [None]:
# plug the top match document ID keys into doc_store to access the retrieved content
docs['3dbd9ba7-84d3-4124-9707-2e72438a50b6']['bb281824-01f8-47d2-979f-aad1f917efc3']


In [None]:
# Ensure you have the necessary packages installed
!pip install transformers sentence-transformers

from transformers import AutoModel, AutoTokenizer, GPT2Tokenizer, GPT2LMHeadModel
import torch
import numpy as np
import sys
# Use GPT-2 for streaming responses
def stream_and_buffer(base_prompt, llm, max_tokens=800, stop=None, echo=True, stream=True):
    # Formatting the base prompt
    formatted_prompt = f"Q: {base_prompt} A: "

    inputs = llm.tokenizer.encode(formatted_prompt, return_tensors='pt')
    outputs = llm.model.generate(inputs, max_length=max_tokens, pad_token_id=llm.tokenizer.eos_token_id, stop=stop)

    response = llm.tokenizer.decode(outputs[0], skip_special_tokens=True)
    buffer = response[len(formatted_prompt):]  # Remove the prompt from the response

    sys.stdout.write(buffer)
    sys.stdout.flush()

def construct_prompt(system_prompt, retrieved_docs, user_query):
    prompt = f"""{system_prompt}

Here is the retrieved context:
{retrieved_docs}

Here is the user's query:
{user_query}
"""
    return prompt

# Usage example
system_prompt = """
You are a knowledgeable historian of Westeros. You will be provided with some context from the texts of Game of Thrones, as well as the user's query.

Your job is to understand the request, and answer based on the provided context.
"""

retrieved_docs = """
Jon Snow is the son of Lyanna Stark and Rhaegar Targaryen. He was raised as the illegitimate son of Eddard Stark in Winterfell.
"""

prompt = construct_prompt(system_prompt=system_prompt,
                          retrieved_docs=retrieved_docs,
                          user_query="Who are the parents of John Snow?")

# Initialize the GPT-2 model and tokenizer
class LLMWrapper:
    def __init__(self, model_name):
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)

llm = LLMWrapper(model_name="gpt2")

# Stream and buffer the response
stream_and_buffer(prompt, llm)

In [None]:
# Install Ollama v0.1.30
!curl https://ollama.ai/install.sh | sed 's#https://ollama.ai/download#https://github.com/jmorganca/ollama/releases/download/v0.1.30#' | sh
# Setup the model as a global variable
OLLAMA_MODEL='phi:latest'

# Add the model to the environment of the operating system
import os
os.environ['OLLAMA_MODEL'] = OLLAMA_MODEL
!echo $OLLAMA_MODEL # print the global variable to check it saved

import subprocess
import time

# Start ollama on the server ("serve")
command = "nohup ollama serve&" # "nohup" and "&" means run in the background

# Use subprocess.Popen to run the command
process = subprocess.Popen(command,
                            shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)

print("Process ID:", process.pid) # print the process ID
time.sleep(5)  # Makes Python wait for 5 seconds

!ollama -v # print the Ollama version number as a check


In [None]:
!pip install llama-index-embeddings-huggingface
!pip install llama-index-llms-ollama
!pip install llama-index-vector-stores-chroma
!pip install llama-index ipywidgets
!pip install llama-index-llms-huggingface
!pip install chromadb


In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.core import StorageContext
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
import sys

# Initialize the Ollama LLM
llm = Ollama(model=OLLAMA_MODEL, request_timeout=240.0)

def stream_and_buffer(base_prompt, llm, max_tokens=800, stop=None, echo=True, stream=True):
    # Formatting the base prompt
    formatted_prompt = f"Q: {base_prompt} A: "

    # Assuming `llm` has a method to run the model and get responses
    response = llm.run(formatted_prompt, max_tokens=max_tokens)

    buffer = response[len(formatted_prompt):]  # Remove the prompt from the response

    sys.stdout.write(buffer)
    sys.stdout.flush()

def construct_prompt(system_prompt, retrieved_docs, user_query):
    prompt = f"""{system_prompt}

Here is the retrieved context:
{retrieved_docs}

Here is the user's query:
{user_query}
"""
    return prompt

# Usage example
system_prompt = """
You are a knowledgeable historian of Westeros. You will be provided with some context from the texts of Game of Thrones, as well as the user's query.

Your job is to understand the request, and answer based on the provided context.
"""

retrieved_docs = """
Jon Snow is the son of Lyanna Stark and Rhaegar Targaryen. He was raised as the illegitimate son of Eddard Stark in Winterfell.
"""

prompt = construct_prompt(system_prompt=system_prompt,
                          retrieved_docs=retrieved_docs,
                          user_query="Who are the parents of Jon Snow?")

# Stream and buffer the response
stream_and_buffer(prompt, llm)
