In [None]:
# !pip install sentence_transformers

Successfully installed sympy-1.13.1


In [None]:
!pip install tf-keras

In [5]:
import os
from dotenv import load_dotenv
# import openai
import chromadb
from sentence_transformers import SentenceTransformer  # For embeddings

In [6]:
# Load environment variables from .env file
load_dotenv(override=True)

# Retrieve the API key
cohere_key = os.getenv("COHERE_API_KEY")

In [7]:
import cohere
import qdrant_client
from qdrant_client import QdrantClient
from qdrant_client.models import Batch

# Initialize clients
cohere_client = cohere.ClientV2(cohere_key)
# qdrant_client = qdrant_client.QdrantClient(host="localhost", port=6333)

In [15]:
qdrant_client = QdrantClient(host="host.docker.internal", port=6333)

In [35]:
import os
from typing import List
from PyPDF2 import PdfReader
from concurrent.futures import ThreadPoolExecutor

def process_pdf_to_text(file_path: str, num_threads: int = 4) -> str:
    """
    Processes a PDF file and extracts its content as a single text string.

    Args:
        file_path: The path to the PDF file.
        num_threads: The number of threads to use for parallel page processing.

    Returns:
        The extracted text as a single string with no newlines. Returns an empty string on error.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file is not a PDF.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    if not file_path.lower().endswith(".pdf"):
        raise ValueError(f"File is not a PDF: {file_path}")

    try:
        with open(file_path, 'rb') as file:
            reader = PdfReader(file)
            num_pages = len(reader.pages)

            # Use ThreadPoolExecutor to process pages in parallel
            with ThreadPoolExecutor(max_workers=num_threads) as executor:
                page_texts = list(executor.map(extract_page_text, [(reader, i) for i in range(num_pages)]))

            return " ".join(page_texts)  # Join all pages into a single string without newlines

    except Exception as e:
        print(f"Error processing PDF {file_path}: {e}")  # More specific error
        return ""

def extract_page_text(args):
    """Helper function for parallel PDF page extraction."""
    reader, page_num = args
    try:
        page = reader.pages[page_num]
        return " ".join(page.extract_text().split())  # Remove newlines and extra spaces
    except Exception as e:
        print(f"Error processing page {page_num}: {e}")  # Error handling
        return ""

In [36]:
extracted_text = process_pdf_to_text("./legal.pdf")
print(f"Extracted Text:\n{extracted_text}")

Extracted Text:
Additional Application Information If we request additional information to process your application, including but not limited to income and additional identification documentation, please ensure that it is submitted to the Leasing Office within 3 calendar days of the request. If documentation is not received within 3 days, the application will be canceled. Please note that processing of your application may take up to 10 days. Hold Deposit The hold deposit will be applied to the balance due at move- in. If written cancellation is received within 3 days from the date of application or if we cannot approve your application, a refund of the hold deposit payment will be mailed within 20 business days. Required documents must be received within 3 days of the request, or your application will be canceled, and the hold deposit will be forfeited. Security Deposit Requirement The Refundable Security Deposit amount disclosed here is subject to change, pending final credit review

Define new extracted text

In [8]:
def get_text_from_file(file_path):
  """Reads all text content from a specified file path.

  Args:
    file_path: The full path to the text file.

  Returns:
    A string containing the entire text content of the file,
    or None if the file cannot be read.
  """
  extracted_text = None
  try:
    # Ensure the file path exists
    if os.path.exists(file_path):
      with open(file_path, 'r', encoding='utf-8') as f:
        extracted_text = f.read()
    else:
      print(f"Error: File not found at {file_path}")
  except Exception as e:
    print(f"An error occurred while reading the file: {e}")
  return extracted_text

# Example usage:
file_location = "/Users/ruhwang/Desktop/AI/spring2025_courses/aipi590-llms/auritas/case_studies.txt"
extracted_text_object = get_text_from_file(file_location)

# You can now use the extracted_text_object variable which holds the file content
if extracted_text_object is not None:
  print("File content loaded successfully.")
  # print(extracted_text_object) # Uncomment to print the content

File content loaded successfully.


In [10]:
extracted_text = extracted_text_object

In [11]:
def customize_chunking(text, chunk_size=300):
    list_of_chunks = []
    chunk = ""
    for i in range(0, len(text), chunk_size):
        chunk += text[i:i+chunk_size] + "\n"
        list_of_chunks.append(chunk)
    return list_of_chunks

In [12]:
list_chunks = customize_chunking(extracted_text)
print(f"Number of chunks: {len(list_chunks)}")

Number of chunks: 82


In [13]:
len(list_chunks[0])

301

In [14]:
print(list_chunks[0])

Nielsen Holdings PLC, an American information, data, and market measurement firm operating in over 100 countries with approximately 44,000 employees worldwide and assets over 3.5 Billion dollars, sought to implement and streamline SAP BTP Document Management. Founded in 1923, Nielsen looked to embra



In [16]:
# Generating the embeddings with Cohere client library
embeddings = cohere_client.embed(
    texts=list_chunks,
    model="embed-english-light-v3.0",
    input_type="search_document",
    embedding_types=["float"]
)

In [17]:
type(embeddings)

cohere.types.embed_by_type_response.EmbedByTypeResponse

In [19]:
def query_chunking(query):
    response = cohere_client.embed(
        texts=query,
        model="embed-english-light-v3.0",
        input_type="search_query",
        embedding_types=["float"]
    )
    return response # ["results"][0]["text"]

In [20]:
query = ['How do I do online payments?']
query_chunks = query_chunking(query)
query_embeddings = query_chunks.embeddings.float

In [21]:
# Extracting the embeddings
embedding_floats = embeddings.embeddings.float

https://docs.cohere.com/v2/reference/embed

### QDrant

In [22]:
qdrant_key = os.getenv("QDRANT_API_KEY")
qdrant_host = os.getenv("QDRANT_HOST")

In [23]:
qdrant_client = QdrantClient(
    url=qdrant_host,  # Your Qdrant Cloud URL
    # prefer_grpc=True,  # Use gRPC for better performance
    api_key=qdrant_key # Required for Qdrant Cloud authentication
)

# Test the connection by listing collections
# print(qdrant_client.get_collections())

In [24]:
print(qdrant_client.get_collections())

collections=[CollectionDescription(name='daves-collection'), CollectionDescription(name='new-collection'), CollectionDescription(name='daves-rag')]


In [None]:
# qdrant_client.delete_collection("new-collection")
# print("Collection 'daves-rag' deleted successfully.")

False

In [25]:
from qdrant_client.models import VectorParams, Distance
from qdrant_client import models

In [26]:
type(embedding_floats), type(embedding_floats[0]), type(embedding_floats[0][0])

(list, list, float)

In [60]:
qdrant_client.delete_collection("auritas")
# print("Collection 'daves-rag' deleted successfully.")

True

Create Collection and adding items

In [61]:
qdrant_client.create_collection(
    collection_name="auritas",
    vectors_config=VectorParams(size=384, distance=Distance.COSINE),  # Ensure 384 is correct
)

True

In [None]:
# qdrant_client.upsert(
#     collection_name="auritas",
#     points=models.Batch(
#         ids=[i for i in range(len(embedding_floats))],  # Unique IDs for each embedding
#         payloads=[{"text": f"Document {i}"} for i in range(len(embedding_floats))],  # Optional metadata
#         vectors=embedding_floats,  # Ensure embeddings are a List[List[float]]
#     ),
# )

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

Crucial

In [62]:
# Store both embeddings and the actual text chunks
qdrant_client.upsert(
    collection_name="auritas",
    points=models.Batch(
        ids=[i for i in range(len(embedding_floats))],  # Unique IDs for each embedding
        payloads=[{"text": list_chunks[i]} for i in range(len(embedding_floats))],  # Store actual text chunks
        vectors=embedding_floats,  # The embeddings
    ),
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

https://qdrant.tech/documentation/frameworks/langchain/#


https://qdrant.tech/documentation/guides/installation/

In [None]:
collection_info = qdrant_client.get_collection("new-collection")
print(collection_info)

In [None]:
collection_info = qdrant_client.get_collection("daves-rag")
print(collection_info)

In [29]:
collection_info = qdrant_client.get_collection("auritas")
print(collection_info)

status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=0 points_count=82 segments_count=2 config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=384, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None,

#### Without Manual input

In [30]:
# from qdrant_client import QdrantClient

# Initialize client
# qdrant_client = QdrantClient(host="localhost", port=6333)  # Use 'url=' if using Qdrant Cloud

# Define query embedding (Must match the vector size of the collection)

# Perform vector search
top_results = qdrant_client.search(
    collection_name="auritas",  # Your collection name
    query_vector=query_embeddings[0],
    limit=5  # Return the top 5 most similar documents
)

# Print results
for result in top_results:
    print(f"Document ID: {result.id}, Score: {result.score}, Payload: {result.payload}")

Document ID: 5, Score: 0.05813939, Payload: {'text': 'Document 5'}
Document ID: 6, Score: 0.05446569, Payload: {'text': 'Document 6'}
Document ID: 4, Score: 0.049721014, Payload: {'text': 'Document 4'}
Document ID: 7, Score: 0.04711134, Payload: {'text': 'Document 7'}
Document ID: 79, Score: 0.04009522, Payload: {'text': 'Document 79'}


  top_results = qdrant_client.search(


In [31]:
import numpy as np

In [32]:
qdrant_client.scroll(collection_name="auritas", limit=5)

([Record(id=0, payload={'text': 'Document 0'}, vector=None, shard_key=None, order_value=None),
  Record(id=1, payload={'text': 'Document 1'}, vector=None, shard_key=None, order_value=None),
  Record(id=2, payload={'text': 'Document 2'}, vector=None, shard_key=None, order_value=None),
  Record(id=3, payload={'text': 'Document 3'}, vector=None, shard_key=None, order_value=None),
  Record(id=4, payload={'text': 'Document 4'}, vector=None, shard_key=None, order_value=None)],
 5)

In [33]:
print(f"First vector shape: {len(embedding_floats[0])}")  # Should be 384
print(f"First vector: {embedding_floats[0][:5]}")  # Print first 5 values


First vector shape: 384
First vector: [0.012359619, -0.07476807, -0.07318115, -0.026184082, -0.05532837]


In [34]:
stored_vectors, _ = qdrant_client.scroll(collection_name="auritas", limit=5)

for vec in stored_vectors:
    if vec.vector is None:
        print(f"🚨 Missing vector for ID {vec.id}")
    else:
        print(f"✅ ID: {vec.id}, Vector: {vec.vector[:5]}")  # Print first 5 values

🚨 Missing vector for ID 0
🚨 Missing vector for ID 1
🚨 Missing vector for ID 2
🚨 Missing vector for ID 3
🚨 Missing vector for ID 4


In [None]:
# qdrant_client = QdrantClient(host="host.docker.internal", port=6333)

In [None]:
# !docker ps

CONTAINER ID   IMAGE           COMMAND             CREATED        STATUS             PORTS                              NAMES
0b78c463ca42   qdrant/qdrant   "./entrypoint.sh"   18 hours ago   Up About an hour   0.0.0.0:6333-6334->6333-6334/tcp   gifted_goldstine


In [35]:
type(embedding_floats[0])

list

In [36]:
retrieved_points = qdrant_client.scroll(collection_name="daves-rag")

In [37]:
for point in retrieved_points:
    # print(f"ID: {point.id}, Vector: {point.vector}")
    print(point)

[Record(id=0, payload={'label': 'sample_0'}, vector=None, shard_key=None, order_value=None), Record(id=1, payload={'label': 'sample_1'}, vector=None, shard_key=None, order_value=None), Record(id=2, payload={'label': 'sample_2'}, vector=None, shard_key=None, order_value=None), Record(id=3, payload={'label': 'sample_3'}, vector=None, shard_key=None, order_value=None), Record(id=4, payload={'label': 'sample_4'}, vector=None, shard_key=None, order_value=None), Record(id=5, payload={'label': 'sample_5'}, vector=None, shard_key=None, order_value=None), Record(id=6, payload={'label': 'sample_6'}, vector=None, shard_key=None, order_value=None), Record(id=7, payload={'label': 'sample_7'}, vector=None, shard_key=None, order_value=None), Record(id=8, payload={'label': 'sample_8'}, vector=None, shard_key=None, order_value=None), Record(id=9, payload={'label': 'sample_9'}, vector=None, shard_key=None, order_value=None)]
10


In [38]:
print(type(embedding_floats))  # Should be <class 'list'>
print(len(embedding_floats))   # Should match the number of vectors
print(type(embedding_floats[0]))  # Should be <class 'list'>
print(len(embedding_floats[0]))  # Should match vector dimension (e.g., 384)


<class 'list'>
82
<class 'list'>
384


In [44]:
stored_vectors, _ = qdrant_client.scroll(
    collection_name="auritas",
    with_vectors=True,
    with_payload=True,
) 

In [45]:
len(stored_vectors)

10

In [46]:
len(list_chunks)

82

In [39]:
stored_vectors, _ = qdrant_client.scroll(
    collection_name="auritas",
    with_vectors=True,
    with_payload=True,
    limit=5
) 

for vec in stored_vectors:
    print(f"ID: {vec.id}, Vector: {vec.vector}, Text: {vec.payload}")

ID: 0, Vector: [0.012355533, -0.07474335, -0.07315696, -0.026175426, -0.055310078, -0.052930493, -0.04679849, 0.12172488, -0.034198895, 0.0064637745, 0.078953385, -0.01258434, 0.042283382, -0.020775601, -0.05601175, 0.11598948, 0.019234972, -0.1266671, -0.0010019804, 0.050428882, 0.0185333, 0.019250225, 0.005106191, -0.051954255, 0.054486375, 0.0358463, 0.06687242, -0.030827818, 0.06220477, 0.00041566454, 0.006879439, -0.08523793, 0.07639075, 0.14509362, 0.11312177, -0.0022670878, 0.020455271, -0.02252978, 0.110315084, 0.009464948, -0.00016016432, 0.025214441, 0.016245238, -0.011074219, 0.015497805, 0.01694691, 0.07096042, 0.002766648, -0.03087358, -0.006242595, -0.025961874, 0.0945122, -0.034534477, -0.1169047, 0.04365622, -0.031880327, -0.01136404, -0.010746263, -0.028951608, -0.038683496, 0.011905548, -0.04423586, 0.08828867, -0.034107372, -0.08627518, -0.013240251, -0.049483147, 0.025199186, 0.015513059, -0.0073332377, -0.049757715, -0.09176653, -0.06833678, 0.01124201, 0.001857143

In [58]:
??query_chunking

[0;31mSignature:[0m [0mquery_chunking[0m[0;34m([0m[0mquery[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mSource:[0m   
[0;32mdef[0m [0mquery_chunking[0m[0;34m([0m[0mquery[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0mresponse[0m [0;34m=[0m [0mcohere_client[0m[0;34m.[0m[0membed[0m[0;34m([0m[0;34m[0m
[0;34m[0m        [0mtexts[0m[0;34m=[0m[0mquery[0m[0;34m,[0m[0;34m[0m
[0;34m[0m        [0mmodel[0m[0;34m=[0m[0;34m"embed-english-light-v3.0"[0m[0;34m,[0m[0;34m[0m
[0;34m[0m        [0minput_type[0m[0;34m=[0m[0;34m"search_query"[0m[0;34m,[0m[0;34m[0m
[0;34m[0m        [0membedding_types[0m[0;34m=[0m[0;34m[[0m[0;34m"float"[0m[0;34m][0m[0;34m[0m
[0;34m[0m    [0;34m)[0m[0;34m[0m
[0;34m[0m    [0;32mreturn[0m [0mresponse[0m [0;31m# ["results"][0]["text"][0m[0;34m[0m[0;34m[0m[0m
[0;31mFile:[0m      /var/folders/rt/0kf7v29569z7rctzmpst_dsh0000gn/T/ipyker

In [40]:
def retrieve_top_chunks(query:str, collection_name, chunks, n=5):
    # Fetch all stored points
    stored_points = qdrant_client.scroll(collection_name="daves-rag", with_vectors=True, limit=1000)[0]
    query_chunks = query_chunking([query])
    query_embeddings = query_chunks.embeddings.float
    
    # Extract embeddings & IDs
    chunk_embeddings = [point.vector for point in stored_points]
    stored_ids = [point.id for point in stored_points]
    
    def cosine_similarity(a, b):
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    # --- Compute Similarity Scores ---
    similarities = []
    for chunk_embedding in chunk_embeddings:
        subquery_scores = [cosine_similarity(query_embedding, chunk_embedding) for query_embedding in query_embeddings]
        similarities.append(np.mean(subquery_scores))  # Average similarity if multiple subqueries

    print("Similarity scores:", similarities)

    # --- Retrieve Top `n` Chunks ---
    top_indices = np.argsort(similarities)[::-1][:n]  # Sort and get top `n`

    # Retrieve top similar document chunks
    top_chunks_after_retrieval = [chunks[i] for i in top_indices]

    return top_chunks_after_retrieval

In [41]:
import pandas as pd

In [65]:
def inspect_qdrant_collection(collection_name="auritas", limit=20):
    try:
        # Get the first few points from the collection
        points = qdrant_client.scroll(
            collection_name=collection_name,
            limit=limit,
            with_payload=True
        )[0]

        print(f"Found {len(points)} points in collection {collection_name}")
        for i, point in enumerate(points):
            print(f"Point {i} payload: {point.payload}")

        return points
    except Exception as e:
        print(f"Error inspecting collection: {str(e)}")
        return []

In [42]:
def get_llm_output(top_chunks, ch, query):
    preamble = """
    ## Task & Context
    You give answers to user's questions with precision, based on chunked document string you receive.
    You should focus on serving the user's needs as best you can, which can be wide-ranging but always relevant to the document string.
    If you are not sure about the answer, you can ask for clarification or provide a general response saying you are not sure.
    
    ## Style Guide
    Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.
    """

    # retrieved documents
    documents = [
        {"data": {"title": f"chunk {i}", "snippet": top_chunks[i]}} for i in range(len(top_chunks))
    ]

    # get model response
    response = ch.chat(
        model="command-r-08-2024",
        messages=[{"role": "system", "content": preamble},
                  {"role": "user", "content": query}],
        documents=documents,  
        temperature=0.3
    )

    print("Final answer:")
    print(response.message.content[0].text)


In [59]:
list_chunks[0]

'Nielsen Holdings PLC, an American information, data, and market measurement firm operating in over 100 countries with approximately 44,000 employees worldwide and assets over 3.5 Billion dollars, sought to implement and streamline SAP BTP Document Management. Founded in 1923, Nielsen looked to embra\n'

In [43]:
query="How can auritas help with data migration?"
ch = cohere.ClientV2(cohere_key)

top_chunks = retrieve_top_chunks(query=query, 
                                 collection_name="auritas", 
                                 chunks=list_chunks,
                                 n=5)
print(get_llm_output(top_chunks, ch, query))

Similarity scores: [0.032323270489576124, 0.08858491040083244, 0.07804837155779201, 0.07615967539076995, 0.082133896019837, 0.07681949617836238, 0.06833293063347465, 0.07021145034228328, 0.07315295093838571, 0.07315295093838571, 0.07315295093838571, 0.07315295093838571, 0.07315295093838571, 0.07315295093838571, 0.07315295093838571, 0.07315295093838571, 0.07315295093838571]
Final answer:
Auritas can help with data migration by integrating a company's SAP systems with SAP's Business Technology Platform (BTP) using ArchiveLink and SAP Document Management System (DMS).

ArchiveLink is used to migrate attachments from the SAP database to the BTP environment by adjusting configurations for business objects, enabling seamless access to documents stored on BTP from within SAP.
None


In [53]:
!pip install 'qdrant-client[fastembed]' --quiet

In [55]:
qdrant_client.query(
    collection_name="auritas",
    query_text="How did auritas help with data migration?",
    limit=1
)

ImportError: fastembed is not installed. Please install it to enable fast vector indexing with `pip install fastembed`.

https://cloud.qdrant.io/accounts/5afd6788-b042-400e-82cd-baa221f504f0/clusters/b2e7691c-8588-485a-938b-cc899e360d62/overview

https://qdrant.tech/documentation/embeddings/cohere/

https://qdrant.tech/articles/qa-with-cohere-and-qdrant/

https://medium.com/@sanket.ai/building-a-streamlit-application-for-interactive-questioning-with-pdf-using-openai-and-langchain-dc82a0d8d68a



In [118]:
type(embeddings)

list

In [None]:
from langchain.schema import Document

# Create a list of Document objects
documents = [Document(page_content=chunk) for chunk in list_chunks]