# Homework 1

## Question 1. Run the Code and Reproduce the Results

In [29]:
from datasets import load_dataset

In [30]:
# Load the Databricks Dolly 15k dataset
dolly_dataset = load_dataset("databricks/databricks-dolly-15k")

In [31]:
# View dataset information
print(dolly_dataset)
print(dolly_dataset['train'][0]) # View the first data entry in the training set

DatasetDict({
    train: Dataset({
        features: ['instruction', 'context', 'response', 'category'],
        num_rows: 15011
    })
})
{'instruction': 'When did Virgin Australia start operating?', 'context': "Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.", 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'}


In [32]:
import chromadb

In [33]:
from sentence_transformers import SentenceTransformer

In [34]:
# 1. Select an Embedding Model
embedding_model_name = "all-mpnet-base-v2"
embedding_model = SentenceTransformer(embedding_model_name)

In [35]:
# 2. Create a ChromaDB Client and Collection
chroma_client = chromadb.Client()
collection_name = "dolly_instructions"
dolly_collection = chroma_client.get_or_create_collection(collection_name)

In [36]:
# 3. Generate Text Embeddings and Add to ChromaDB
instructions = dolly_dataset['train']['instruction'] # Get all instructions
instruction_embeddings = embedding_model.encode(instructions) # Generate embedding vectors

In [37]:
# Add data to the ChromaDB collection (ids are string-type indices, documents are original texts, embeddings are vectors)
dolly_collection.add(
    ids=[str(i) for i in range(len(instructions))], # Use indices as IDs
    embeddings=instruction_embeddings.tolist(), # ChromaDB requires list of floats
    documents=instructions # Original instruction texts
)

print(f"ChromaDB collection '{collection_name}' has added {dolly_collection.count()} data entries.")

Insert of existing embedding ID: 0
Insert of existing embedding ID: 1
Insert of existing embedding ID: 2
Insert of existing embedding ID: 3
Insert of existing embedding ID: 4
Insert of existing embedding ID: 5
Insert of existing embedding ID: 6
Insert of existing embedding ID: 7
Insert of existing embedding ID: 8
Insert of existing embedding ID: 9
Insert of existing embedding ID: 10
Insert of existing embedding ID: 11
Insert of existing embedding ID: 12
Insert of existing embedding ID: 13
Insert of existing embedding ID: 14
Insert of existing embedding ID: 15
Insert of existing embedding ID: 16
Insert of existing embedding ID: 17
Insert of existing embedding ID: 18
Insert of existing embedding ID: 19
Insert of existing embedding ID: 20
Insert of existing embedding ID: 21
Insert of existing embedding ID: 22
Insert of existing embedding ID: 23
Insert of existing embedding ID: 24
Insert of existing embedding ID: 25
Insert of existing embedding ID: 26
Insert of existing embedding ID: 27
In

In [38]:
def search_instructions(query, collection, embedding_model, n_results=5):
    """
    Retrieves instructions related to the query using a ChromaDB vector database.

    Args:
        query (str): User query statement
        collection (chromadb.Collection): ChromaDB collection
        embedding_model (SentenceTransformer): Sentence Transformers model
        n_results (int): Number of results to return, default is 5

    Returns:
        list: List of retrieved results, containing the instruction texts most relevant to the query.
    """
    query_embedding = embedding_model.encode(query) # Encode the query
    results = collection.query(
        query_embeddings=[query_embedding.tolist()], # ChromaDB requires list of floats
        n_results=n_results
    )
    return results['documents'][0] # Return the list of retrieved documents

# Test the retrieval function
query_example = "How to write a poem?"
retrieved_instructions = search_instructions(query_example, dolly_collection, embedding_model)

print(f"Query: '{query_example}'")
print("Retrieved Related Instructions:")
for instruction in retrieved_instructions:
    print(f"- {instruction}")

Query: 'How to write a poem?'
Retrieved Related Instructions:
- Make a poem about color and style
- Write a poem about a raven
- What are the main characteristics of a Haiku poem?
- write a poem about the holidays
- Write a short poem about spring


## Question 2. Try Different Query Statements

In [39]:
def search_instructions(query, collection, embedding_model, n_results=5):
    """
    Retrieves instructions related to the query using a ChromaDB vector database.

    Args:
        query (str): User query statement
        collection (chromadb.Collection): ChromaDB collection
        embedding_model (SentenceTransformer): Sentence Transformers model
        n_results (int): Number of results to return, default is 5

    Returns:
        list: List of retrieved results, containing the instruction texts most relevant to the query.
    """
    query_embedding = embedding_model.encode(query) # Encode the query
    results = collection.query(
        query_embeddings=[query_embedding.tolist()], # ChromaDB requires list of floats
        n_results=n_results
    )
    return results['documents'][0] # Return the list of retrieved documents

# Test the retrieval function
query_example = "explain quantum physics"
retrieved_instructions = search_instructions(query_example, dolly_collection, embedding_model)

print(f"Query: '{query_example}'")
print("Retrieved Related Instructions:")
for instruction in retrieved_instructions:
    print(f"- {instruction}")

Query: 'explain quantum physics'
Retrieved Related Instructions:
- What is quantum mechanics?
- what is quantum computer?
- What is a quantum computer?
- What are the reasons why quantum theory is considered do difficult to understand
- What is a quark?


In [40]:
def search_instructions(query, collection, embedding_model, n_results=5):
    """
    Retrieves instructions related to the query using a ChromaDB vector database.

    Args:
        query (str): User query statement
        collection (chromadb.Collection): ChromaDB collection
        embedding_model (SentenceTransformer): Sentence Transformers model
        n_results (int): Number of results to return, default is 5

    Returns:
        list: List of retrieved results, containing the instruction texts most relevant to the query.
    """
    query_embedding = embedding_model.encode(query) # Encode the query
    results = collection.query(
        query_embeddings=[query_embedding.tolist()], # ChromaDB requires list of floats
        n_results=n_results
    )
    return results['documents'][0] # Return the list of retrieved documents

# Test the retrieval function
query_example = "translate English to French"
retrieved_instructions = search_instructions(query_example, dolly_collection, embedding_model)

print(f"Query: '{query_example}'")
print("Retrieved Related Instructions:")
for instruction in retrieved_instructions:
    print(f"- {instruction}")

Query: 'translate English to French'
Retrieved Related Instructions:
- Which phrases are French and which words are English: mon frère, c'est la vie, good morning, what's up, break the ice, bite the bullet, and je suis désolé.
- Please provide a brief summary of Le Souvenir français.
- Where was Old French language spoken?
- Quel a été l'impact de la révolution française ?
- Give me a good french wine with cheese


In [41]:
def search_instructions(query, collection, embedding_model, n_results=5):
    """
    Retrieves instructions related to the query using a ChromaDB vector database.

    Args:
        query (str): User query statement
        collection (chromadb.Collection): ChromaDB collection
        embedding_model (SentenceTransformer): Sentence Transformers model
        n_results (int): Number of results to return, default is 5

    Returns:
        list: List of retrieved results, containing the instruction texts most relevant to the query.
    """
    query_embedding = embedding_model.encode(query) # Encode the query
    results = collection.query(
        query_embeddings=[query_embedding.tolist()], # ChromaDB requires list of floats
        n_results=n_results
    )
    return results['documents'][0] # Return the list of retrieved documents

# Test the retrieval function
query_example = "write a short story about a cat"
retrieved_instructions = search_instructions(query_example, dolly_collection, embedding_model)

print(f"Query: '{query_example}'")
print("Retrieved Related Instructions:")
for instruction in retrieved_instructions:
    print(f"- {instruction}")

Query: 'write a short story about a cat'
Retrieved Related Instructions:
- Write a short story about a cat named Bistro that is full of adventure.
- Write a poem about a cat owner who gets transported to a parallel universe where highly advanced cats keep humans as pets.
- Write a story about a cat named Rory and the adventures he gets up to while his owner, Maria, is gone at work. Write the story in the form of an hourly journal from 7am to 7 pm written by Rory, and convey that Rory loves to sleep while Maria is gone.
- Write a haiku about cats
- Write a haiku about a naughty cat.


## Question 3. Analyze the Advantages of Vector Databases

1. vector database for information retrival are based on meaning or context, providing a larger, more flexible and more comprehensive set of result, while exact keyword search is more accurate but more limited and with more strict demand of the data format;
2. convert documents into vectors; calculate similarity score; order by similarity and output the most similar one(s).

## Question 4. Impact of Model Selection

### all-MiniLM-L6-v2

In [67]:
# 1. Select an Embedding Model
embedding_model_name2 = "all-MiniLM-L6-v2"
embedding_model2 = SentenceTransformer(embedding_model_name2)

In [68]:
# 2. Create a ChromaDB Client and Collection
chroma_client2 = chromadb.Client()
collection_name2 = "dolly_instructions2"
dolly_collection2 = chroma_client2.get_or_create_collection(collection_name2)

In [65]:
# 3. Generate Text Embeddings and Add to ChromaDB
instructions2 = dolly_dataset['train']['instruction'] # Get all instructions
instruction_embeddings2 = embedding_model2.encode(instructions2) # Generate embedding vectors

In [69]:
# Add data to the ChromaDB collection (ids are string-type indices, documents are original texts, embeddings are vectors)
dolly_collection2.add(
    ids=[str(i) for i in range(len(instructions2))], # Use indices as IDs
    embeddings=instruction_embeddings2.tolist(), # ChromaDB requires list of floats
    documents=instructions2 # Original instruction texts
)

print(f"ChromaDB collection '{collection_name2}' has added {dolly_collection2.count()} data entries.")

Add of existing embedding ID: 0
Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Add of existing embedding ID: 4
Add of existing embedding ID: 5
Add of existing embedding ID: 6
Add of existing embedding ID: 7
Add of existing embedding ID: 8
Add of existing embedding ID: 9
Add of existing embedding ID: 10
Add of existing embedding ID: 11
Add of existing embedding ID: 12
Add of existing embedding ID: 13
Add of existing embedding ID: 14
Add of existing embedding ID: 15
Add of existing embedding ID: 16
Add of existing embedding ID: 17
Add of existing embedding ID: 18
Add of existing embedding ID: 19
Add of existing embedding ID: 20
Add of existing embedding ID: 21
Add of existing embedding ID: 22
Add of existing embedding ID: 23
Add of existing embedding ID: 24
Add of existing embedding ID: 25
Add of existing embedding ID: 26
Add of existing embedding ID: 27
Add of existing embedding ID: 28
Add of existing embedding ID: 29
Add of existing embe

In [75]:
def search_instructions(query, collection, embedding_model, n_results=5):
    """
    Retrieves instructions related to the query using a ChromaDB vector database.

    Args:
        query (str): User query statement
        collection (chromadb.Collection): ChromaDB collection
        embedding_model (SentenceTransformer): Sentence Transformers model
        n_results (int): Number of results to return, default is 5

    Returns:
        list: List of retrieved results, containing the instruction texts most relevant to the query.
    """
    query_embedding = embedding_model.encode(query) # Encode the query
    results = collection.query(
        query_embeddings=[query_embedding.tolist()], # ChromaDB requires list of floats
        n_results=n_results
    )
    return results['documents'][0] # Return the list of retrieved documents

# Test the retrieval function
query_example = "How to write a poem?"
retrieved_instructions = search_instructions(query_example, dolly_collection2, embedding_model2)

print(f"Query: '{query_example}'")
print("Retrieved Related Instructions:")
for instruction in retrieved_instructions:
    print(f"- {instruction}")

Query: 'How to write a poem?'
Retrieved Related Instructions:
- Make a poem about color and style
- What are the main characteristics of a Haiku poem?
- Write a short poem about spring
- Write a poem about lost friendship
- Write a poem about a raven


In [77]:
def search_instructions(query, collection, embedding_model, n_results=5):
    """
    Retrieves instructions related to the query using a ChromaDB vector database.

    Args:
        query (str): User query statement
        collection (chromadb.Collection): ChromaDB collection
        embedding_model (SentenceTransformer): Sentence Transformers model
        n_results (int): Number of results to return, default is 5

    Returns:
        list: List of retrieved results, containing the instruction texts most relevant to the query.
    """
    query_embedding = embedding_model.encode(query) # Encode the query
    results = collection.query(
        query_embeddings=[query_embedding.tolist()], # ChromaDB requires list of floats
        n_results=n_results
    )
    return results['documents'][0] # Return the list of retrieved documents

# Test the retrieval function
query_example = "explain quantum physics?"
retrieved_instructions = search_instructions(query_example, dolly_collection2, embedding_model2)

print(f"Query: '{query_example}'")
print("Retrieved Related Instructions:")
for instruction in retrieved_instructions:
    print(f"- {instruction}")

Query: 'explain quantum physics?'
Retrieved Related Instructions:
- What is quantum mechanics?
- What are the reasons why quantum theory is considered do difficult to understand
- what is quantum computer?
- What is a quantum computer?
- What are the examples of  quantum tunneling processes?


In [78]:
def search_instructions(query, collection, embedding_model, n_results=5):
    """
    Retrieves instructions related to the query using a ChromaDB vector database.

    Args:
        query (str): User query statement
        collection (chromadb.Collection): ChromaDB collection
        embedding_model (SentenceTransformer): Sentence Transformers model
        n_results (int): Number of results to return, default is 5

    Returns:
        list: List of retrieved results, containing the instruction texts most relevant to the query.
    """
    query_embedding = embedding_model.encode(query) # Encode the query
    results = collection.query(
        query_embeddings=[query_embedding.tolist()], # ChromaDB requires list of floats
        n_results=n_results
    )
    return results['documents'][0] # Return the list of retrieved documents

# Test the retrieval function
query_example = "translate English to French"
retrieved_instructions = search_instructions(query_example, dolly_collection2, embedding_model2)

print(f"Query: '{query_example}'")
print("Retrieved Related Instructions:")
for instruction in retrieved_instructions:
    print(f"- {instruction}")

Query: 'translate English to French'
Retrieved Related Instructions:
- Which phrases are French and which words are English: mon frère, c'est la vie, good morning, what's up, break the ice, bite the bullet, and je suis désolé.
- Where was Old French language spoken?
- Write a brief passage on how to make french toast
- What are the best places to visit in France?
- What is the capital of France?


In [79]:
def search_instructions(query, collection, embedding_model, n_results=5):
    """
    Retrieves instructions related to the query using a ChromaDB vector database.

    Args:
        query (str): User query statement
        collection (chromadb.Collection): ChromaDB collection
        embedding_model (SentenceTransformer): Sentence Transformers model
        n_results (int): Number of results to return, default is 5

    Returns:
        list: List of retrieved results, containing the instruction texts most relevant to the query.
    """
    query_embedding = embedding_model.encode(query) # Encode the query
    results = collection.query(
        query_embeddings=[query_embedding.tolist()], # ChromaDB requires list of floats
        n_results=n_results
    )
    return results['documents'][0] # Return the list of retrieved documents

# Test the retrieval function
query_example = "write a short story about a cat"
retrieved_instructions = search_instructions(query_example, dolly_collection2, embedding_model2)

print(f"Query: '{query_example}'")
print("Retrieved Related Instructions:")
for instruction in retrieved_instructions:
    print(f"- {instruction}")

Query: 'write a short story about a cat'
Retrieved Related Instructions:
- Write a short story about a cat named Bistro that is full of adventure.
- Write a poem about a cat owner who gets transported to a parallel universe where highly advanced cats keep humans as pets.
- Tell me a story about how the little kitten was rescued from a heater vent. The story should include how the kitten was bottle fed and only four days old with the umbilical cord still attached
- What is a cat?
- Write a story about a cat named Rory and the adventures he gets up to while his owner, Maria, is gone at work. Write the story in the form of an hourly journal from 7am to 7 pm written by Rory, and convey that Rory loves to sleep while Maria is gone.


### paraphrase-multilingual-mpnet-base-v2

In [71]:
# 1. Select an Embedding Model
embedding_model_name3 = "paraphrase-multilingual-mpnet-base-v2"
embedding_model3 = SentenceTransformer(embedding_model_name3)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [72]:
# 2. Create a ChromaDB Client and Collection
chroma_client3 = chromadb.Client()
collection_name3 = "dolly_instructions3"
dolly_collection3 = chroma_client3.get_or_create_collection(collection_name3)

In [73]:
# 3. Generate Text Embeddings and Add to ChromaDB
instructions3 = dolly_dataset['train']['instruction'] # Get all instructions
instruction_embeddings3 = embedding_model3.encode(instructions3) # Generate embedding vectors

In [74]:
# Add data to the ChromaDB collection (ids are string-type indices, documents are original texts, embeddings are vectors)
dolly_collection3.add(
    ids=[str(i) for i in range(len(instructions3))], # Use indices as IDs
    embeddings=instruction_embeddings3.tolist(), # ChromaDB requires list of floats
    documents=instructions3 # Original instruction texts
)

print(f"ChromaDB collection '{collection_name3}' has added {dolly_collection3.count()} data entries.")

ChromaDB collection 'dolly_instructions3' has added 15011 data entries.


In [76]:
def search_instructions(query, collection, embedding_model, n_results=5):
    """
    Retrieves instructions related to the query using a ChromaDB vector database.

    Args:
        query (str): User query statement
        collection (chromadb.Collection): ChromaDB collection
        embedding_model (SentenceTransformer): Sentence Transformers model
        n_results (int): Number of results to return, default is 5

    Returns:
        list: List of retrieved results, containing the instruction texts most relevant to the query.
    """
    query_embedding = embedding_model.encode(query) # Encode the query
    results = collection.query(
        query_embeddings=[query_embedding.tolist()], # ChromaDB requires list of floats
        n_results=n_results
    )
    return results['documents'][0] # Return the list of retrieved documents

# Test the retrieval function
query_example = "How to write a poem?"
retrieved_instructions = search_instructions(query_example, dolly_collection3, embedding_model3)

print(f"Query: '{query_example}'")
print("Retrieved Related Instructions:")
for instruction in retrieved_instructions:
    print(f"- {instruction}")

Query: 'How to write a poem?'
Retrieved Related Instructions:
- Make a poem about color and style
- Write a poem about a raven
- Write a poem about lost friendship
- Write a short poem about spring
- Write a brief poem about cultures


In [80]:
def search_instructions(query, collection, embedding_model, n_results=5):
    """
    Retrieves instructions related to the query using a ChromaDB vector database.

    Args:
        query (str): User query statement
        collection (chromadb.Collection): ChromaDB collection
        embedding_model (SentenceTransformer): Sentence Transformers model
        n_results (int): Number of results to return, default is 5

    Returns:
        list: List of retrieved results, containing the instruction texts most relevant to the query.
    """
    query_embedding = embedding_model.encode(query) # Encode the query
    results = collection.query(
        query_embeddings=[query_embedding.tolist()], # ChromaDB requires list of floats
        n_results=n_results
    )
    return results['documents'][0] # Return the list of retrieved documents

# Test the retrieval function
query_example = "explain quantum physics"
retrieved_instructions = search_instructions(query_example, dolly_collection3, embedding_model3)

print(f"Query: '{query_example}'")
print("Retrieved Related Instructions:")
for instruction in retrieved_instructions:
    print(f"- {instruction}")

Query: 'explain quantum physics'
Retrieved Related Instructions:
- What are the reasons why quantum theory is considered do difficult to understand
- What is quantum mechanics?
- what is quantum computer?
- What is a quantum computer?
- Which year was quantum computer demonstrated to be possible?


In [81]:
def search_instructions(query, collection, embedding_model, n_results=5):
    """
    Retrieves instructions related to the query using a ChromaDB vector database.

    Args:
        query (str): User query statement
        collection (chromadb.Collection): ChromaDB collection
        embedding_model (SentenceTransformer): Sentence Transformers model
        n_results (int): Number of results to return, default is 5

    Returns:
        list: List of retrieved results, containing the instruction texts most relevant to the query.
    """
    query_embedding = embedding_model.encode(query) # Encode the query
    results = collection.query(
        query_embeddings=[query_embedding.tolist()], # ChromaDB requires list of floats
        n_results=n_results
    )
    return results['documents'][0] # Return the list of retrieved documents

# Test the retrieval function
query_example = "translate English to French"
retrieved_instructions = search_instructions(query_example, dolly_collection3, embedding_model3)

print(f"Query: '{query_example}'")
print("Retrieved Related Instructions:")
for instruction in retrieved_instructions:
    print(f"- {instruction}")

Query: 'translate English to French'
Retrieved Related Instructions:
- Which phrases are French and which words are English: mon frère, c'est la vie, good morning, what's up, break the ice, bite the bullet, and je suis désolé.
- Please provide a brief summary of Le Souvenir français.
- Can you decipher this english text into a commonly used phrase to practice typing?
tcnmeahkfprzeboetyqrxdhduojoeoiwuvlg
- Given this text about speakers of varieties of Louisiana French, has the number of French speakers in Texas increased or decreased since the middle of the twentieth century?
- Where was Old French language spoken?


In [82]:
def search_instructions(query, collection, embedding_model, n_results=5):
    """
    Retrieves instructions related to the query using a ChromaDB vector database.

    Args:
        query (str): User query statement
        collection (chromadb.Collection): ChromaDB collection
        embedding_model (SentenceTransformer): Sentence Transformers model
        n_results (int): Number of results to return, default is 5

    Returns:
        list: List of retrieved results, containing the instruction texts most relevant to the query.
    """
    query_embedding = embedding_model.encode(query) # Encode the query
    results = collection.query(
        query_embeddings=[query_embedding.tolist()], # ChromaDB requires list of floats
        n_results=n_results
    )
    return results['documents'][0] # Return the list of retrieved documents

# Test the retrieval function
query_example = "write a short story about a cat"
retrieved_instructions = search_instructions(query_example, dolly_collection3, embedding_model3)

print(f"Query: '{query_example}'")
print("Retrieved Related Instructions:")
for instruction in retrieved_instructions:
    print(f"- {instruction}")

Query: 'write a short story about a cat'
Retrieved Related Instructions:
- Write a short story about a cat named Bistro that is full of adventure.
- Write a poem about a cat owner who gets transported to a parallel universe where highly advanced cats keep humans as pets.
- Write a haiku about a naughty cat.
- Write a haiku about cats
- Given this paragraph about Grumpy Cat, tell me the cat's real name.


## Question 5. Real-World Application Thinking

1. company/product related chat bot
2. fact-based short answer questions