# SETUP

In [None]:
%%capture
!pip install chromadb tqdm fireworks-ai python-dotenv pandas
!pip install sentence-transformers

In [None]:
import fireworks.client
import os
import dotenv
import chromadb
import json
from tqdm.auto import tqdm
import pandas as pd
import random
from zipfile import ZipFile
from chromadb import Documents, EmbeddingFunction, Embeddings
from sentence_transformers import SentenceTransformer

# Mount Google Drive at /content/drive (Colab only); the CSV and the Chroma store paths used
# later in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# LOADING THE MODEL

In [None]:
# Load variables from a local .env file (if one exists) into the process environment.
dotenv.load_dotenv()

# Read the key from the environment instead of pasting it into the notebook, so it is never
# saved or shared with the file. Set FIREWORKS_API_KEY in .env or in the session environment.
# The original placeholder is kept as the fallback when the variable is not set.
fireworks.client.api_key = os.environ.get("FIREWORKS_API_KEY", "API KEY")

In [None]:
def get_completion(prompt, model=None, max_tokens=50):
    """Request a text completion from Fireworks and return the generated text.

    Args:
        prompt: Text sent to the model as the completion prompt.
        model: Model name relative to "accounts/fireworks/models/". When None,
            "llama-v2-7b" is used.
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        The ``text`` of the first choice in the completion response.
    """
    model_name = "llama-v2-7b" if model is None else model

    # temperature=0 is always sent with the request.
    response = fireworks.client.Completion.create(
        model="accounts/fireworks/models/" + model_name,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0,
    )

    first_choice = response.choices[0]
    return first_choice.text

# TESTING THE MODEL

In [None]:
# Smoke test: no model is passed, so get_completion falls back to its default ("llama-v2-7b").
get_completion("Hi, my name is")

' Katie. I am a 20 year old college student. I am a very outgoing person and I love to meet new people. I am a very open minded person and I am very easy to get along with. I am a'

In [None]:
# Model name (relative to the Fireworks model directory) reused by the later cells.
mistral_llm = "mistral-7b-instruct-4k"

# Same smoke test, this time against the Mistral instruct model.
get_completion("Hi, my name is", model=mistral_llm)

" Alex. I'm a 20 year old male from the United States. I'm here to ask for advice on how to get into the field of psychology. I'm interested in studying the human mind and behavior, and I want"

# IMPORTING DATA

In [None]:
# Location of the reports CSV on the mounted Google Drive (an absolute path specific to this Drive layout).
csv_file_path = "/content/drive/MyDrive/My projects/LLM RAG BigFoot/BigFootStories.csv"

# Read the whole CSV into a DataFrame.
data_frame = pd.read_csv(csv_file_path)

In [None]:
# NOTE: this binds a second name to the same DataFrame object; it does not copy it.
bigfoot = data_frame
# Show the (rows, columns) shape of the loaded data.
print(bigfoot.shape)

(5082, 29)


In [None]:
# Preview the first rows with every column still present.
bigfoot.head()

Unnamed: 0,observed,location_details,county,state,season,title,latitude,longitude,date,number,...,precip_intensity,precip_probability,precip_type,pressure,summary,conditions,uv_index,visibility,wind_bearing,wind_speed
0,I am not sure how relevant this report will be...,"We were on our way to Rapid City, so we were h...",Washakie County,Wyoming,Summer,,,,,798.0,...,,,,,,,,,,
1,I don't know if what I saw was two bigfoots or...,"Heading to the deep mine Poca #2, the airshaft...",Wyoming County,West Virginia,Winter,Report 13237: Daylight sighting near an abando...,37.58135,-81.29745,2005-12-03,13237.0,...,0.204,100.0,"['rain', 'snow']",1018.7,Partly cloudy throughout the day with late aft...,"Snow, Rain, Partially cloudy",,9.3,168.1,17.1
2,"My family and I went to Ludlow, Vermont for Co...",It's off Rt 100 outside of Ludlow Vermont. It ...,Windsor County,Vermont,Fall,Report 13285: Evening sighting by motorists on...,43.4654,-72.7051,2005-10-08,13285.0,...,3.208,100.0,['rain'],1011.8,Cloudy skies throughout the day with a chance ...,"Rain, Overcast",,5.0,356.2,7.6
3,It was spring break 1984 and I was 16 at the t...,"Wythe county Virginia near Wytheville, looking...",Wythe County,Virginia,Spring,"Report 2285: Boy sees ""Bigfoot"" in the woods w...",37.22647,-81.09017,1984-04-08,2285.0,...,0.0,0.0,,1020.9,Partly cloudy throughout the day.,Partially cloudy,,13.0,107.3,11.5
4,It was the winter of 1996 and we were on our w...,"Hwy 182, Wood County Between Quitman, Texas an...",Wood County,Texas,Winter,Report 2048: Night time road crossing observation,32.7943,-95.5425,1996-12-22,2048.0,...,,0.0,,,Partly cloudy throughout the day.,Partially cloudy,,10.5,180.4,20.8


In [None]:
# Keep only the two free-text columns; `bigfoot` is rebound to the narrower frame.
bigfoot = bigfoot[["observed", "location_details"]]

In [None]:
# Preview the first rows of the two retained columns.
bigfoot.head()

Unnamed: 0,observed,location_details
0,I am not sure how relevant this report will be...,"We were on our way to Rapid City, so we were h..."
1,I don't know if what I saw was two bigfoots or...,"Heading to the deep mine Poca #2, the airshaft..."
2,"My family and I went to Ludlow, Vermont for Co...",It's off Rt 100 outside of Ludlow Vermont. It ...
3,It was spring break 1984 and I was 16 at the t...,"Wythe county Virginia near Wytheville, looking..."
4,It was the winter of 1996 and we were on our w...,"Hwy 182, Wood County Between Quitman, Texas an..."


# RAG

In [None]:
# Convert the frame to one dict per row ({"observed": ..., "location_details": ...}).
# Despite its name, `bigfoot_dict` is a list of dicts (orient="records").
bigfoot_dict = bigfoot.to_dict(orient="records")

In [None]:
# Inspect the first record to check the structure produced by to_dict.
bigfoot_dict[0]

{'observed': 'I am not sure how relevant this report will be, however I thought it important to add so that any other possible events in the area could be correlated. I was driving; my wife was in the passenger seat and had the "sighting". We had already been through Yellowstone, on a cross country trip and had been on the lookout for wildlife all through our driving, so my wife was very sure of what she saw. She had been good about picking out Antelope, Deer, and a Grizzly bear on earlier days during our trip. We were proceeding up the mountain pass, and were in the middle of the switchbacks when my wife saw what she described as "A man in dark clothes" on the side of the road on an upcoming part of the pass. She was visibly surprised that the "Man" was not there as we turned the corner where he would have been. There were steep cliffs on each side of the road, with no turnouts and very little shoulder. She told me that she was paying attention as we rounded the switchback were he wou

In [None]:
# Sentence-transformers model used to embed the stories (and, through embed_fn, the query texts).
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

class MyEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function backed by the module-level ``embedding_model``."""

    def __call__(self, input: Documents) -> Embeddings:
        """Encode a batch of documents and return the vectors as plain Python lists."""
        batch_embeddings = embedding_model.encode(input)
        return batch_embeddings.tolist()

embed_fn = MyEmbeddingFunction()

# Initialize the chromadb directory, and client.
client = chromadb.PersistentClient(path="/content/drive/MyDrive/UCA/NLP") # THIS IS MY OWN PATH

# Create (or reopen) the collection with the embedding function attached from the start, so the
# handle used for indexing and the one used for querying are configured the same way.
collection = client.get_or_create_collection(
    name="BigFoot-Stories",
    embedding_function=embed_fn,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Generate embeddings, and index titles in batches
# Generate embeddings and index the stories in batches.
batch_size = 128

# Loop through the batches, generating and storing the embeddings for each one.
for i in tqdm(range(0, len(bigfoot_dict), batch_size)):

    # Slicing past the end of the list is safe, so the last batch is simply shorter.
    batch = bigfoot_dict[i : i + batch_size]

    # Replace missing observations with "No observation". Missing CSV cells arrive as NaN
    # (which str() would turn into the text "nan"), so check for them as well as for "".
    batch_stories = [
        "No observation"
        if pd.isna(story["observed"]) or str(story["observed"]) == ""
        else str(story["observed"])
        for story in batch
    ]

    # Use the row position as the ID: unique within the dataset and identical on every run,
    # so re-running this cell overwrites the same entries instead of adding duplicates.
    # NOTE: entries written by earlier runs under the old random IDs remain in the persisted
    # collection until it is deleted and rebuilt.
    batch_ids = [str(i + offset) for offset in range(len(batch))]

    # generate embeddings
    batch_embeddings = embedding_model.encode(batch_stories)

    # upsert to chromadb
    collection.upsert(
        ids=batch_ids,
        documents=batch_stories,
        embeddings=batch_embeddings.tolist(),
    )



  0%|          | 0/40 [00:00<?, ?it/s]

In [None]:
# Reopen the collection, this time passing the custom embedding function.
collection = client.get_or_create_collection(name="BigFoot-Stories", embedding_function=embed_fn)

# Sanity check: fetch the two closest stories for a generic query and show their text.
retriever_results = collection.query(query_texts=["BigFoot Stories"], n_results=2)
print(retriever_results["documents"])

[['when i was a kid we lived out side of yakima out in the country by parker a little town out side of yakima . Near the yakima river matter of fact we were so close to the river that in the spring when the river rose we had to move the animals an go into town .the woods were thick we had a pig pen it was a long ways away from the house really thickly wooded. all the time we lived there you wood hear shrill whistling whistling in the bushes at this time we were maybe ten years old never heard of bigfoot before im 48 years old now so it was long ago.It was in the ninteen sixtys way before roger patterson nobody knew about bigfoot . Well anyway us kids were sent into the woods to feed the cows at the big pond . We had two cows we had a little wagon with a bail of hay on it .We left it there for them to eat we left and stopped by the river to talk. Four of us kids we were all looking the same way when all of a sudden 30 feet away something something stepped from the clearing in full view 

# TASK 1: Bigfoot aesthetics

In [None]:
user_query = "bigfoot aesthetics"

# Retrieve the ten stored observations closest to the query.
results = collection.query(
    query_texts=[user_query],
    n_results=10,
)

# Drop exact duplicate reports (keeping retrieval order) so repeated text does not
# take up room in the prompt.
stories = '\n'.join(dict.fromkeys(results['documents'][0]))


prompt_template = f'''[INST]

Your task is to provide a summary DESCRIPTION of what the bigfoot looks like, and your description must be based on OBSERVATIONS reports

You should provide a precise and concise answer, and PLEASE DO NOT include the same words that are in OBSERVATIONS, only generate a summary DESCRIPTION.

OBSERVATIONS: {stories}

DESCRIPTION:

[/INST]
'''

# get_completion returns the generated text as a single string, so it is used as is
# (the previous character-by-character join rebuilt the very same string).
responses = get_completion(prompt_template, model=mistral_llm, max_tokens=500)
suggested_titles = responses

# Print the model's answer, then the exact prompt that produced it.
print("Model Suggestions:")
print(suggested_titles)
print("\n\n\nPrompt Template:")
print(prompt_template)

Model Suggestions:

Based on the observations reported, the bigfoot is described as being at least 7 feet tall, hairy, and fast, with a strange, almost human-like gait. It is also said to have large, footprints with no arch, and the texture of a crumbled paper bag. Some witnesses have also reported seeing the bigfoot near bodies of water, such as rivers and lakes, and hearing strange, otherworldly cries. Overall, the bigfoot is described as a mysterious, elusive creature that is difficult to observe and document.



Prompt Template:
[INST]

Your task is to provide a summary DESCRIPTION of what the bigfoot looks like, and your description must be based on OBSERVATIONS reports

You should provide a precise and concise answer, and PLEASE DO NOT include the same words that are in OBSERVATIONS, only generate a summary DESCRIPTION.

OBSERVATIONS: I saw a bigfoot and I have pictures of its foot prints
I saw a bigfoot and I have pictures of its foot prints
I know of a friend who saw a bigfoot 

In [None]:
# Test 1: baseline answer from Mistral's own knowledge (no retrieved context).

mistral_llm = "mistral-7b-instruct-4k"

# Use the same [INST] wrapper and 500-token budget as the RAG cell, so the baseline is a
# like-for-like comparison and is not cut off by get_completion's default of 50 tokens.
get_completion("[INST] What does the bigfoot look like? [/INST]", model=mistral_llm, max_tokens=500)

'\n\nThe bigfoot, also known as Sasquatch, is a legendary creature said to inhabit parts of North America. Descriptions of its appearance vary widely, but most accounts describe it as a large, hairy, ape-like'

# TASK 2: Bigfoot footprints

In [None]:
user_query = "bigfoot footprints"

# Retrieve the ten stored observations closest to the query.
results = collection.query(
    query_texts=[user_query],
    n_results=10,
)

# Drop exact duplicate reports (keeping retrieval order) so repeated text does not
# take up room in the prompt.
stories = '\n'.join(dict.fromkeys(results['documents'][0]))


prompt_template = f'''[INST]

Your task is to provide a summary DESCRIPTION of the bigfoot footprints, and your description must be based on OBSERVATIONS reports

You should provide a precise and concise answer, and PLEASE DO NOT include the same words that are in OBSERVATIONS, only generate a summary DESCRIPTION.

OBSERVATIONS: {stories}

DESCRIPTION:

[/INST]
'''

# get_completion returns the generated text as a single string, so it is used as is
# (the previous character-by-character join rebuilt the very same string).
responses = get_completion(prompt_template, model=mistral_llm, max_tokens=500)
suggested_titles = responses

# Print the model's answer, then the exact prompt that produced it.
print("Model Suggestions:")
print(suggested_titles)
print("\n\n\nPrompt Template:")
print(prompt_template)

Model Suggestions:

The bigfoot footprints are described as being very large and wide, with a space between them that is too wide to be human. They were found in a vegetable garden and in the snow, and were left behind by a creature that is believed to be a bigfoot. The prints were estimated to be around 14 inches long with toes, and had a slight curve in the arch area. Some of the prints were found in the snow and were described as being about 5 inches longer and double the width of a man's shoe, while others were found in the vegetable garden and were described as being perfect because the dirt was just tilled for spring planting. The prints were not captured with plaster or a picture, but the witnesses were able to estimate their size and shape based on their own footprints. The prints were left as a mystery until they were recently featured on a television show about bigfoot.



Prompt Template:
[INST]

Your task is to provide a summary DESCRIPTION of the bigfoot footprints, and yo

In [None]:
# Test 2: baseline answer from Mistral's own knowledge (no retrieved context).

mistral_llm = "mistral-7b-instruct-4k"

# Use the same [INST] wrapper and 500-token budget as the RAG cell, so the baseline is a
# like-for-like comparison and is not cut off by get_completion's default of 50 tokens.
get_completion("[INST] Describe bigfoot footprints [/INST]", model=mistral_llm, max_tokens=500)

'.\n\nBigfoot, also known as Sasquatch, is a legendary creature said to inhabit parts of North America. If Bigfoot exists, it is likely that it leaves behind footprints, which can provide evidence of its existence. Big'

# TASK 3: Bigfoot locations

In [None]:
user_query = "bigfoot locations"

# Retrieve the ten stored observations closest to the query.
results = collection.query(
    query_texts=[user_query],
    n_results=10,
)

# Drop exact duplicate reports (keeping retrieval order) so repeated text does not
# take up room in the prompt.
stories = '\n'.join(dict.fromkeys(results['documents'][0]))


prompt_template = f'''[INST]

Your task is to provide a summary DESCRIPTION of the locations where it is common to encounter the bigfoot, and your description must be based on OBSERVATIONS reports

You should provide a precise and concise answer, and PLEASE DO NOT include the same words that are in OBSERVATIONS, only generate a summary DESCRIPTION.

OBSERVATIONS: {stories}

DESCRIPTION:

[/INST]
'''

# get_completion returns the generated text as a single string, so it is used as is
# (the previous character-by-character join rebuilt the very same string).
responses = get_completion(prompt_template, model=mistral_llm, max_tokens=500)
suggested_titles = responses

# Print the model's answer, then the exact prompt that produced it.
print("Model Suggestions:")
print(suggested_titles)
print("\n\n\nPrompt Template:")
print(prompt_template)

Model Suggestions:

Based on the observations provided, it appears that bigfoot sightings are most common in areas with dense forests and mountainous terrain, such as Ringwood, NJ, Granite City, IL, and the Cascade foothills in Washington state. These areas may provide suitable habitats for bigfoot, as they offer ample cover and food sources. Additionally, the presence of bigfoot research teams in these areas may indicate a higher likelihood of encountering the creature. However, it is important to note that bigfoot sightings are generally considered to be unproven and many people do not believe in their existence.



Prompt Template:
[INST]

Your task is to provide a summary DESCRIPTION of the locations where it is common to encounter the bigfoot, and your description must be based on OBSERVATIONS reports

You should provide a precise and concise answer, and PLEASE DO NOT include the same words that are in OBSERVATIONS, only generate a summary DESCRIPTION.

OBSERVATIONS: In 1978-1979 

In [None]:
# Test 3: baseline answer from Mistral's own knowledge (no retrieved context).

mistral_llm = "mistral-7b-instruct-4k"

# Use the same [INST] wrapper and 500-token budget as the RAG cell, so the baseline is a
# like-for-like comparison and is not cut off by get_completion's default of 50 tokens.
get_completion("[INST] Where can I find the bigfoot? [/INST]", model=mistral_llm, max_tokens=500)

'\n\nBigfoot, also known as Sasquatch, is a legendary creature said to inhabit parts of North America. There is no scientific evidence to support the existence of bigfoot, and many experts consider it a myth. However, if you'

# TASK 4: Bigfoot running

In [None]:
user_query = "bigfoot running"

# Retrieve the ten stored observations closest to the query.
results = collection.query(
    query_texts=[user_query],
    n_results=10,
)

# Drop exact duplicate reports (keeping retrieval order) so repeated text does not
# take up room in the prompt.
stories = '\n'.join(dict.fromkeys(results['documents'][0]))


prompt_template = f'''[INST]

Your task is to provide a summary DESCRIPTION of how the bigfoot runs, and your description must be based on OBSERVATIONS reports

You should provide a precise and concise answer, and PLEASE DO NOT include the same words that are in OBSERVATIONS, only generate a summary DESCRIPTION.

OBSERVATIONS: {stories}

DESCRIPTION:

[/INST]
'''

# get_completion returns the generated text as a single string, so it is used as is
# (the previous character-by-character join rebuilt the very same string).
responses = get_completion(prompt_template, model=mistral_llm, max_tokens=500)
suggested_titles = responses

# Print the model's answer, then the exact prompt that produced it.
print("Model Suggestions:")
print(suggested_titles)
print("\n\n\nPrompt Template:")
print(prompt_template)

Model Suggestions:

Based on the observations reported, it appears that bigfoot, also known as Sasquatch, runs in a slow, jogging manner, typically on two legs. They are described as being over 6 feet tall, with long matted hair, heavy set, and a "fat head" and whiskers. Bigfoot have been observed in remote areas, such as Holson Valley Road and behind Boston Mills Ski Resort in the Cuyahoga National Recreation Area. They tend to move stealthily and are often seen running along bike paths or waterways. Some witnesses have described bigfoot as being more manlike than ape-like in appearance, while others have reported seeing a younger, hairier creature that moved rapidly and was more upright. Bigfoot have also been observed hunting deer and other animals.



Prompt Template:
[INST]

Your task is to provide a summary DESCRIPTION of how the bigfoot runs, and your description must be based on OBSERVATIONS reports

You should provide a precise and concise answer, and PLEASE DO NOT include the

In [None]:
# Test 4: baseline answer from Mistral's own knowledge (no retrieved context).

mistral_llm = "mistral-7b-instruct-4k"

# Use the same [INST] wrapper and 500-token budget as the RAG cell, so the baseline is a
# like-for-like comparison and is not cut off by get_completion's default of 50 tokens.
get_completion("[INST] How does the bigfoot run? [/INST]", model=mistral_llm, max_tokens=500)

'\n\nBigfoot, also known as Sasquatch, is a legendary creature said to inhabit parts of North America. There is no scientific evidence to support the existence of bigfoot, and the way it is typically depicted in popular culture, including'

# TASK 5: Bigfoot behaviour

In [None]:
user_query = "bigfoot behaviour"

# Retrieve the ten stored observations closest to the query.
results = collection.query(
    query_texts=[user_query],
    n_results=10,
)

# Drop exact duplicate reports (keeping retrieval order) so repeated text does not
# take up room in the prompt.
stories = '\n'.join(dict.fromkeys(results['documents'][0]))


prompt_template = f'''[INST]

Your task is to provide a summary DESCRIPTION of the bigfoot behaviour, and your description must be based on OBSERVATIONS reports

You should mimic a similar style and length as OBSERVATIONS, but PLEASE DO NOT include the same words that are in OBSERVATIONS, only generate a summary DESCRIPTION.

OBSERVATIONS: {stories}

DESCRIPTION:

[/INST]
'''

# get_completion returns the generated text as a single string, so it is used as is
# (the previous character-by-character join rebuilt the very same string).
responses = get_completion(prompt_template, model=mistral_llm, max_tokens=500)
suggested_titles = responses

# Print the model's answer, then the exact prompt that produced it.
print("Model Suggestions:")
print(suggested_titles)
print("\n\n\nPrompt Template:")
print(prompt_template)



Model Suggestions:

The bigfoot behavior in the Norman, Oklahoma area has been characterized by sightings, footprints, vocalizations, and other incidents. In 1976, two sightings were reported and footprints were taken. In January 2003, a sighting was reported by a family member. In 2000, a fisherman reported hearing cries around the Scioto River and in 2003, a man saw what he believed to be a bigfoot near Lake White. The bigfoot is described as being tall, with a slight hunch, and moving stealthily like a kung fu fighter. It is also reported to be intelligent and not afraid of humans. The bigfoot is believed to travel the waterways as we do roads.



Prompt Template:
[INST]

Your task is to provide a summary DESCRIPTION of the bigfoot behaviour, and your description must be based on OBSERVATIONS reports

You should mimic a similar style and length as OBSERVATIONS, but PLEASE DO NOT include the same words that are in OBSERVATIONS, only generate a summary DESCRIPTION.

OBSERVATIONS: My f

In [None]:
# Test 5: baseline answer from Mistral's own knowledge (no retrieved context).

mistral_llm = "mistral-7b-instruct-4k"

# Use the same [INST] wrapper and 500-token budget as the RAG cell, so the baseline is a
# like-for-like comparison and is not cut off by get_completion's default of 50 tokens.
get_completion("[INST] How does the bigfoot behave? [/INST]", model=mistral_llm, max_tokens=500)

'\n\nBigfoot, also known as Sasquatch, is a creature from Native American folklore that is said to inhabit parts of North America. There is no scientific evidence to support the existence of bigfoot, and the behavior of big'