In [3]:
%pip install weaviate-client
from dotenv import load_dotenv

load_dotenv()
import weaviate
import weaviate.classes as wvc
import os
import requests
import json
from bs4 import BeautifulSoup
from vertexai.generative_models import GenerativeModel, ChatSession
import subprocess
import weaviate
import vertexai

# Initialise the Vertex AI SDK; project and region are read from the
# PROJECT_ID and REGION environment variables.
vertexai.init(
    project=os.getenv("PROJECT_ID"),
    location=os.getenv("REGION"),
)

def refresh_token() -> "str | None":
    """Fetch a Google Cloud access token from the gcloud CLI.

    Runs ``gcloud auth print-access-token`` and returns its stdout with
    surrounding whitespace removed.

    Returns:
        The access token, or ``None`` when the command could not be started
        or exited with a non-zero status. In both failure cases the reason
        is printed.
    """
    try:
        result = subprocess.run(
            ["gcloud", "auth", "print-access-token"], capture_output=True, text=True
        )
    except OSError as exc:
        # e.g. gcloud is not on PATH: report it like any other refresh failure
        # instead of letting the traceback escape.
        print(f"Error refreshing token: {exc}")
        return None
    if result.returncode != 0:
        print(f"Error refreshing token: {result.stderr}")
        return None
    return result.stdout.strip()


def re_instantiate_weaviate() -> "weaviate.WeaviateClient":
    """Open a new connection to the local Weaviate instance with fresh credentials.

    A Google access token is obtained through ``refresh_token()`` and sent as
    the ``X-Google-Vertex-Api-Key`` header; the value of ``OPENAI_API_KEY`` is
    sent as ``X-Openai-Api-Key``.

    A credential that is unavailable (token refresh failed, environment
    variable unset) is left out of the headers, with a printed notice, rather
    than being passed along as ``None``.

    Returns:
        The client returned by ``weaviate.connect_to_local``.
    """
    candidate_headers = {
        "X-Google-Vertex-Api-Key": refresh_token(),
        "X-Openai-Api-Key": os.getenv("OPENAI_API_KEY"),
    }

    headers = {}
    for header_name, value in candidate_headers.items():
        if value is None:
            # Header values must be strings; skip the header instead of sending None.
            print(f"Warning: no value available for {header_name}; header omitted")
            continue
        headers[header_name] = value

    return weaviate.connect_to_local(headers=headers)


# Run this every ~60 minutes (the connection headers carry a freshly issued
# gcloud access token).
# Re-running the cell rebinds `client`, so remember any existing connection and
# close it once the replacement is up; otherwise it would be left open.
_stale_client = globals().get("client")
client = re_instantiate_weaviate()
if _stale_client is not None:
    _stale_client.close()

  pid, fd = os.forkpty()



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
# Create the "ufc" collection; if it is already there, report that and carry on.
try:
    client.collections.create(
        name="ufc",
        # Set the vectorizer to "text2vec-openai" to use the OpenAI API for
        # vector-related operations
        vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),
        generative_config=wvc.config.Configure.Generative.palm(
            project_id=os.getenv("PROJECT_ID"),
            model_id="gemini-1.5-pro-preview-0514",
        ),
    )
except Exception as e:
    # Not every exception has a `status_code` attribute (connection errors,
    # validation errors, ...). Reading it unguarded would raise AttributeError
    # and hide the real failure.
    if getattr(e, "status_code", None) == 422:
        print("Collection already exists")
    else:
        # Bare raise keeps the original traceback intact.
        raise

In [6]:
ufc = client.collections.get("ufc")

src_path = "data/fight"
sources = []

# Maximum number of characters stored per document.
# NOTE(review): presumably sized for the embedding model's input limit - confirm.
MAX_RAW_CHARS = 8192 * 4

# For each HTML file in the source directory, store its title + body text as
# one object in the collection.
for file in sorted(os.listdir(src_path)):
    file_path = os.path.join(src_path, file)
    if not os.path.isfile(file_path):
        # Sub-directories (and other non-files) cannot be opened as documents.
        continue

    # Explicit encoding so the result does not depend on the platform default;
    # undecodable bytes are replaced rather than aborting the whole import.
    with open(file_path, "r", encoding="utf-8", errors="replace") as f:
        raw = f.read()

    soup = BeautifulSoup(raw, "html.parser")
    title = soup.find("title")
    body = soup.find("body")
    title_body = (
        (title.text if title is not None else "")
        + (body.text if body is not None else "")
    ).replace("\n", " ")

    # Skip pages with no visible text. Newlines were just turned into spaces,
    # so a page of blank lines is whitespace-only rather than "".
    if title_body.strip():
        ufc.data.insert({"raw": title_body[:MAX_RAW_CHARS]})


# Remove any objects whose stored text is empty.
for i in ufc.iterator():
    if i.properties.get("raw") == "":
        ufc.data.delete_by_id(i.uuid)

In [7]:
ufc = client.collections.get("ufc")

# print(ufc.config.get().properties)

# Walk the whole collection and delete every object whose "raw" text is the
# empty string.
for obj in ufc.iterator():
    if obj.properties.get("raw") != "":
        continue
    ufc.data.delete_by_id(obj.uuid)

# Get size of collection
# print(sum(1 for i in ufc.iterator()))

# client.collections.delete("ufc")


# for i in ufc.iterator():
#     print(i.properties.get("raw"))

In [8]:
# Prompt template filled in per question; placeholders are {history}, {raw}
# (the retrieved context) and {question}.
generate_prompt = """
Use the following knowledge to answer the question at the end. 

History: {history}

Context: {raw}

Question: {question}

"""

# Gemini model used to phrase the final answer. The system instruction tells it
# to answer only from the supplied context and to say it does not know when
# that context is empty.
model = GenerativeModel(
    model_name="gemini-1.5-pro-001",
    system_instruction="""
You are an assistant that helps to form nice and human understandable answers.
The information part contains the provided information that you must use to construct an answer.
The provided information is authoritative, you must never doubt it or try to use your internal knowledge to correct or answer it.
Make the answer sound as a response to the question. Do not mention that you based the result on the given information.
Do not answer more than the question asks for.
Here is an example:

Question: Which managers own Neo4j stocks?
Context:[manager:CTL LLC, manager:JANE STREET GROUP LLC]
Helpful Answer: CTL LLC, JANE STREET GROUP LLC owns Neo4j stocks.

If the provided information is empty, say that you don't know the answer.
""",
)

In [10]:
# Question to answer from the documents retrieved out of the "ufc" collection
question = (
    "What were the last 5 fights? When were they? How many rounds did they have?"
)


ufc = client.collections.get("ufc")
# Semantic search: the 5 stored documents closest to the question
response = ufc.query.near_text(query=question, limit=5)
context = [r.properties for r in response.objects]

# Only the document text is sent to the model (a list of plain strings rather
# than a list of {"raw": ...} dicts). An object without "raw" counts as empty
# text instead of raising.
context_texts = [props.get("raw") or "" for props in context]

context_size = len(context)
context_chars = sum(len(text) for text in context_texts)

print("Context size: ", str(context_size))
print("Context characters: ", str(context_chars))

answer = model.generate_content(
    generate_prompt.format(raw=context_texts, question=question, history=[])
)

print(answer.text)

Context size:  5
Context characters:  13327




The last 5 fights were:  Michael Morales defeated Jake Matthews on November 18, 2023 in a 3 round fight. Morgan Charriere lost to Chepe Mariscal on April 6, 2024 in a 3 round fight. Bryan Battle lost to Ange Loosa on March 16, 2024 in a 2 round fight. Bryan Battle defeated AJ Fletcher on September 23, 2023 in a 2 round fight. Morgan Charriere defeated Manolo Zecchini on September 2, 2023 in a 1 round fight. 



In [11]:
# Question to answer from the documents retrieved out of the "ufc" collection
question = (
    "How many takedowns did Alexandre Pantoja have in all fights?"
)


ufc = client.collections.get("ufc")
# Semantic search: the 5 stored documents closest to the question
response = ufc.query.near_text(query=question, limit=5)
context = [r.properties for r in response.objects]

# Only the document text is sent to the model (a list of plain strings rather
# than a list of {"raw": ...} dicts). An object without "raw" counts as empty
# text instead of raising.
context_texts = [props.get("raw") or "" for props in context]

context_size = len(context)
context_chars = sum(len(text) for text in context_texts)

print("Context size: ", str(context_size))
print("Context characters: ", str(context_chars))

answer = model.generate_content(
    generate_prompt.format(raw=context_texts, question=question, history=[])
)

print(answer.text)

Context size:  5
Context characters:  16685
Alexandre Pantoja had 17 takedowns in all fights. 



In [12]:
# Question to answer from the documents retrieved out of the "ufc" collection
question1 = "Who is Salsa Boy?"

ufc = client.collections.get("ufc")
response1 = ufc.query.near_text(query=question1, limit=5)

# Keep just the "raw" text of each retrieved object.
context1 = []
for hit in response1.objects:
    context1.append(hit.properties.get("raw"))

prompt1 = generate_prompt.format(raw=context1, question=question1, history=[])
answer1 = model.generate_content(prompt1)

# Report how much context was handed to the model.
context_size = len(context1)
context_chars = sum(len(text) for text in context1)

print("Context size: ", str(context_size))
print("Context characters: ", str(context_chars))

print(answer1.text)

Context size:  5
Context characters:  13848
Waldo Cortes-Acosta's nickname is "Salsa Boy". 

