In [1]:
from dotenv import load_dotenv

load_dotenv()
import weaviate
import weaviate.classes as wvc
import os
import requests
import json
from bs4 import BeautifulSoup
from vertexai.generative_models import GenerativeModel, ChatSession
import subprocess
import weaviate
import vertexai

vertexai.init(project=os.getenv("PROJECT_ID"), location=os.getenv("REGION"))

def refresh_token() -> str:
    result = subprocess.run(
        ["gcloud", "auth", "print-access-token"], capture_output=True, text=True
    )
    if result.returncode != 0:
        print(f"Error refreshing token: {result.stderr}")
        return None
    return result.stdout.strip()


def re_instantiate_weaviate() -> weaviate.Client:
    token = refresh_token()
    
    client = weaviate.connect_to_local(
        headers={
            "X-Google-Vertex-Api-Key": token,
            "X-Openai-Api-Key": os.getenv("OPENAI_API_KEY"),
        },
    )
    return client


# Run this every ~60 minutes
client = re_instantiate_weaviate()

In [2]:
try:
      client.collections.create(
            name="ufc",
            vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),    # Set the vectorizer to "text2vec-openai" to use the OpenAI API for vector-related operations
            generative_config=wvc.config.Configure.Generative.palm(
                  project_id=os.getenv("PROJECT_ID"),
                  model_id="gemini-1.5-pro-preview-0514"
            )
      )
except Exception as e:
      if e.status_code == 422:
            print("Collection already exists")
      else:
            raise e

Collection already exists


In [15]:
ufc = client.collections.get("ufc")

src_path = "/Users/davidzimberknopf/Documents/Apps/ufc-crawler/data/event/fight"
sources = []

# For each file in the source directory, create a new Source object
for file in os.listdir(src_path):
    with open(os.path.join(src_path, file), "r") as f:
        raw = f.read()
        soup = BeautifulSoup(raw, "html.parser")
        title = soup.find("title")
        body = soup.find("body")
        title_body = (
            (title.text if title is not None else "")
            + (body.text if body is not None else "")
        ).replace("\n", " ")
        if title_body != "":
            ufc.data.insert({"raw": title_body[: 8192 * 4]})


for i in ufc.iterator():
  if i.properties.get("raw") == "":
    ufc.data.delete_by_id(i.uuid)

Josh Parisian v Jamal Pogues Fight Statistics         Sports-Statistics.com      UFC NBA NFL MLB NHL TENNIS                    Topics   Sports Stats  SPORTS STATISTICS UFC Stats (X)   BY SPORT  All SPORTS F1 Darts NBA WNBA NFL NHL MLB Tennis UFC Sports (X)   Sports Betting  All  Sports Betting NBA Odds NFL Odds NHL Odds MLB Odds UFC Odds Odds Calculator How Odds Work (X)   Meta  All Meta Privacy Policy Contact Us (X)                  Home >                       UFC Fight Statistics >                     UFC Fight Night: Andrade vs. Blanchfield  >                       Josh Parisian v Jamal Pogues Fight Statistics       	 			February 18, 2023, Las Vegas, Nevada, USA	 		    			Josh Parisian			 (Loss)   				Jamal Pogues				 (Win)				     Heavyweight Bout  Method: Decision - Unanimous  Round: 3  Time: 5:00  Time Format: 3 Rounds (5-5-5)  Referee: Jacob Montalvo  Details: Decision       Fight Totals      Josh Parisian  Jamal Pogues    0Knockdowns037 of 113Significant Strikes33 of 60113Sign

In [21]:
ufc = client.collections.get("ufc")

# print(ufc.config.get().properties)

for i in ufc.iterator():
  if i.properties.get("raw") == "":
    ufc.data.delete_by_id(i.uuid)

# Get size of collection
# print(sum(1 for i in ufc.iterator()))

# client.collections.delete("ufc")


# for i in ufc.iterator():
#     print(i.properties.get("raw"))

In [4]:
model = GenerativeModel(
    "gemini-1.5-pro-001",
    system_instruction="""
You are an assistant that helps to form nice and human understandable answers.
The information part contains the provided information that you must use to construct an answer.
The provided information is authoritative, you must never doubt it or try to use your internal knowledge to correct or answer it.
Make the answer sound as a response to the question. Do not mention that you based the result on the given information.
Do not answer more than the question asks for.
Here is an example:

Question: Which managers own Neo4j stocks?
Context:[manager:CTL LLC, manager:JANE STREET GROUP LLC]
Helpful Answer: CTL LLC, JANE STREET GROUP LLC owns Neo4j stocks.

If the provided information is empty, say that you don't know the answer.
""",
)

generate_prompt = """
Use the following knowledge to answer the question at the end. 

History: {history}

Context: {raw}

Question: {question}

"""

In [22]:
# instruction for the generative module
question = (
    "What were the last 5 fights? When were they? How long did they last?"
)


ufc = client.collections.get("ufc")
response = ufc.query.near_text(query=question, limit=5)
context = []
for r in response.objects:
    context.append(r.properties)

print(context)

answer = model.generate_content(generate_prompt.format(raw=context, question=question, history=[]))

print(answer.text)

QueryReturn(objects=[Object(uuid=_WeaviateUUIDInt('e934c0fb-f9ae-4833-b3ed-a6ec7a6f5081'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'raw': 'Michael Morales v Jake Matthews Fight Statistics         Sports-Statistics.com      UFC NBA NFL MLB NHL TENNIS                    Topics   Sports Stats  SPORTS STATISTICS UFC Stats (X)   BY SPORT  All SPORTS F1 Darts NBA WNBA NFL NHL MLB Tennis UFC Sports (X)   Sports Betting  All  Sports Betting NBA Odds NFL Odds NHL Odds MLB Odds UFC Odds Odds Calculator How Odds Work (X)   Meta  All Meta Privacy Policy Contact Us (X)                  Home >                       UFC Fight Statistics >                     UFC Fight Night: Allen vs. Craig  >                       Michael Morales v Jake Matthews Fight Statistics       \t \t\t\tNovember 18, 2023, Las Vegas, Nevada, USA\t \t\t    \t\t\tMichael Morales\t\t\t (Win

In [8]:
# instruction for the generative module
question = (
    "How many takedowns were in the 'UFC Fight Night: Tuivasa vs. Tybura' event?"
)


ufc = client.collections.get("ufc")
response = ufc.query.near_text(query=question, limit=5)

context = []
for r in response.objects:
    context.append(r.properties.get("raw"))

answer = model.generate_content(generate_prompt.format(raw=context, question=question, history=[]))

print(answer.text)

There was 1 takedown in the fight. 



In [9]:
# instruction for the generative module
question1 = "Who is Salsa Boy?"


ufc = client.collections.get("ufc")
response1 = ufc.query.near_text(query=question1, limit=5)

context1 = [r.properties.get("raw") for r in response1.objects]

answer1 = model.generate_content(
    generate_prompt.format(raw=context1, question=question1, history=[])
)

print(answer1.text)

question2 = "How many takedown attempts did he have in all fights?"

response2 = ufc.query.near_text(query=question2, limit=5)

context2 = [r.properties.get("raw") for r in response2.objects]

answer2 = model.generate_content(
    generate_prompt.format(
        raw=context2, question=question2, history=[[question1, answer1.text]]
    )
)

print(answer2.text)

Waldo Cortes-Acosta's nickname is "Salsa Boy". 

I'm sorry, but I cannot provide an answer to your question about how many takedown attempts a specific fighter had in all fights using the provided text. 

