In [2]:
import pandas as pd
import numpy as np

import google.generativeai as genai
import os

In [3]:
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

In [68]:
df = pd.read_parquet("data/embedding/Formula 1 Race - 2021 British Grand Prix-ALO.parquet")
# df = pd.read_parquet("temp.parquet")
print(len(df))

df.embedding = df.embedding.map(np.array)

df.head()

5


Unnamed: 0,start,end,text,embedding
0,40.578,43.198,"Three a tick. Yep, that's that one clear.","[0.014806596, -0.013747013, -0.056391202, -0.0..."
1,50.018,52.35,"What the other car did on the front wing, do w...","[0.011937778, -0.017356517, -0.039998524, -0.0..."
2,73.25,80.574,"Okay, so they all said a little bit understeer...","[0.002522055, -0.047273237, -0.033486698, 0.00..."
3,130.69,133.758,"Okay, so just under five minutes to go.","[0.021284627, -0.017589511, -0.037471864, -0.0..."
4,134.978,144.926,Peaceful sound from me. Remind us. Pit to be l...,"[0.0003933713, -0.021510402, -0.035683434, -0...."


In [69]:
embeddings = np.array(df.embedding.tolist())
embeddings.shape

(5, 768)

In [6]:
model = genai.GenerativeModel("gemini-1.5-flash")

In [6]:
model = genai.GenerativeModel("gemini-1.5-flash")

In [59]:
query = "Were there any undercuts?"

query_embedded = np.array(genai.embed_content(
    model="models/text-embedding-004",
    content=query,
    task_type="retrieval_document",
)["embedding"])

query_embedded.shape

(768,)

In [9]:
similarities = embeddings @ query_embedded
order = np.argsort(similarities)[::-1]
order.shape

(144,)

In [64]:
df_relevant = df.iloc[order[:50]]
len(df_relevant)

50

In [65]:
def seconds_to_hours_minutes(seconds):
    hours, remainder = divmod(seconds, 3600)
    minutes, _ = divmod(remainder, 60)
    return f"{int(hours)}:{int(minutes)}"

transcripts = ""
for i in range(len(df_relevant)):
    transcripts += f"""
    <transcript>
        <start_time>{seconds_to_hours_minutes(df_relevant.iloc[i].start)}</start_time>
        <end_time>{seconds_to_hours_minutes(df_relevant.iloc[i].end)}</end_time>
        <text>{df_relevant.iloc[i].text}</text>
    </transcript>
    """

print(transcripts)


    <transcript>
        <start_time>1:11</start_time>
        <end_time>1:11</end_time>
        <text> Be safe to Magnussen by 8 seconds.</text>
    </transcript>
    
    <transcript>
        <start_time>0:33</start_time>
        <end_time>0:33</end_time>
        <text> Hit that tag too.</text>
    </transcript>
    
    <transcript>
        <start_time>0:12</start_time>
        <end_time>0:12</end_time>
        <text> Sainz 0.8 ahead, Magnussen 0.7 behind.</text>
    </transcript>
    
    <transcript>
        <start_time>0:55</start_time>
        <end_time>0:55</end_time>
        <text> Gasly ahead after 5.5 seconds, he's doing 39.9 on the medium tyre, that's 14 laps left.</text>
    </transcript>
    
    <transcript>
        <start_time>0:13</start_time>
        <end_time>0:13</end_time>
        <text> Pérez now car behind at 1.8 seconds.</text>
    </transcript>
    
    <transcript>
        <start_time>0:2</start_time>
        <end_time>0:2</end_time>
        <text> The other 

In [66]:

prompt = f"""Please provide a concise yet informative response to the following question:
{query}
Guidelines:

* Aim for a balanced answer - neither overly lengthy nor excessively brief
* Focus on directly addressing the question without extensive justification
* Consider the context provided by the relevant radio transcript excerpts below
* Be aware that the transcripts may contain typos or errors - use your judgment to interpret similar words or phrases
* Mention the time for the transcript if needed for the query. Convert the seconds to a human readable format

Relevant transcript excerpts:
{transcripts}
"""

In [67]:
response = model.generate_content(contents=[prompt], stream=True)
for x in response:
    print(x.text, end="")

The transcripts don't explicitly mention undercuts. 


# Testing smart query query generation

In [60]:
prompt = f"""
Act as if you are a Formula 1 Driver and is in constant communication with the team regarding the state of the race and your car. 
Create some sentences that you might say that are relevant to the following query: {query}.

Guidelines:
* Keep the messages short brief and to the point.
* Respond in the form for a python list of strings

"""

In [61]:
response = model.generate_content(contents=[prompt])
sentences = eval(response.text[response.text.index('['):response.text.rfind(']')+1])
sentences

['Box this lap, need to cover that undercut.',
 'Just got undercut by Bottas, need to push hard on this out lap.',
 "Hamilton pitted, I'm holding the lead for now,  but I think there's an undercut coming.",
 "They're trying to undercut me again, let's see if they can pull it off.",
 "We need to anticipate the next undercut, they're playing a dangerous game."]

In [62]:
sentences_embedded = np.array(genai.embed_content(
    model="models/text-embedding-004",
    content=sentences,
    task_type="retrieval_document",
)["embedding"])
sentences_embedded.shape

(5, 768)

In [63]:
similarities = (embeddings @ sentences_embedded.T).max(-1)
order = np.argsort(similarities)[::-1]
order.shape

(144,)