Pipeline 3. The Video Expert

copyright 2024, Denis Rothman

**Notebook summary:**

* Step 1: The Pinecone index
* Step 2: Defining the RAG functions
* Step 3: Querying the vector store for RAG
* Step 4: Retrieval Augmented Generation



# Installing the environment

## Importing modules and libraries

In [None]:
from IPython.display import HTML # to display videos
import base64 # to encode videos as base64
from base64 import b64encode # to encode videos as base64
import os # to interact with the operating system
import subprocess # to run commands
import time # to measure execution time
import csv # to save comments
import uuid # to generate unique ids
import cv2 # to split videos
from PIL import Image # to display videos
import pandas as pd # to display comments
import numpy as np # to use Numerical Python
from io import BytesIO #for a binary stream of data in memory

## GitHub

In [None]:
def download(directory, filename):
    """Download one file from the book's GitHub repository into the working directory.

    Args:
        directory: Path of the folder inside the repository (no leading or
            trailing slash), e.g. "Chapter10/videos".
        filename: Name of the file to fetch; it is also the local output name.

    The outcome is reported with a printed message; nothing is returned and
    no exception is propagated when curl fails or is not installed.

    If the GITHUB_TOKEN environment variable is set, it is sent as an
    Authorization header. No credential is stored in the notebook.
    """
    # The base URL of the image files in the GitHub repository
    base_url = 'https://raw.githubusercontent.com/Denis2054/RAG-Driven-Generative-AI/main/'

    # Complete URL for the file
    file_url = f"{base_url}{directory}/{filename}"

    # Build the command as an argument list (no shell), so a file or directory
    # name containing spaces or shell metacharacters cannot alter the command.
    # --fail makes curl exit non-zero on HTTP errors such as 404; without it a
    # missing file would be reported as a successful download.
    curl_command = ['curl', '--fail']

    # Optional token, read from the environment instead of being hard-coded
    token = os.environ.get('GITHUB_TOKEN')
    if token:
        curl_command += ['-H', f'Authorization: token {token}']

    curl_command += ['-o', filename, file_url]

    try:
        # Execute the curl command
        subprocess.run(curl_command, check=True)
        print(f"Downloaded '{filename}' successfully.")
    except (subprocess.CalledProcessError, FileNotFoundError):
        # FileNotFoundError: the curl executable itself is not available
        print(f"Failed to download '{filename}'. Check the URL, your internet connection and the file path")

## OpenAI

In [None]:
#You can retrieve your API key from a file(1)
# or enter it manually(2)
#Comment this cell if you want to enter your key manually.

#(1)Retrieve the API Key from a file
#Store your key in a file and read it (you can type it directly in the notebook but it will be visible to somebody next to you)
from google.colab import drive
drive.mount('/content/drive')
# The context manager closes the file even if reading fails.
# strip() removes the trailing newline that readline() keeps; a key ending
# with "\n" is not a valid credential value.
with open("drive/MyDrive/files/api_key.txt", "r") as f:
    API_KEY = f.readline().strip()

Mounted at /content/drive


In [None]:
try:
  import openai
except:
  #!pip install openai==0.28.0
  !pip install openai==1.33.0
  import openai

Collecting openai==1.33.0
  Downloading openai-1.33.0-py3-none-any.whl (325 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.5/325.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai==1.33.0)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai==1.33.0)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai==1.33.0)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, openai
Successfully insta

In [None]:
# (2) To enter your key manually, replace API_KEY below with your key.
# Publish the OpenAI key through the environment...
os.environ['OPENAI_API_KEY'] = API_KEY
# ...then read it back from the environment for the openai module.
openai.api_key = os.environ.get("OPENAI_API_KEY")

## Pinecone

In [None]:
!pip install pinecone-client==4.1.1

Collecting pinecone-client==4.1.1
  Downloading pinecone_client-4.1.1-py3-none-any.whl (216 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m216.2/216.2 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client==4.1.1)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-client
Successfully installed pinecone-client-4.1.1 pinecone-plugin-interface-0.0.7


In [None]:
import pinecone

In [None]:
# Read the Pinecone API key from the mounted drive.
# The context manager closes the file even if reading fails, and strip()
# removes the trailing newline that readline() keeps.
with open("drive/MyDrive/files/pinecone.txt", "r") as f:
    PINECONE_API_KEY = f.readline().strip()

# Step 1: The Pinecone index

In [None]:
import os
from pinecone import Pinecone, ServerlessSpec

# initialize connection to pinecone (get API key at app.pinecone.io)
# Use the PINECONE_API_KEY environment variable when it is set, otherwise
# fall back to the key read from the file in the previous cell.
# Previously `api_key` fell back to a placeholder string and was never used.
api_key = (os.environ.get('PINECONE_API_KEY') or PINECONE_API_KEY).strip()

pc = Pinecone(api_key=api_key)

In [None]:
from pinecone import ServerlessSpec

# Name of the Pinecone index queried by this notebook
index_name = 'videos-sports-5'

# Cloud provider and region: taken from the environment when set (and
# non-empty), otherwise the defaults below
cloud = os.getenv('PINECONE_CLOUD') or 'aws'
region = os.getenv('PINECONE_REGION') or 'us-east-1'

# Serverless deployment specification built from the two values above
spec = ServerlessSpec(region=region, cloud=cloud)

In [None]:
# connect to the index named by index_name through the Pinecone client `pc`
index = pc.Index(index_name)
# view index stats (last expression of the cell, so its value is displayed)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 823}},
 'total_vector_count': 823}

# Step 2: Defining the RAG functions

## Embedding

In [None]:
# Name of the OpenAI embedding model used as the default by get_embedding
embedding_model = "text-embedding-ada-002"

# Module-level OpenAI client reused by every embedding request
client = openai.OpenAI()

def get_embedding(text, model=embedding_model):
    """Return the embedding of `text` produced by the OpenAI embeddings endpoint.

    The input is converted to `str` and its newline characters are replaced
    by spaces before the request. A single input is sent, so the embedding is
    read from the first element of the response data.
    """
    cleaned_text = str(text).replace("\n", " ")
    return client.embeddings.create(input=[cleaned_text], model=model).data[0].embedding

## Query Pinecone

In [None]:
def query_pinecone(query_text,k):
  """Embed `query_text` and query the Pinecone index for its `k` top matches.

  The raw query response is returned; metadata is requested for each match.
  """
  vector = get_embedding(query_text, model=embedding_model)
  return index.query(vector=vector, top_k=k, include_metadata=True)

## Process result of query

In [None]:
def collect_query_results(query_results):
    """Flatten the matches of a query response into a list of plain dictionaries.

    Each dictionary holds the keys "ID", "Score", "Text", "Frame Number" and
    "File Name", in that order. A metadata field that is absent is replaced
    by a field-specific placeholder; a match without any metadata gets
    "No metadata available." for all three metadata fields.
    """
    # (output key, metadata key, placeholder used when the metadata key is absent)
    metadata_fields = (
        ('Text', 'text', "No text metadata available."),
        ('Frame Number', 'frame_number', "No frame number available."),
        ('File Name', 'file_name', "No file name available."),
    )

    collected = []
    for hit in query_results['matches']:
        entry = {"ID": hit['id'], "Score": hit['score']}

        if 'metadata' not in hit:
            for label, _, _ in metadata_fields:
                entry[label] = "No metadata available."
        else:
            details = hit['metadata']
            for label, key, placeholder in metadata_fields:
                entry[label] = details.get(key, placeholder)

        collected.append(entry)

    return collected

## Retrieval Augmented Generation

In [None]:
from openai import OpenAI

def get_openai_response(prompt):
    """Send `prompt` to gpt-4o with the notebook's analysis instructions.

    The system message asks the model to point out cognitive dissonances in
    the comment, rewrite it, and provide a label. The text content of the
    first choice is returned.
    """
    system_instruction = "You will be provided with comments of an image frame taken from a video. Analyze the text and 1. Point out the cognitive dissonances 2. Rewrite the comment in a logical engaging style. 3. Provide a label for this image such as Label: basketball, football, soccer or other label."

    conversation = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": prompt},
    ]

    # Sampling settings passed to the request: temperature 0, replies capped at 256 tokens
    sampling_options = dict(
        temperature=0,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    # A client is created for each call, then the chat completion is requested
    completion = OpenAI().chat.completions.create(
        model='gpt-4o',
        messages=conversation,
        **sampling_options
    )

    return completion.choices[0].message.content

## Download video

The download function is in the *GitHub* subsection of *Installing the environment* of this notebook.

It will be called by the Vector Store Administrator in the *Administrator-Pipeline 1* section of this notebook.

## Display video

In [None]:
def display_video(file_name):
  """Return an HTML object embedding the video file as a base64 MP4 player.

  Nothing is displayed by the function itself: the returned object is
  rendered when it is the last expression of a cell or is passed to
  display(). The whole file is read into memory and inlined in the HTML.
  """
  # Open the file in binary mode
  with open(file_name, 'rb') as file:
      video_data = file.read()

  # Encode the video file as base64
  video_url = b64encode(video_data).decode()

  # Create an HTML string with the embedded video
  html = f'''
  <video width="640" height="480" controls>
    <source src="data:video/mp4;base64,{video_url}" type="video/mp4">
  Your browser does not support the video tag.
  </video>
  '''
  # Return the HTML object (the former bare `HTML(html)` statement built an
  # object and discarded it, so it was removed)
  return HTML(html)

In [None]:
def display_video_frame(file_name, frame_number=0, size=(100, 110)):
    """Display one frame of a video as an inline JPEG image.

    Args:
        file_name: Path of the video file.
        frame_number: Index of the frame to show (passed to
            cv2.CAP_PROP_POS_FRAMES).
        size: (width, height) the frame is resized to.

    Returns:
        The HTML object that was displayed, or the string
        "Failed to grab frame" when the frame cannot be read.
    """
    # Local imports: a later cell of this notebook rebinds the global name
    # `Image` to IPython.display.Image, which has no fromarray().
    from PIL import Image as PILImage
    from IPython.display import display

    # Open the video file
    cap = cv2.VideoCapture(file_name)
    try:
        # Move to the frame_number
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)

        # Read the frame
        success, frame = cap.read()
    finally:
        # Always release the capture handle, including on failure
        cap.release()

    if not success:
        return "Failed to grab frame"

    # Convert the color from BGR to RGB
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Convert to PIL image and resize.
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
    img = PILImage.fromarray(frame)
    img = img.resize(size, PILImage.LANCZOS)

    # Convert the PIL image to a base64 string to embed in HTML
    buffered = BytesIO()
    img.save(buffered, format="JPEG")
    img_str = base64.b64encode(buffered.getvalue()).decode()

    # Create an HTML string with the embedded image
    html_str = f'''
    <img src="data:image/jpeg;base64,{img_str}" width="{size[0]}" height="{size[1]}">
    '''
    # Build the HTML object once, display it, and return it for further use
    html_obj = HTML(html_str)
    display(html_obj)
    return html_obj


## Download frame

The download function is in the *GitHub* subsection of *Installing the environment* of this notebook.

It will be called by the Vector Store Administrator in the *Administrator-Pipeline 1* section of this notebook.

## Display frame

In [None]:
import os
from IPython.display import Image, display

def display_frame(frame, directory='/content/', logical_size=1000):
    """Display an image file if it exists and is larger than a size threshold.

    Args:
        frame: File name of the image.
        directory: Folder that contains the file. Defaults to '/content/',
            the value that was previously hard-coded.
        logical_size: Size threshold in bytes. Only files strictly larger
            than this value are displayed. Defaults to 1000.

    The existence and size checks are reported with printed messages.
    Nothing is returned.
    """
    file_path = os.path.join(directory, frame)

    # Guard clause: nothing to show when the file is absent
    if not os.path.exists(file_path):
        print(f"File '{frame}' does not exist in the specified directory.")
        return

    file_size = os.path.getsize(file_path)
    print(f"File '{frame}' exists. Size: {file_size} bytes.")

    if file_size > logical_size:
        print("The file size is greater than the logical value.")
        display(Image(filename=file_path))
    else:
        print("The file size is less than or equal to the logical value.")

## Step 3: Querying the vector store

In [None]:
k=1 # number of results requested from the index (passed as top_k by query_pinecone)

In [None]:
# Alternative queries are kept commented out; leave exactly one query_text active.
#query_text = "Find a basketball player."
query_text = "Find a basketball player that is scoring with a dunk."
#query_text = "Find a female soccer player that is playing."
# Embed the active query and retrieve the k top matches from the index
query_results = query_pinecone(query_text,k)

In [None]:
# Flatten the raw query response into a list of dictionaries
collected_results = collect_query_results(query_results)

# Unpack and print every match. The unpacked names (text, frame, file_name,
# score) stay defined after the loop and are read by the following cells.
for result in collected_results:
  id, score, text = result['ID'], result['Score'], result['Text']
  frame, file_name = result['Frame Number'], result['File Name']
  print(
      f"ID={id}",
      f"score={score}",
      f"text={text}",
      f"frame_number={frame}",
      f"file_name={file_name}",
      sep="\n",
  )
  print()  # Add a newline for better readability between entries

ID=f104138b-0be8-4f4c-bf99-86d0eb34f7ee
score=0.866193652
text=In this image, there is a person who appears to be in the process of executing a dunk in basketball. The individual is airborne, with one arm extended upwards towards the basketball hoop, holding a basketball in hand, preparing to slam it through the hoop. The word "dunk" is superimposed on the image, confirming the action taking place. The background shows clear skies and a modern building, suggesting this might be an outdoor basketball court in an urban setting. The player is wearing athletic wear and a pair of basketball shoes, suitable for the sport. The dynamic posture and the context indicate an athletic and powerful movement, typical of a basketball dunk.
frame_number=191
file_name=basketball3.mp4



In [None]:
# process video
# Repository folder passed to download() for the video files
directory = "Chapter10/videos"
# Fetch the video named by the last printed match into the working directory
download(directory,file_name)
# Last expression of the cell: the HTML object returned by display_video is shown
display_video(file_name)

In [None]:
# process frame
# Frames are stored in a repository folder named after the video file name
# up to its first dot
file_name_root = file_name.split('.')[0]
# Build the frame file name in its own variable. `frame` keeps the frame
# number, so re-running this cell no longer produces "frame_frame_N.jpg.jpg".
frame_file = "frame_"+str(frame)+".jpg"
print(frame_file)
directory = "Chapter10/frames/"+file_name_root
print(directory)
download(directory,frame_file)
display_frame(frame_file)

## Step 4: Retrieval Augmented Generation

In [None]:
# The prompt is the text of the last match printed in Step 3
prompt=text

In [None]:
# Send the retrieved comment to the model and print its reply
response_content = get_openai_response(prompt)
print(response_content)

1. Cognitive Dissonances:
   - The comment mentions "clear skies" and a "modern building" in the background, which might be irrelevant to the main action of the dunk.
   - The word "dunk" being superimposed on the image is redundant since the description already clarifies the action.

2. Rewritten Comment:
   In this image, a basketball player is captured mid-air, executing a powerful dunk. With one arm extended towards the hoop and the basketball firmly in hand, the athlete is poised to slam the ball through the net. The player's athletic wear and basketball shoes highlight their readiness for the sport. The urban outdoor court setting adds to the dynamic and energetic atmosphere of the scene.

3. Label: Basketball


# Evaluator

## Installing the similarity score packages and defining the functions

Install the package(s) that fit your project.

In [None]:
!pip install spacy



In [None]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
from google.colab import userdata
# Do not leave the token as the last expression of the cell: the notebook
# would echo it into the saved output and leak the secret to every reader.
# Only report whether a value was retrieved.
hf_token_found = bool(userdata.get('HF_TOKEN'))
print(f"HF_TOKEN found in Colab secrets: {hf_token_found}")

'<HF_TOKEN redacted: revoke the token that was previously shown here>'

In [None]:
!pip install sentence-transformers==3.0.1

Collecting sentence-transformers==3.0.1
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers==3.0.1)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers==3.0.1)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers==3.0.1)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers==3.0.1)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (fro

In [None]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model shared by every similarity computation
model = SentenceTransformer('all-MiniLM-L6-v2')

def calculate_cosine_similarity_with_embeddings(text1, text2):
    """Return the cosine similarity of the sentence embeddings of two texts.

    Both texts are encoded with the module-level `model`, and the single
    value of the resulting 1x1 similarity matrix is returned.
    """
    # Local import: the notebook imports cosine_similarity only in a later
    # cell, so this function would otherwise depend on cell execution order.
    from sklearn.metrics.pairwise import cosine_similarity

    embeddings1 = model.encode(text1)
    embeddings2 = model.encode(text2)
    similarity = cosine_similarity([embeddings1], [embeddings2])
    return similarity[0][0]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
import spacy

# Loaded spaCy pipelines, keyed by model name, so a model is read only once
_SPACY_PIPELINES = {}

def _get_spacy_pipeline(model_name='en_core_web_md'):
    """Return the spaCy pipeline `model_name`, loading it on first use only."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]

def spacy_similarity(text1, text2):
    """Return the spaCy similarity score of two texts.

    Both texts are processed with the medium-sized English model
    'en_core_web_md'. The model is loaded on the first call and reused
    afterwards instead of being reloaded on every call.
    """
    nlp = _get_spacy_pipeline()

    # Process the texts
    doc1 = nlp(text1)
    doc2 = nlp(text2)

    # Calculate semantic similarity
    return doc1.similarity(doc2)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(text1, text2):
    """Return the cosine similarity of the TF-IDF vectors of two texts.

    A TfidfVectorizer is fitted on the two texts only; row 0 of the
    resulting matrix is compared with row 1.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    first_row, second_row = tfidf_matrix[0:1], tfidf_matrix[1:2]
    return cosine_similarity(first_row, second_row)[0][0]

In [None]:
# Compare the similarity functions defined above on one pair of texts:
# text1 is a long description, text2 a short one.
text1 = " In this image, a basketball player is captured mid-air, executing a powerful dunk. With one arm extended towards the hoop and the basketball firmly in hand, the athlete is poised to slam the ball through the net. The player's athletic wear and basketball shoes highlight their readiness for the sport. The urban outdoor court setting adds to the dynamic and energetic atmosphere of the scene."
text2 = "In this image, a basketball player is shown making a super cool dunk in mid-air."
# TF-IDF vectors (scikit-learn)
similarity_score1 = calculate_cosine_similarity(text1, text2)
print(f"Cosine Similarity Score with sklearn: {similarity_score1:.3f}")

# spaCy document similarity
similarity_score2 = spacy_similarity(text1, text2)
print(f"Semantic Similarity Score with spaCy: {similarity_score2:.3f}")

# Sentence-transformer embeddings
similarity_score3=calculate_cosine_similarity_with_embeddings(text1, text2)
print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")

# Hard-coded value labelled as human feedback; it is not computed from the texts
similarity_score4 = 0.75
print(f"Cosine Similarity Score with human feedback: {similarity_score4:.3f}")

Cosine Similarity Score with sklearn: 0.201
Semantic Similarity Score with spaCy: 0.862
Cosine Similarity Score with sentence transformer: 0.797
Cosine Similarity Score with human feedback: 0.750


In [None]:
def extract_rewritten_comment(response):
    """
    Extracts the rewritten comment from GPT-4o response.

    The comment starts after the "Rewritten Comment:" marker and ends at the
    first blank line that follows some comment text. Text placed on the
    marker line itself is kept, and blank lines directly after the marker
    are skipped, so both layouts below return "Some text.":

        2. Rewritten Comment: Some text.

        2. Rewritten Comment:

           Some text.

    Args:
        response: Full text of the model response.

    Returns:
        The comment lines, stripped and joined with single spaces, or an
        empty string when the marker is absent.
    """
    marker = "Rewritten Comment:"
    rewritten_comment = []
    rewrite_started = False
    for line in response.split('\n'):
        stripped = line.strip()
        if not rewrite_started:
            if marker in line:
                rewrite_started = True
                # Keep text that follows the marker on the same line;
                # markdown emphasis asterisks around it are dropped.
                inline_text = line.split(marker, 1)[1].strip().strip('*').strip()
                if inline_text:
                    rewritten_comment.append(inline_text)
            continue
        if stripped == "":
            if rewritten_comment:
                # A blank line after some text ends the comment
                break
            # Blank line directly after the marker: keep looking
            continue
        rewritten_comment.append(stripped)
    return " ".join(rewritten_comment)

## Examples

In [None]:
import numpy as np
import sys
# Empty list for the query scores; each example below appends its `score` to it
rscores =[]

# Empty list for the similarity scores; each example below appends its
# sentence-transformer similarity to it
scores=[]

## 1

In [None]:
query_text = "Find a female soccer player that is playing."
# Capture the output
import io
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
try:
  query_results = query_pinecone(query_text,1) # query, k
finally:
  # Restore stdout even when the query raises; otherwise every later print
  # of the notebook would go to the in-memory buffer.
  sys.stdout = old_stdout

# Call the function
collected_results = collect_query_results(query_results)

# Print results after calling the function
for result in collected_results:
  id= result['ID']
  score= result['Score']
  text= result['Text']
  frame= result['Frame Number']
  file_name= result['File Name']
  print(f"ID={id}")
  print(f"score={score}")
  print(f"text={text}")
  print(f"frame_number={frame}")
  print(f"file_name={file_name}")
  print()  # Add a newline for better readability between entries
# `text` is the text of the last match printed above
response_content = get_openai_response(text)
print(response_content)

ID=1c48c477-041e-4d4c-9374-4f8b76a580b8
score=0.868244231
text=It appears that there are a few people engaged in a casual game of soccer on a grass field. One individual is in possession of the ball, dribbling it with his foot while the others are around him, possibly playing defense or waiting for a pass. They appear to be dressed for athletic activity, suggesting this is a recreational game amongst friends or acquaintances. There is a superimposed text "female" which seems out of context and may have been added post-capture, it doesn't correspond to the activity displayed in the picture.
frame_number=91
file_name=soccer_pass.mp4

1. Cognitive Dissonances:
   - The superimposed text "female" does not align with the description of the activity, which is a casual game of soccer.
   - The text "female" is out of context and seems irrelevant to the described scene.

2. Rewritten Comment:
   "A group of people are playing a casual game of soccer on a grass field. One player is dribbling th

In [None]:
# process video
# Repository folder passed to download() for the video files
directory = "Chapter10/videos"
# Fetch the video named by the last printed match into the working directory
download(directory,file_name)
print("Displaying video: ",file_name)
# Last expression of the cell: the HTML object returned by display_video is shown
display_video(file_name)

In [None]:
# Human feedback flashcard comment
text1 = "This image shows soccer players on a field dribbling and passing the ball."

# Extract rewritten comment from the model response of this example
text2 = extract_rewritten_comment(response_content)

print(f"Human Feedback Comment: {text1}")
print(f"Rewritten Comment: {text2}")

# Similarity between the human comment and the rewritten comment
similarity_score3=calculate_cosine_similarity_with_embeddings(text1, text2)
print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")
# Record this example. NOTE: re-running the cell appends the values again.
scores.append(similarity_score3)
rscores.append(score)

Human Feedback Comment: This image shows soccer players on a field dribbling and passing the ball.
Rewritten Comment: "A group of people are playing a casual game of soccer on a grass field. One player is dribbling the ball while others are either defending or waiting for a pass. They are dressed in athletic attire, indicating this is a recreational game among friends or acquaintances. Interestingly, there is a superimposed text 'female' that seems unrelated to the activity shown in the image."
Cosine Similarity Score with sentence transformer: 0.614


## 2

In [None]:
query_text = "Find a basketball player scoring with a slam dunk."
# Capture the output
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
try:
  query_results = query_pinecone(query_text,1) # query, k
finally:
  # Restore stdout even when the query raises; otherwise every later print
  # of the notebook would go to the in-memory buffer.
  sys.stdout = old_stdout

# Call the function
collected_results = collect_query_results(query_results)

# Print results after calling the function
for result in collected_results:
  id= result['ID']
  score= result['Score']
  text= result['Text']
  frame= result['Frame Number']
  file_name= result['File Name']
  print(f"ID={id}")
  print(f"score={score}")
  print(f"text={text}")
  print(f"frame_number={frame}")
  print(f"file_name={file_name}")
  print()  # Add a newline for better readability between entries
# `text` is the text of the last match printed above
response_content = get_openai_response(text)
print(response_content)

ID=aa7c6c4f-62ff-4166-ba2e-6623a07cd645
score=0.880875826
text=In the image, there is a person performing a slam dunk in basketball. The person is airborne, near the peak of their jump, with one arm extended towards the basketball hoop to score. The word "slam" visible in the image suggests a focus on the slam dunk action. The background features clear skies and a tall building, indicating the setting is likely an outdoor basketball court.
frame_number=179
file_name=basketball3.mp4

1. Cognitive Dissonances:
   - The mention of "clear skies" and "tall building" might be unnecessary and distract from the main action of the slam dunk.
   - The word "slam" being visible in the image is an unusual detail that might not be relevant unless it is part of a graphic or text overlay.

2. Rewritten Comment:
   In the image, a person is performing an impressive slam dunk in basketball. They are airborne at the peak of their jump, with one arm extended towards the hoop, ready to score. The action i

In [None]:
# process video
# Repository folder passed to download() for the video files
directory = "Chapter10/videos"
# Fetch the video named by the last printed match into the working directory
download(directory,file_name)
# Last expression of the cell: the HTML object returned by display_video is shown
display_video(file_name)

In [None]:
# Human feedback flashcard comment
text1 = "This image shows an incredible dunk by a basketball player."

# Extract rewritten comment from the model response of this example
text2 = extract_rewritten_comment(response_content)

print(f"Human Feedback Comment: {text1}")
print(f"Rewritten Comment: {text2}")

# Similarity between the human comment and the rewritten comment
similarity_score3=calculate_cosine_similarity_with_embeddings(text1, text2)
print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")
# Record this example. NOTE: re-running the cell appends the values again.
scores.append(similarity_score3)
rscores.append(score)

Human Feedback Comment: This image shows an incredible dunk by a basketball player.
Rewritten Comment: In the image, a person is performing an impressive slam dunk in basketball. They are airborne at the peak of their jump, with one arm extended towards the hoop, ready to score. The action is dynamic and captures the excitement of the game, set against the backdrop of an outdoor basketball court.
Cosine Similarity Score with sentence transformer: 0.840


## 3

In [None]:
query_text = "Find a skiier skiing down a slope."
# Capture the output
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
try:
  query_results = query_pinecone(query_text,1) # query, k
finally:
  # Restore stdout even when the query raises; otherwise every later print
  # of the notebook would go to the in-memory buffer.
  sys.stdout = old_stdout

# Call the function
collected_results = collect_query_results(query_results)

# Print results after calling the function
for result in collected_results:
  id= result['ID']
  score= result['Score']
  text= result['Text']
  frame= result['Frame Number']
  file_name= result['File Name']
  print(f"ID={id}")
  print(f"score={score}")
  print(f"text={text}")
  print(f"frame_number={frame}")
  print(f"file_name={file_name}")
  print()  # Add a newline for better readability between entries
# `text` is the text of the last match printed above
response_content = get_openai_response(text)
print(response_content)

ID=91ce7635-8621-4d85-b38e-27171b6a43fe
score=0.899943471
text=In the image, there are four skiers going down a snowy slope. The trails left by skiers are visible in the snow, and the word "down" is overlayed in large letters, indicating the direction of movement. It appears to be a sunny day on the ski slope, and the skiers seem to be enjoying their descent.
frame_number=23
file_name=skiing1.mp4

1. Cognitive Dissonances:
   - The word "down" is mentioned as being overlayed in large letters, which is unusual for a natural scene and might be part of a video effect or text overlay.
   - The comment mentions the skiers "seem to be enjoying their descent," which is an assumption and not directly observable from the image.

2. Rewritten Comment:
   "In this image, four skiers are captured gliding down a snowy slope, leaving distinct trails behind them. The word 'down' is prominently displayed, emphasizing their downward movement. The bright, sunny weather adds to the vibrant atmosphere of 

In [None]:
# process video
# Repository folder passed to download() for the video files
directory = "Chapter10/videos"
# Fetch the video named by the last printed match into the working directory
download(directory,file_name)
# Last expression of the cell: the HTML object returned by display_video is shown
display_video(file_name)

In [None]:
# Human feedback flashcard comment
text1 = "This image shows people skiing down a slope on white snow."

# Extract rewritten comment from the model response of this example
text2 = extract_rewritten_comment(response_content)

print(f"Human Feedback Comment: {text1}")
print(f"Rewritten Comment: {text2}")

# Similarity between the human comment and the rewritten comment
similarity_score3=calculate_cosine_similarity_with_embeddings(text1, text2)
print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")
# Record this example. NOTE: re-running the cell appends the values again.
scores.append(similarity_score3)
rscores.append(score)

Human Feedback Comment: This image shows people skiing down a slope on white snow.
Rewritten Comment: "In this image, four skiers are captured gliding down a snowy slope, leaving distinct trails behind them. The word 'down' is prominently displayed, emphasizing their downward movement. The bright, sunny weather adds to the vibrant atmosphere of the ski slope, suggesting a pleasant experience for the skiers."
Cosine Similarity Score with sentence transformer: 0.721


## 4

In [None]:
query_text = "Show a hockey player on the ice rink."
# Capture the output
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
try:
  query_results = query_pinecone(query_text,1) # query, k
finally:
  # Restore stdout even when the query raises; otherwise every later print
  # of the notebook would go to the in-memory buffer.
  sys.stdout = old_stdout

# Call the function
collected_results = collect_query_results(query_results)

# Print results after calling the function
for result in collected_results:
  id= result['ID']
  score= result['Score']
  text= result['Text']
  frame= result['Frame Number']
  file_name= result['File Name']
  print(f"ID={id}")
  print(f"score={score}")
  print(f"text={text}")
  print(f"frame_number={frame}")
  print(f"file_name={file_name}")
  print()  # Add a newline for better readability between entries
# `text` is the text of the last match printed above
response_content = get_openai_response(text)
print(response_content)

ID=f72ae1d3-8811-41b4-b2c4-83f05f44d3de
score=0.8942855
text=This image shows a hockey player in action on the ice. The player is equipped with ice hockey gear, including a helmet, gloves, and a jersey, and is in the process of handling or passing the puck with their hockey stick. The word "puck" is superimposed on the image, likely indicating the object that the player is interacting with. The environment appears to be an indoor ice rink, and the motion in the image suggests a dynamic sporting moment.
frame_number=59
file_name=hockey1.mp4

1. Cognitive Dissonances:
   - The comment redundantly mentions that the player is equipped with ice hockey gear, which is already implied by the context of a hockey game.
   - The phrase "handling or passing the puck" is ambiguous and could be more specific.
   - The mention of the word "puck" being superimposed on the image is somewhat extraneous and could be integrated more smoothly.

2. Rewritten Comment:
   This image captures a dynamic moment 

In [None]:
# process video
# Repository folder passed to download() for the video files
directory = "Chapter10/videos"
# Fetch the video named by the last printed match into the working directory
download(directory,file_name)
# Last expression of the cell: the HTML object returned by display_video is shown
display_video(file_name)

In [None]:
# Human feedback flashcard comment
# NOTE(review): "hokey" looks like a typo for "hockey"; the string is an input
# of the similarity score, so correcting it will change the recorded value.
text1 = "This image shows a hokey player pushing the puck with his hockey stick."

# Extract rewritten comment from the model response of this example
text2 = extract_rewritten_comment(response_content)

print(f"Human Feedback Comment: {text1}")
print(f"Rewritten Comment: {text2}")

# Similarity between the human comment and the rewritten comment
similarity_score3=calculate_cosine_similarity_with_embeddings(text1, text2)
print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")
# Record this example. NOTE: re-running the cell appends the values again.
scores.append(similarity_score3)
rscores.append(score)

Human Feedback Comment: This image shows a hokey player pushing the puck with his hockey stick.
Rewritten Comment: This image captures a dynamic moment of a hockey player in action on the ice. Fully equipped with a helmet, gloves, and a jersey, the player is skillfully handling the puck with their hockey stick. The scene takes place in an indoor ice rink, highlighting the intensity and excitement of the sport.
Cosine Similarity Score with sentence transformer: 0.764


## 5

In [None]:
query_text = "Show somebody walking in a forest."
# Capture the output
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
try:
  query_results = query_pinecone(query_text,1) # query, k
finally:
  # Restore stdout even when the query raises; otherwise every later print
  # of the notebook would go to the in-memory buffer.
  sys.stdout = old_stdout

# Call the function
collected_results = collect_query_results(query_results)

# Print results after calling the function
for result in collected_results:
  id= result['ID']
  score= result['Score']
  text= result['Text']
  frame= result['Frame Number']
  file_name= result['File Name']
  print(f"ID={id}")
  print(f"score={score}")
  print(f"text={text}")
  print(f"frame_number={frame}")
  print(f"file_name={file_name}")
  print()  # Add a newline for better readability between entries
# `text` is the text of the last match printed above
response_content = get_openai_response(text)
print(response_content)

ID=2aebbe86-99e5-46e1-b81c-5aba77ccceb2
score=0.898504853
text=In this image, we see a person in a forested area. They are dressed in outdoor clothing, including a light-colored jacket and bright orange pants, and they appear to be walking or hiking through the woods. The person seems to be stepping over a log or uneven terrain while navigating through the natural environment, which is dense with trees and underbrush. There is a word "the" superimposed on the image, which looks like a part of some text that is not fully visible, maybe from a video caption or overlay.
frame_number=243
file_name=walking1.mp4

1. Cognitive Dissonances:
   - The comment mentions a word "the" superimposed on the image, which seems out of place and not directly related to the description of the person or the forested area.
   - The description of the person's activity (walking or hiking) is somewhat vague and could be more specific.

2. Rewritten Comment:
   "In this image, we see a person navigating through

In [None]:
# Download the clip named by the query result above and display it
directory = "Chapter10/videos"
download(directory, file_name)
display_video(file_name)

In [None]:
# Reference description written by a human reviewer for this clip
text1 = "This image shows a person walking through a forest with boots."

# Pull the rewritten comment out of the model reply held in response_content
text2 = extract_rewritten_comment(response_content)

print(f"Human Feedback Comment: {text1}", f"Rewritten Comment: {text2}", sep="\n")

# Score how close the rewritten comment is to the human one
similarity_score3 = calculate_cosine_similarity_with_embeddings(text1, text2)
print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")

# Record the similarity together with the retrieval score of the same query
scores.append(similarity_score3)
rscores.append(score)

Human Feedback Comment: This image shows a person walking through a forest with boots.
Rewritten Comment: "In this image, we see a person navigating through a dense forest. They are dressed in outdoor gear, including a light-colored jacket and bright orange pants, suggesting they are prepared for a hike. The individual is carefully stepping over a log, indicating the challenging terrain of the natural environment. There is a partially visible word 'the' superimposed on the image, likely part of a video caption or overlay."
Cosine Similarity Score with sentence transformer: 0.677


## 6

In [None]:
query_text = "Show a surfer."

# Silence whatever query_pinecone prints by diverting stdout to a buffer
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
try:
    query_results = query_pinecone(query_text, 1)  # query, k
finally:
    # Restore stdout even if the query raises; otherwise every later print
    # in the notebook would silently end up in the buffer.
    sys.stdout = old_stdout

# Turn the raw query response into result records
collected_results = collect_query_results(query_results)

# Print each record; the unpacked fields stay available to the cells below
# (`file_name` for the video cell, `score` for the feedback cell).
for result in collected_results:
    id = result['ID']
    score = result['Score']
    text = result['Text']
    frame = result['Frame Number']
    file_name = result['File Name']
    print(f"ID={id}")
    print(f"score={score}")
    print(f"text={text}")
    print(f"frame_number={frame}")
    print(f"file_name={file_name}")
    print()  # Add a newline for better readability between entries

# `text` is the comment of the last record printed above.
# NOTE(review): if no record came back, `text` keeps its value from an earlier cell.
response_content = get_openai_response(text)
print(response_content)

ID=c4f2d0ae-cecf-42ad-8872-e6efddde8560
score=0.881048203
text=In this image, a person is engaged in the sport of surfing. They are wearing a red sleeve, which is partially visible, and are lying prone on a surfboard while riding a wave. The perspective is from the water level, capturing the surfer and the wave at a moment where the surfer appears to be paddling or just starting to stand up. The specific moment caught in this photo gives a sense of motion and the dynamic nature of surfing. There is text superimposed on the image that reads "this," which doesn't provide information about the scene but might be part of a larger sentence or context that isn't visible in the photo. The ocean and the wave make up the background, adding to the overall surfing ambiance.
frame_number=23
file_name=surfer2.mp4

1. Cognitive Dissonances:
   - The comment mentions the person is "engaged in the sport of surfing" but also states they are "lying prone on a surfboard," which could imply they are paddl

In [None]:
# Download the clip named by the query result above and display it
directory = "Chapter10/videos"
download(directory, file_name)
display_video(file_name)

In [None]:
# Reference description written by a human reviewer for this clip
text1 = "This is a cool image of a surfer on a big wave."

# Pull the rewritten comment out of the model reply held in response_content
text2 = extract_rewritten_comment(response_content)

print(f"Human Feedback Comment: {text1}", f"Rewritten Comment: {text2}", sep="\n")

# Score how close the rewritten comment is to the human one
similarity_score3 = calculate_cosine_similarity_with_embeddings(text1, text2)
print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")

# Record the similarity together with the retrieval score of the same query
scores.append(similarity_score3)
rscores.append(score)

Human Feedback Comment: This is a cool image of a surfer on a big wave.
Rewritten Comment: "In this image, a surfer is captured in the midst of their sport, wearing a red sleeve that is partially visible. The surfer is lying prone on their surfboard, either paddling or preparing to stand up, as they ride a wave. The perspective from water level emphasizes the dynamic motion and excitement of surfing. The ocean and the wave form a vibrant backdrop, enhancing the surfing ambiance. There is text superimposed on the image that reads 'this,' which seems to be part of a larger, unseen context."
Cosine Similarity Score with sentence transformer: 0.675


## 7

In [None]:
query_text = "Show a person who is swimming."

# Silence whatever query_pinecone prints by diverting stdout to a buffer
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
try:
    query_results = query_pinecone(query_text, 1)  # query, k
finally:
    # Restore stdout even if the query raises; otherwise every later print
    # in the notebook would silently end up in the buffer.
    sys.stdout = old_stdout

# Turn the raw query response into result records
collected_results = collect_query_results(query_results)

# Print each record; the unpacked fields stay available to the cells below
# (`file_name` for the video cell, `score` for the feedback cell).
for result in collected_results:
    id = result['ID']
    score = result['Score']
    text = result['Text']
    frame = result['Frame Number']
    file_name = result['File Name']
    print(f"ID={id}")
    print(f"score={score}")
    print(f"text={text}")
    print(f"frame_number={frame}")
    print(f"file_name={file_name}")
    print()  # Add a newline for better readability between entries

# `text` is the comment of the last record printed above.
# NOTE(review): if no record came back, `text` keeps its value from an earlier cell.
response_content = get_openai_response(text)
print(response_content)

ID=9cc43b39-8fd1-4d28-9540-03c33cede053
score=0.914937079
text=The image shows a person swimming in a pool. They are wearing a white swim cap and pink goggles, and the water is visibly rippling around them as they move. The word "swimmer" is overlaid on the image, indicating that the person is engaged in the activity of swimming. It appears to be an action shot capturing the swimmer in motion.
frame_number=11
file_name=swimming1.mp4

1. Cognitive Dissonances:
   - The comment is generally consistent and does not contain significant cognitive dissonances. However, the phrase "indicating that the person is engaged in the activity of swimming" is redundant given the context.

2. Rewritten Comment:
   The image captures a swimmer in action, gliding through the pool with visible ripples in the water. The swimmer is wearing a white swim cap and pink goggles, emphasizing their focus and readiness. The word "swimmer" is overlaid on the image, reinforcing the dynamic nature of the scene.

3. La

In [None]:
# Download the clip named by the query result above and display it
directory = "Chapter10/videos"
download(directory, file_name)
display_video(file_name)

In [None]:
# Reference description written by a human reviewer for this clip
text1 = "This image shows a good swimmer swimming fast."

# Pull the rewritten comment out of the model reply held in response_content
text2 = extract_rewritten_comment(response_content)

print(f"Human Feedback Comment: {text1}", f"Rewritten Comment: {text2}", sep="\n")

# Score how close the rewritten comment is to the human one
similarity_score3 = calculate_cosine_similarity_with_embeddings(text1, text2)
print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")

# Record the similarity together with the retrieval score of the same query
scores.append(similarity_score3)
rscores.append(score)

Human Feedback Comment: This image shows a good swimmer swimming fast.
Rewritten Comment: The image captures a swimmer in action, gliding through the pool with visible ripples in the water. The swimmer is wearing a white swim cap and pink goggles, emphasizing their focus and readiness. The word "swimmer" is overlaid on the image, reinforcing the dynamic nature of the scene.
Cosine Similarity Score with sentence transformer: 0.715


## 8

In [None]:
query_text = "Show a person jogging."

# Silence whatever query_pinecone prints by diverting stdout to a buffer
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
try:
    query_results = query_pinecone(query_text, 1)  # query, k
finally:
    # Restore stdout even if the query raises; otherwise every later print
    # in the notebook would silently end up in the buffer.
    sys.stdout = old_stdout

# Turn the raw query response into result records
collected_results = collect_query_results(query_results)

# Print each record; the unpacked fields stay available to the cells below
# (`file_name` for the video cell, `score` for the feedback cell).
for result in collected_results:
    id = result['ID']
    score = result['Score']
    text = result['Text']
    frame = result['Frame Number']
    file_name = result['File Name']
    print(f"ID={id}")
    print(f"score={score}")
    print(f"text={text}")
    print(f"frame_number={frame}")
    print(f"file_name={file_name}")
    print()  # Add a newline for better readability between entries

# `text` is the comment of the last record printed above.
# NOTE(review): if no record came back, `text` keeps its value from an earlier cell.
response_content = get_openai_response(text)
print(response_content)

ID=bdae5e65-a90e-4c4f-98d8-e8f806836078
score=0.912609458
text=In the image, a person is shown in mid-stride, seemingly engaged in outdoor physical exercise, such as jogging or running. The image is blurry, so details are not sharply defined, but it conveys a sense of motion and activity.
frame_number=23
file_name=jogging2.mp4

1. Cognitive Dissonances:
   - The comment mentions that the person is "seemingly engaged in outdoor physical exercise," which implies uncertainty, yet it also states "such as jogging or running," which suggests a specific activity.
   - The image is described as blurry, which makes it difficult to definitively identify the activity, yet the comment still attempts to specify the type of exercise.

2. Rewritten Comment:
   "The image captures a person in mid-stride, engaged in what appears to be an outdoor physical activity. Although the image is blurry and lacks sharp details, it effectively conveys a sense of motion and energy, suggesting the person might be jo

In [None]:
# Download the clip named by the query result above and display it
directory = "Chapter10/videos"
download(directory, file_name)
display_video(file_name)

In [None]:
# Reference description written by a human reviewer for this clip
text1 = "This image shows more than one person running and jogging."

# Pull the rewritten comment out of the model reply held in response_content
text2 = extract_rewritten_comment(response_content)

print(f"Human Feedback Comment: {text1}", f"Rewritten Comment: {text2}", sep="\n")

# Score how close the rewritten comment is to the human one
similarity_score3 = calculate_cosine_similarity_with_embeddings(text1, text2)
print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")

# Record the similarity together with the retrieval score of the same query
scores.append(similarity_score3)
rscores.append(score)

Human Feedback Comment: This image shows more than one person running and jogging.
Rewritten Comment: "The image captures a person in mid-stride, engaged in what appears to be an outdoor physical activity. Although the image is blurry and lacks sharp details, it effectively conveys a sense of motion and energy, suggesting the person might be jogging or running."
Cosine Similarity Score with sentence transformer: 0.707


## 9

In [None]:
query_text = "Show a person with a blue helmet climbing the rock of a mountain."

# Silence whatever query_pinecone prints by diverting stdout to a buffer
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
try:
    query_results = query_pinecone(query_text, 1)  # query, k
finally:
    # Restore stdout even if the query raises; otherwise every later print
    # in the notebook would silently end up in the buffer.
    sys.stdout = old_stdout

# Turn the raw query response into result records
collected_results = collect_query_results(query_results)

# Print each record; the unpacked fields stay available to the cells below
# (`file_name` for the video cell, `score` for the feedback cell).
for result in collected_results:
    id = result['ID']
    score = result['Score']
    text = result['Text']
    frame = result['Frame Number']
    file_name = result['File Name']
    print(f"ID={id}")
    print(f"score={score}")
    print(f"text={text}")
    print(f"frame_number={frame}")
    print(f"file_name={file_name}")
    print()  # Add a newline for better readability between entries

# `text` is the comment of the last record printed above.
# NOTE(review): if no record came back, `text` keeps its value from an earlier cell.
response_content = get_openai_response(text)
print(response_content)

ID=047b9bb7-3ad6-4008-96d0-7bc4f894d882
score=0.88254416
text=The image shows a person in a forested area stepping over a gap between rocks or fallen trees. The person is wearing a white hat, light-colored jacket, and bright orange pants, with hiking boots, which suggests they may be out for a hike or walk in nature. They appear to be carefully crossing an uneven and potentially slippery section of the terrain, using their arms for balance. The forest setting seems lush and green, indicating that it could be during a season when the vegetation is thriving, possibly spring or summer.
frame_number=295
file_name=walking1.mp4

1. Cognitive Dissonances:
   - The comment mentions both "rocks" and "fallen trees" as the gap the person is stepping over, which could be confusing.
   - The description of the person's attire is detailed but could be more concise.
   - The season is speculated to be either spring or summer, which is somewhat redundant.

2. Rewritten Comment:
   The image captures a

In [None]:
# Download the clip named by the query result above and display it
directory = "Chapter10/videos"
download(directory, file_name)
display_video(file_name)

In [None]:
# Reference description written by a human reviewer.
# NOTE(review): this text describes soccer players, while the query of this
# section asks for a climber with a blue helmet -- confirm the mismatch is
# intentional before relying on the resulting score.
text1 = "This image shows soccer players on a field dribbling and passing the ball."

# Pull the rewritten comment out of the model reply held in response_content
text2 = extract_rewritten_comment(response_content)

print(f"Human Feedback Comment: {text1}", f"Rewritten Comment: {text2}", sep="\n")

# Score how close the rewritten comment is to the human one
similarity_score3 = calculate_cosine_similarity_with_embeddings(text1, text2)
print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")

# Record the similarity together with the retrieval score of the same query
scores.append(similarity_score3)
rscores.append(score)

Human Feedback Comment: This image shows soccer players on a field dribbling and passing the ball.
Rewritten Comment: The image captures a person navigating a forested area, carefully stepping over a gap between rocks or fallen trees. Dressed in a white hat, light-colored jacket, and bright orange pants, they use their arms for balance while wearing sturdy hiking boots. The lush, green surroundings suggest the scene takes place in a thriving season, likely spring or summer.
Cosine Similarity Score with sentence transformer: 0.294


## 10

In [None]:
import sys

# NOTE(review): "keepeer" is misspelled; the query is left untouched because it
# drives the retrieval result recorded in this cell's output.
query_text = "Show a ball passing a goal keepeer."

# Silence whatever query_pinecone prints by diverting stdout to a buffer
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
try:
    query_results = query_pinecone(query_text, 1)  # query, k
finally:
    # Restore stdout even if the query raises; otherwise every later print
    # in the notebook would silently end up in the buffer.
    sys.stdout = old_stdout

# Turn the raw query response into result records
collected_results = collect_query_results(query_results)

# Print each record; the unpacked fields stay available to the cells below
# (`file_name` for the video cell, `score` for the feedback cell).
for result in collected_results:
    id = result['ID']
    score = result['Score']
    text = result['Text']
    frame = result['Frame Number']
    file_name = result['File Name']
    print(f"ID={id}")
    print(f"score={score}")
    print(f"text={text}")
    print(f"frame_number={frame}")
    print(f"file_name={file_name}")
    print()  # Add a newline for better readability between entries

# `text` is the comment of the last record printed above.
# NOTE(review): if no record came back, `text` keeps its value from an earlier cell.
response_content = get_openai_response(text)
print(response_content)

ID=904c9430-ef36-4573-89cf-986b9721f9b8
score=0.874724627
text=In this image, there is a classic black and white soccer ball resting against the netting of a goal. The word "goal" is prominently displayed across the middle of the image, suggesting that a goal has been scored in a game of soccer (also known as football in many countries around the world). The image captures the common scene that follows the ball crossing the goal line, which is an event celebrated by the scoring team in the sport.
frame_number=99
file_name=female_player_after_scoring.mp4

1. Cognitive Dissonances:
   - The comment redundantly explains that soccer is also known as football in many countries, which is common knowledge for most readers.
   - The phrase "the common scene that follows the ball crossing the goal line" is somewhat redundant and could be more concisely stated.

2. Rewritten Comment:
   This image captures a classic moment in soccer: a black and white ball nestled against the net, signifying a g

In [None]:
# Download the clip named by the query result above and display it
directory = "Chapter10/videos"
download(directory, file_name)
display_video(file_name)

In [None]:
# Reference description written by a human reviewer for this clip
text1 = "This image shows a team of female soccer players scoring a goal."

# Pull the rewritten comment out of the model reply held in response_content
text2 = extract_rewritten_comment(response_content)

print(f"Human Feedback Comment: {text1}", f"Rewritten Comment: {text2}", sep="\n")

# Score how close the rewritten comment is to the human one
similarity_score3 = calculate_cosine_similarity_with_embeddings(text1, text2)
print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")

# Record the similarity together with the retrieval score of the same query
scores.append(similarity_score3)
rscores.append(score)

Human Feedback Comment: This image shows a team of female soccer players scoring a goal.
Rewritten Comment: This image captures a classic moment in soccer: a black and white ball nestled against the net, signifying a goal. The word "goal" prominently displayed across the image highlights the excitement of scoring, a moment celebrated by players and fans alike.
Cosine Similarity Score with sentence transformer: 0.653


## Metrics

In [None]:
# Show how many values were collected in each list, followed by the values:
# first the similarity scores, then the retrieval scores.
for collected in (scores, rscores):
    print(len(collected), collected)

10 [0.6135231, 0.8399164, 0.7212182, 0.7638879, 0.6767974, 0.6750457, 0.7148576, 0.7072991, 0.29382104, 0.6525482]
10 [0.868244231, 0.880875826, 0.899943471, 0.8942855, 0.898504853, 0.881048203, 0.914937079, 0.912609458, 0.88254416, 0.874724627]


In [None]:
# Descriptive statistics of the similarity scores.
# np.std and np.var are called with NumPy's defaults (ddof=0).

# Central tendency
mean_score = np.mean(scores)
median_score = np.median(scores)

# Dispersion
std_deviation = np.std(scores)
variance = np.var(scores)

# Extremes and their spread
min_score = np.min(scores)
max_score = np.max(scores)
range_score = max_score - min_score

# Quartiles and the interquartile range
percentile_25 = np.percentile(scores, 25)
percentile_75 = np.percentile(scores, 75)
iqr = percentile_75 - percentile_25

# Report every metric with 2 decimals, one per line
metric_report = (
    ("Mean", mean_score),
    ("Median", median_score),
    ("Standard Deviation", std_deviation),
    ("Variance", variance),
    ("Minimum", min_score),
    ("Maximum", max_score),
    ("Range", range_score),
    ("25th Percentile (Q1)", percentile_25),
    ("75th Percentile (Q3)", percentile_75),
    ("Interquartile Range (IQR)", iqr),
)
for metric_label, metric_value in metric_report:
    print(f"{metric_label}: {metric_value:.2f}")

Mean: 0.67
Median: 0.69
Standard Deviation: 0.14
Variance: 0.02
Minimum: 0.29
Maximum: 0.84
Range: 0.55
25th Percentile (Q1): 0.66
75th Percentile (Q3): 0.72
Interquartile Range (IQR): 0.06


In [None]:
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Convert to NumPy arrays under NEW names. Rebinding `scores` / `rscores`
# themselves would turn the notebook-level lists into ndarrays, and any
# earlier `scores.append(...)` cell re-run afterwards would then fail,
# because ndarrays have no `append` method.
scores_array = np.asarray(scores)
rscores_array = np.asarray(rscores)

# Ensure both arrays have the same length
assert len(scores_array) == len(rscores_array), "Length of scores and rscores must be equal"

# Define threshold: a score strictly above it counts as a positive label
threshold = 0.6

# Generate binary labels based on the threshold:
# retrieval scores give the "true" labels, similarity scores the predictions
true_labels = (rscores_array > threshold).astype(int)
predicted_labels = (scores_array > threshold).astype(int)

# Calculate metrics
f1 = f1_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
accuracy = accuracy_score(true_labels, predicted_labels)

# Print the metrics with 2 decimals
print(f"F1 Score: {f1:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"Accuracy: {accuracy:.2f}")

F1 Score: 0.95
Precision: 1.00
Recall: 0.90
Accuracy: 0.90
