In [None]:
!pip install google-cloud-videointelligence

Collecting google-cloud-videointelligence
  Downloading google_cloud_videointelligence-2.15.0-py2.py3-none-any.whl.metadata (5.8 kB)
Downloading google_cloud_videointelligence-2.15.0-py2.py3-none-any.whl (269 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/269.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m266.2/269.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m269.8/269.8 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-cloud-videointelligence
Successfully installed google-cloud-videointelligence-2.15.0


In [5]:
from datetime import timedelta
from typing import Optional, Sequence, cast

from google.cloud import videointelligence_v1 as vi

import os

In [6]:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = '/content/perfect-eon-449007-i8-8d3c769375fd.json'

In [7]:
def detect_labels( video_uri: str,mode: vi.LabelDetectionMode,segments: Optional[Sequence[vi.VideoSegment]] = None,) -> vi.VideoAnnotationResults:
    video_client = vi.VideoIntelligenceServiceClient()
    features = [vi.Feature.LABEL_DETECTION]
    config = vi.LabelDetectionConfig(label_detection_mode=mode)
    context = vi.VideoContext(segments=segments, label_detection_config=config)
    request = vi.AnnotateVideoRequest(
        input_uri=video_uri,
        features=features,
        video_context=context,
    )

    print(f'Processing video "{video_uri}"...')
    operation = video_client.annotate_video(request)

    # Wait for operation to complete
    response = cast(vi.AnnotateVideoResponse, operation.result())
    # A single video is processed
    results = response.annotation_results[0]

    return results

video_uri = "gs://cloud-samples-data/video/JaneGoodall.mp4"
mode = vi.LabelDetectionMode.SHOT_MODE
segment = vi.VideoSegment(
    start_time_offset=timedelta(seconds=0),
    end_time_offset=timedelta(seconds=37),
)

results = detect_labels(video_uri, mode, [segment])


Processing video "gs://cloud-samples-data/video/JaneGoodall.mp4"...


In [8]:
def print_video_labels(results: vi.VideoAnnotationResults):
    labels = sorted_by_first_segment_confidence(results.segment_label_annotations)

    print(f" Video labels: {len(labels)} ".center(80, "-"))
    for label in labels:
        categories = category_entities_to_str(label.category_entities)
        for segment in label.segments:
            confidence = segment.confidence
            t1 = segment.segment.start_time_offset.total_seconds()
            t2 = segment.segment.end_time_offset.total_seconds()
            print(
                f"{confidence:4.0%}",
                f"{t1:7.3f}",
                f"{t2:7.3f}",
                f"{label.entity.description}{categories}",
                sep=" | ",
            )

In [9]:
def sorted_by_first_segment_confidence(
    labels: Sequence[vi.LabelAnnotation],
) -> Sequence[vi.LabelAnnotation]:
    def first_segment_confidence(label: vi.LabelAnnotation) -> float:
        return label.segments[0].confidence

    return sorted(labels, key=first_segment_confidence, reverse=True)

In [10]:
def category_entities_to_str(category_entities: Sequence[vi.Entity]) -> str:
    if not category_entities:
        return ""
    entities = ", ".join([e.description for e in category_entities])
    return f" ({entities})"

In [11]:
print_video_labels(results)

------------------------------- Video labels: 10 -------------------------------
 96% |   0.000 |  36.960 | nature
 74% |   0.000 |  36.960 | vegetation
 59% |   0.000 |  36.960 | tree (plant)
 56% |   0.000 |  36.960 | forest (geographical feature)
 49% |   0.000 |  36.960 | leaf (plant)
 43% |   0.000 |  36.960 | flora (plant)
 38% |   0.000 |  36.960 | nature reserve (geographical feature)
 37% |   0.000 |  36.960 | woodland (forest)
 35% |   0.000 |  36.960 | water resources (water)
 32% |   0.000 |  36.960 | sunlight (light)


In [12]:
def get_shot_labels(results: vi.VideoAnnotationResults):
    labels = sorted_by_first_segment_start_and_confidence(
        results.shot_label_annotations
    )

    print(f" Shot labels: {len(labels)} ".center(80, "-"))
    video_labels = []
    for label in labels:
        categories = category_entities_to_str(label.category_entities)
        label_description = label.entity.description
        print(f"{label.entity.description}{categories}")
        for segment in label.segments:
            confidence = segment.confidence
            t1 = segment.segment.start_time_offset.total_seconds()
            t2 = segment.segment.end_time_offset.total_seconds()
            print(f"{confidence:4.0%} | {t1:7.3f} | {t2:7.3f}")
        if label_description not in video_labels:
            video_labels.append(label_description)
    return video_labels

In [13]:
def sorted_by_first_segment_start_and_confidence(
    labels: Sequence[vi.LabelAnnotation],
) -> Sequence[vi.LabelAnnotation]:
    def first_segment_start_and_confidence(label: vi.LabelAnnotation):
        first_segment = label.segments[0]
        ms = first_segment.segment.start_time_offset.total_seconds()
        return (ms, -first_segment.confidence)

    return sorted(labels, key=first_segment_start_and_confidence)

In [14]:
video_labels = get_shot_labels(results)

------------------------------- Shot labels: 29 --------------------------------
planet (astronomical object)
 83% |   0.000 |  12.880
earth (planet)
 53% |   0.000 |  12.880
water resources (water)
 43% |   0.000 |  12.880
aerial photography (photography)
 43% |   0.000 |  12.880
vegetation
 32% |   0.000 |  12.880
 92% |  12.920 |  21.680
 83% |  21.720 |  27.880
 77% |  27.920 |  31.800
 76% |  31.840 |  34.720
nature
 96% |  12.920 |  21.680
 96% |  21.720 |  27.880
 96% |  27.920 |  31.800
 96% |  31.840 |  34.720
 49% |  34.760 |  36.960
leaf (plant)
 75% |  12.920 |  21.680
 37% |  21.720 |  27.880
sunlight (light)
 60% |  12.920 |  21.680
 46% |  27.920 |  31.800
flora (plant)
 57% |  12.920 |  21.680
moisture (water)
 38% |  12.920 |  21.680
tree (plant)
 98% |  27.920 |  31.800
forest (geographical feature)
 90% |  27.920 |  31.800
 37% |  31.840 |  34.720
grove (tree)
 77% |  27.920 |  31.800
woodland (forest)
 76% |  27.920 |  31.800
ecosystem (geographical feature)
 58% | 

In [15]:
video_labels

['planet',
 'earth',
 'water resources',
 'aerial photography',
 'vegetation',
 'nature',
 'leaf',
 'sunlight',
 'flora',
 'moisture',
 'tree',
 'forest',
 'grove',
 'woodland',
 'ecosystem',
 'old growth forest',
 'temperate broadleaf and mixed forest',
 'nature reserve',
 'jungle',
 'stream',
 'moss',
 'rainforest',
 'insect',
 'moths and butterflies',
 'butterfly',
 'monarch butterfly',
 'animal',
 'invertebrate',
 'pollinator']

# GCP Video Transcribe

In [16]:
from google.cloud import videointelligence

In [17]:
video_client = videointelligence.VideoIntelligenceServiceClient()
features = [videointelligence.Feature.SPEECH_TRANSCRIPTION]

config = videointelligence.SpeechTranscriptionConfig(
    language_code="en-US", enable_automatic_punctuation=True
)
video_context = videointelligence.VideoContext(speech_transcription_config=config)

In [18]:
operation = video_client.annotate_video(
    request={
        "features": features,
        "input_uri": "gs://cloud-samples-data/video/JaneGoodall.mp4",
        "video_context": video_context,
    }
)
print("\nProcessing video for speech transcription.")


Processing video for speech transcription.


In [19]:
result = operation.result(timeout=600)

# There is only one annotation_result since only
# one video is processed.
annotation_results = result.annotation_results[0]
for speech_transcription in annotation_results.speech_transcriptions:

    # The number of alternatives for each transcription is limited by
    # SpeechTranscriptionConfig.max_alternatives.
    # Each alternative is a different possible transcription
    # and has its own confidence score.
    for alternative in speech_transcription.alternatives:
        print("Alternative level information:")

        print("Transcript: {}".format(alternative.transcript))
        print("Confidence: {}\n".format(alternative.confidence))


        print("Word level information:")
        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            print(
                "\t{}s - {}s: {}".format(
                    start_time.seconds + start_time.microseconds * 1e-6,
                    end_time.seconds + end_time.microseconds * 1e-6,
                    word,
                )
            )

Alternative level information:
Transcript: 
Confidence: 0.0

Word level information:
Alternative level information:
Transcript: 
Confidence: 0.0

Word level information:
Alternative level information:
Transcript: 
Confidence: 0.0

Word level information:
Alternative level information:
Transcript: 
Confidence: 0.0

Word level information:
Alternative level information:
Transcript: I remember, I was struck by the harmony of color and the forest shades of yellow and green, beeping to the Browns and purples. And the way the vines curled up through the trees, clinging to twigs and branches,
Confidence: 0.8694230914115906

Word level information:
	28.8s - 28.9s: I
	28.9s - 29.4s: remember,
	29.4s - 29.5s: I
	29.5s - 29.7s: was
	29.7s - 30.2s: struck
	30.2s - 30.3s: by
	30.3s - 30.4s: the
	30.4s - 30.9s: harmony
	30.9s - 31.1s: of
	31.1s - 31.5s: color
	31.5s - 31.6s: and
	31.6s - 31.7s: the
	31.7s - 32.5s: forest
	33.1s - 33.6s: shades
	33.6s - 33.7s: of
	33.7s - 34.1s: yellow
	34.1s - 34.3s

In [20]:
print(result)

annotation_results {
  input_uri: "/cloud-samples-data/video/JaneGoodall.mp4"
  segment {
    start_time_offset {
    }
    end_time_offset {
      seconds: 162
      nanos: 539682000
    }
  }
  speech_transcriptions {
    alternatives {
    }
    language_code: "en-us"
  }
  speech_transcriptions {
    alternatives {
    }
    language_code: "en-us"
  }
  speech_transcriptions {
    alternatives {
    }
    language_code: "en-us"
  }
  speech_transcriptions {
    alternatives {
    }
    language_code: "en-us"
  }
  speech_transcriptions {
    alternatives {
      transcript: "I remember, I was struck by the harmony of color and the forest shades of yellow and green, beeping to the Browns and purples. And the way the vines curled up through the trees, clinging to twigs and branches,"
      confidence: 0.869423091
      words {
        start_time {
          seconds: 28
          nanos: 800000000
        }
        end_time {
          seconds: 28
          nanos: 900000000
        }
 

In [21]:
video_client = vi.VideoIntelligenceServiceClient()
features = [vi.Feature.SPEECH_TRANSCRIPTION]

segments = vi.VideoSegment(
    start_time_offset=timedelta(seconds=0),
    end_time_offset=timedelta(seconds=37),
)
config = vi.SpeechTranscriptionConfig(
    language_code="en-US", enable_automatic_punctuation=True
)
video_context = vi.VideoContext(segments=[segments], speech_transcription_config=config)

operation = video_client.annotate_video(
    request={
        "features": features,
        "input_uri": "gs://cloud-samples-data/video/JaneGoodall.mp4",
        "video_context": video_context,
    }
)

print("\nProcessing video for speech transcription.")

result = operation.result(timeout=600)

# There is only one annotation_result since only
# one video is processed.
annotation_results = result.annotation_results[0]

# Initialize variables to track the alternative with the highest confidence
highest_confidence = 0
best_transcription = None

for speech_transcription in annotation_results.speech_transcriptions:
    # The number of alternatives for each transcription is limited by
    # SpeechTranscriptionConfig.max_alternatives.
    # Each alternative is a different possible transcription
    # and has its own confidence score.
    for alternative in speech_transcription.alternatives:
        print("Alternative level information:")

        print("Transcript: {}".format(alternative.transcript))
        print("Confidence: {}\n".format(alternative.confidence))

        print("Word level information:")
        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            print(
                "\t{}s - {}s: {}".format(
                    start_time.seconds + start_time.microseconds * 1e-6,
                    end_time.seconds + end_time.microseconds * 1e-6,
                    word,
                )
            )
        if alternative.confidence > highest_confidence:
            highest_confidence = alternative.confidence
            best_transcription = alternative.transcript


Processing video for speech transcription.
Alternative level information:
Transcript: 
Confidence: 0.0

Word level information:
Alternative level information:
Transcript: 
Confidence: 0.0

Word level information:
Alternative level information:
Transcript: 
Confidence: 0.0

Word level information:
Alternative level information:
Transcript: 
Confidence: 0.0

Word level information:
Alternative level information:
Transcript: remember I was struck by the harmony of color in the forest shades of yellow and green deepening to the Browns and per
Confidence: 0.7856012582778931

Word level information:
	28.9s - 29.4s: remember
	29.4s - 29.5s: I
	29.5s - 29.7s: was
	29.7s - 30.2s: struck
	30.2s - 30.3s: by
	30.3s - 30.5s: the
	30.5s - 30.9s: harmony
	30.9s - 31.1s: of
	31.1s - 31.5s: color
	31.5s - 31.6s: in
	31.6s - 31.7s: the
	31.7s - 32.6s: forest
	33.1s - 33.6s: shades
	33.6s - 33.7s: of
	33.7s - 34.1s: yellow
	34.1s - 34.3s: and
	34.3s - 34.9s: green
	35.1s - 35.7s: deepening
	35.7s - 35.8

In [None]:
best_transcription

'remember I was struck by the harmony of color in the forest shades of yellow and green deepening to the Browns and per'

# Embedding label and text transcribe

In [22]:
!pip install -U -q "google-generativeai>=0.8.3"

In [23]:
import google.generativeai as genai
from google.colab import userdata

In [24]:
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [25]:
import numpy as np
def get_embeddings(text: list[str]) -> np.ndarray:
  embeddings = genai.embed_content(model='models/text-embedding-004',
                               content=text,
                               task_type='semantic_similarity')
  embds = embeddings.get('embedding', None)
  embds = np.array(embds).reshape(len(embds), -1)
  return embds

In [26]:
video_label_embedding = get_embeddings(' '.join(video_labels))
transcription_embedding = get_embeddings(best_transcription)

# Creating the embedding database with ChromaDB

In [27]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.7-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.10.0-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.29.0-py3

In [28]:
from chromadb import Documents, EmbeddingFunction, Embeddings
from google.api_core import retry

In [29]:
class GeminiEmbeddingFunction(EmbeddingFunction):
    # Specify whether to generate embeddings for videos, or queries
    video_mode = True

    def __call__(self, input: Documents) -> Embeddings:
        if self.video_mode_mode:
            embedding_task = "retrieval_document"
        else:
            embedding_task = "retrieval_query"

        retry_policy = {"retry": retry.Retry(predicate=retry.if_transient_error)}

        response = genai.embed_content(
            model="models/text-embedding-004",
            content=input,
            task_type=embedding_task,
            request_options=retry_policy,
        )
        return response["embedding"]

In [30]:
import chromadb

DB_NAME = "googlecardb"
embed_fn = GeminiEmbeddingFunction()
embed_fn.video_mode_mode = True

chroma_client = chromadb.Client()
db = chroma_client.get_or_create_collection(name=DB_NAME, embedding_function=embed_fn)
video_doc = " ".join(video_labels) + " " + best_transcription
documents = [video_doc]

db.add(documents=documents, ids=[str(i) for i in range(len(documents))])
video_doc

'planet earth water resources aerial photography vegetation nature leaf sunlight flora moisture tree forest grove woodland ecosystem old growth forest temperate broadleaf and mixed forest nature reserve jungle stream moss rainforest insect moths and butterflies butterfly monarch butterfly animal invertebrate pollinator remember I was struck by the harmony of color in the forest shades of yellow and green deepening to the Browns and per'

In [31]:
# Confirm that the data was inserted by looking at the database.
db.count()

1

# Retrieval: Find relevant documents

In [32]:
# Switch to query mode when generating embeddings.
embed_fn.video_mode = False

# Search the Chroma DB using the specified query- based on the user interests
query = "Nature"

result = db.query(query_texts=[query], n_results=1)
[[passage]] = result["documents"]

result

{'ids': [['0']],
 'embeddings': None,
 'documents': [['planet earth water resources aerial photography vegetation nature leaf sunlight flora moisture tree forest grove woodland ecosystem old growth forest temperate broadleaf and mixed forest nature reserve jungle stream moss rainforest insect moths and butterflies butterfly monarch butterfly animal invertebrate pollinator remember I was struck by the harmony of color in the forest shades of yellow and green deepening to the Browns and per']],
 'uris': None,
 'data': None,
 'metadatas': [[None]],
 'distances': [[0.663965106010437]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [None]:
# Use id to retrieve the actual video