### Import necessary libraries

In [1]:
import os
import cv2
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from sentence_transformers import SentenceTransformer, util
from googletrans import Translator
from gtts import gTTS
import numpy as np

### Set up the image captioning environment


In [2]:
# Disable parallelism warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Initialize the processor and model fro image captioning
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

# pre-trained BLIP model from Salesforce for generating captions from images
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Initialize the semantic similarity model (pre-trained SentenceTransformer model)to measure the similarity between sentences
semantic_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

### Create caption for frame   
##### Convert a single frame to text using the model 

In [3]:
def frame_to_text(frame):
    # Convert the frame (an array) to an image object
    image = Image.fromarray(frame)
    
    # Process the image to prepare it for the model, returning tensors in PyTorch format
    inputs = processor(image, return_tensors="pt")
    
    # Use the model to generate a text description for the image, allowing up to 50 new tokens
    out = model.generate(**inputs, max_new_tokens=50)
    
    # Decode the generated output to a readable string, skipping special tokens
    return processor.decode(out[0], skip_special_tokens=True)




###  Filter out similar text descriptions

In [4]:
def get_unique_meanings(texts, threshold=0.75):
    
    # List to store unique texts
    unique_texts = []  
    
    for text in texts:
        # Convert the current text to a numerical format (embedding)
        text_embedding = semantic_model.encode(text, convert_to_tensor=True)
        #check if the text is unique
        is_unique = True  
        
        for unique_text in unique_texts:
            # Convert the existing unique text to a numerical format (embedding)
            unique_text_embedding = semantic_model.encode(unique_text, convert_to_tensor=True)
            # Calculate the similarity between the current text and the unique text
            similarity = util.pytorch_cos_sim(text_embedding, unique_text_embedding).item()
            # If similarity is above the threshold, mark as not unique
            
            if similarity > threshold:
                is_unique = False
                break
        # If the text is unique, add it to the list of unique texts
        if is_unique:
            unique_texts.append(text)
    
    # Return the list of unique texts
    return unique_texts  



### Processes a video to generate meaningful text descriptions based on repeated actions

In [5]:
def video_to_text(video_path, repeat_threshold=5):

    # Open the video file
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Could not open video.")
        return

    # Variable to store the current action detected
    current_action = None
    # count how many times the current action is repeated
    action_count = 0
    # List to store text descriptions of frames
    frame_texts = []

    while True:
        # Read a frame from the video
        ret, frame = cap.read()
        if not ret:
            break

        # Generate a text description for the frame
        text = frame_to_text(frame)

        # Check if the text is the same as the current action
        if text == current_action:
            action_count += 1
        else:
            # If the action count meets or exceeds the threshold, add it to frame_texts
            if action_count >= repeat_threshold and current_action is not None:
                frame_texts.append(f"'{current_action}'.")
            current_action = text
            action_count = 1

    # Check the last action after exiting the loop
    if action_count >= repeat_threshold and current_action is not None:
        frame_texts.append(f"'{current_action}'.")
        
    # Release the video capture object
    cap.release()

    # Filter out similar text descriptions to get unique meanings
    unique_frame_texts = get_unique_meanings(frame_texts)

    # Join the unique descriptions into a single string
    video_description = " ".join(unique_frame_texts)

    # Return the final video description
    return video_description

###  Translating text from English to Hausa and convert that translated text into speech

In [6]:
def text_to_speech(text, lang='en'):
    # Create a gTTS (Google Text-to-Speech) object with the text and language
    tts = gTTS(text=text, lang=lang)  
    # Save the generated speech to an MP3 file named 'output.mp3'
    tts.save("output.mp3")  
    # Play the audio file using the system's default audio player
    os.system("afplay output.mp3")

def translate_text(text, target_lang='ha'):
    # Create a Translator object from the googletrans library
    translator = Translator()  
    # Translate the text to the target language
    translation = translator.translate(text, dest=target_lang)
    # Return the translated text
    return translation.text  


### Test

In [7]:
video_path = "/Users/ronny/Downloads/Test_Videos/Football.mp4"

description = video_to_text(video_path)
print("Original Description:", description)

print()

# Translate the description to Hausa
translated_description = translate_text(description, target_lang='ha')
print("Translated Description:", translated_description)

# Convert the translated text to speech
text_to_speech(translated_description, lang='ha')

Original Description: 'a group of men playing soccer on a field'. 'a soccer player is kicking the ball'. 'a soccer player is trying to block the ball'.

Translated Description: 'rukunin maza suna wasa ƙwallon ƙafa a filin'.'Dan wasan ƙwallon ƙafa ya koma kwallon'.'Dan wasan ƙwallon ƙafa yana ƙoƙarin toshe kwallon..
