In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install torch torchvision transformers opencv-python pandas tqdm

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [3]:
import os
import cv2
import pandas as pd
import torch
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
from tqdm import tqdm

In [4]:
# Load CSV file
df = pd.read_csv('/content/drive/MyDrive/AIP/MovieData/fixed_downloaded_videos.csv')

# Function to extract frames from video
def extract_frames(video_path, num_frames=16):
    frames = []
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    interval = max(total_frames // num_frames, 1)
    for i in range(0, total_frames, interval):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if ret:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(cv2.resize(frame, (224, 224)))
        if len(frames) == num_frames:
            break
    cap.release()
    return frames

In [5]:
# Load pre-trained model
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

In [6]:
# Function to generate captions
def generate_caption(model, feature_extractor, tokenizer, frames):
    inputs = feature_extractor(images=frames, return_tensors="pt")
    pixel_values = inputs.pixel_values
    output_ids = model.generate(pixel_values)
    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return caption

In [17]:
# Function to generate caption for a specific video
def generate_caption_for_video(video_path, model, feature_extractor, tokenizer):
    frames = extract_frames(video_path)
    caption = generate_caption(model, feature_extractor, tokenizer, frames)
    return caption

# Example usage:
video_path = '/content/drive/MyDrive/AIP/MovieData/test video/coweatinggrass.mp4'  # Specify the path to the video
caption = generate_caption_for_video(video_path, model, feature_extractor, tokenizer)
print(f"Generated Caption: {caption}")



Generated Caption: a cow standing in a grassy field 


In [18]:
caption

'a cow standing in a grassy field '



```
# This is formatted as code
```

## **Converting text to Speech**

In [19]:
!pip install gtts



In [20]:
from gtts import gTTS
import os
from IPython.display import Audio

# Defining  TTS Function
def text_to_audio(text, output_filename):
    tts = gTTS(text)
    tts.save(output_filename)
    print(f"Audio saved as {output_filename}")


output_directory = "/content/drive/MyDrive/AIP/MovieData/audio"
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

#Converting Generated Text to Audio
output_audio_file = os.path.join(output_directory, "audiomo.mp3")
text_to_audio(caption, output_audio_file)

#Listening to the Generated Audio
Audio(output_audio_file)
print(caption)

Audio saved as /content/drive/MyDrive/AIP/MovieData/audio/audiomo.mp3
a cow standing in a grassy field 


In [21]:
Audio(output_audio_file)


#Merging Audio and video together

In [22]:
!pip install ffmpeg



In [23]:
import subprocess
video_path = video_path
audio_path = output_audio_file
output_path = '/content/merged_video1.mp4'

command = [
    'ffmpeg',
    '-i', video_path,
    '-i', audio_path,
    '-c:v', 'copy',
    '-c:a', 'aac',
    '-strict', 'experimental',
    '-y',
    output_path
]

# Run the ffmpeg command
subprocess.run(command)


CompletedProcess(args=['ffmpeg', '-i', '/content/drive/MyDrive/AIP/MovieData/test video/coweatinggrass.mp4', '-i', '/content/drive/MyDrive/AIP/MovieData/audio/audiomo.mp3', '-c:v', 'copy', '-c:a', 'aac', '-strict', 'experimental', '-y', '/content/merged_video1.mp4'], returncode=0)

In [24]:
from IPython.display import Video, display
display(Video(output_path))
