In [1]:
# Local Video Text Extraction from Google Drive in Google Colab

# Step 1: Install required packages
!pip install openai-whisper torch torchaudio  # openai-whisper: speech-to-text model; torch: ML backend it runs on; torchaudio: audio loading and transforms.
!pip install moviepy  # For video processing to audio if needed.

# Step 2: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Step 3: Import libraries
import whisper        # Load the Whisper model.
import torch          # Backend engine for AI processing.
import re             # For cleaning text (regex operations).
import os             # File management (list folders, move files).
from pathlib import Path  # Easier handling of file paths.
import json           # Saving and loading transcripts/metadata.
from datetime import datetime  # Timestamps for files or logs.

# Step 4: List files in your Google Drive (optional - to find your video)
def list_drive_files(folder_path="/content/drive/MyDrive"):
    """Print a tree of the video and audio files found under a folder.

    Walks ``folder_path`` recursively and prints each directory, indented by
    its depth below ``folder_path``, followed by the media files it holds.
    Files with any other extension are not printed.

    Extensions are compared case-insensitively, so names such as
    ``CLIP.MP4`` or ``Voice.WAV`` are listed too.

    Args:
        folder_path: Directory to scan. Defaults to the path used by the
            ``drive.mount('/content/drive')`` call in this notebook.

    Returns:
        None. The listing is written to standard output.
    """
    video_exts = ('.mp4', '.avi', '.mkv', '.mov', '.wmv', '.flv', '.webm', '.m4v')
    audio_exts = ('.mp3', '.wav', '.m4a', '.flac', '.aac')

    print("Files in your Google Drive:")
    for root, dirs, files in os.walk(folder_path):
        # Sorting in place makes os.walk visit sub-folders in a stable order.
        dirs.sort()

        # Depth below the scan root. relpath (rather than string replacement)
        # stays correct when folder_path ends with a separator or when its
        # text happens to reappear deeper in the path.
        relative = os.path.relpath(root, folder_path)
        level = 0 if relative == os.curdir else relative.count(os.sep) + 1

        indent = ' ' * 2 * level
        # normpath drops a trailing separator so basename is never empty.
        print(f"{indent}{os.path.basename(os.path.normpath(root))}/")

        subindent = ' ' * 2 * (level + 1)
        for file in sorted(files):
            lowered = file.lower()
            if lowered.endswith(video_exts):
                print(f"{subindent}📹 {file}")
            elif lowered.endswith(audio_exts):
                print(f"{subindent}🎵 {file}")

# Step 5: Transcribe video/audio file
def transcribe_local_file(file_path, model_size="base", language=None):
    """Transcribe a local video or audio file with Whisper.

    Args:
        file_path: Path of the media file to transcribe.
        model_size: Name passed to ``whisper.load_model`` (e.g. ``"base"``).
        language: Language passed through to ``model.transcribe``; ``None``
            leaves language detection to Whisper.

    Returns:
        Whatever ``model.transcribe`` returns, or ``None`` when ``file_path``
        does not point at an existing regular file (an error line is printed
        in that case).
    """
    # isfile, not exists: a directory path would pass an existence check and
    # only fail later, inside Whisper's audio loading.
    if not os.path.isfile(file_path):
        print(f"Error: File not found at {file_path}")
        return None

    print(f"Loading Whisper model: {model_size}")
    # Whisper downloads the model weights itself when they are not cached.
    model = whisper.load_model(model_size)

    print(f"Transcribing: {os.path.basename(file_path)}")

    options = {
        "fp16": torch.cuda.is_available(),  # half precision only when a GPU is present
        "language": language,               # None -> auto-detect
        "task": "transcribe",
    }

    return model.transcribe(file_path, **options)

# Step 6: Search for keywords with timestamps
def _segments_within(segments, index, context_seconds):
    """Join the text of segment ``index`` with neighbours inside the time window.

    The window runs from ``context_seconds`` before the segment's start to
    ``context_seconds`` after its end. Assumes ``segments`` is ordered by
    time, so the scan can stop at the first neighbour outside the window.
    """
    window_start = segments[index]['start'] - context_seconds
    window_end = segments[index]['end'] + context_seconds

    first = index
    while first > 0 and segments[first - 1]['end'] >= window_start:
        first -= 1

    last = index
    while last + 1 < len(segments) and segments[last + 1]['start'] <= window_end:
        last += 1

    return ' '.join(seg['text'].strip() for seg in segments[first:last + 1])


def search_keywords_with_context(result, keywords, context_seconds=10):
    """Find segments that contain any of the keywords (case-insensitive).

    Matching is a plain substring test on the lower-cased segment text, so a
    keyword also matches inside longer words.

    Args:
        result: Transcription dict with a ``'segments'`` list; each segment
            needs ``'text'``, ``'start'`` and ``'end'``.
        keywords: Iterable of keyword strings.
        context_seconds: How many seconds of neighbouring segments to include
            in each match's ``'context'`` text, on either side of the segment.

    Returns:
        A list with one dict per (segment, keyword) hit, in segment order then
        keyword order. Keys: ``'keyword'``, ``'text'``, ``'start_time'``,
        ``'end_time'``, ``'formatted_time'`` (``MM:SS`` of the segment start),
        ``'confidence'`` and ``'context'``.

        ``'confidence'`` is the segment's own ``'confidence'`` value when
        present; otherwise ``exp(avg_logprob)`` when the segment carries
        ``'avg_logprob'``; otherwise ``0``.
    """
    import math

    segments = result['segments']
    # Lower-case each keyword once instead of once per segment; this also
    # lets ``keywords`` be a one-shot iterator.
    lowered_keywords = [(keyword, keyword.lower()) for keyword in keywords]

    matches = []
    for index, segment in enumerate(segments):
        text = segment['text']
        lowered_text = text.lower()
        hits = [keyword for keyword, needle in lowered_keywords if needle in lowered_text]
        if not hits:
            continue

        start_time = segment['start']
        end_time = segment['end']
        formatted_time = f"{int(start_time // 60):02d}:{int(start_time % 60):02d}"

        confidence = segment.get('confidence')
        if confidence is None:
            avg_logprob = segment.get('avg_logprob')
            confidence = math.exp(avg_logprob) if avg_logprob is not None else 0

        # Same for every keyword hitting this segment, so build it once.
        context = _segments_within(segments, index, context_seconds)

        for keyword in hits:
            matches.append({
                'keyword': keyword,
                'text': text.strip(),
                'start_time': start_time,
                'end_time': end_time,
                'formatted_time': formatted_time,
                'confidence': confidence,
                'context': context,
            })

    return matches

# Step 7: Advanced phrase search
def search_phrases_in_segments(result, phrases):
    """Find exact phrases (case-insensitive) in the full transcript text.

    Searching ``result['text']`` rather than individual segments lets a
    phrase match even when it straddles a segment boundary.

    Timestamp resolution:
        * When the segment texts concatenate exactly to ``result['text']``,
          the match is mapped to the segment containing its first character
          and that segment's start time is reported.
        * Otherwise the time is estimated from the character position,
          assuming text is spread evenly over the recording. With no
          segments at all the character position itself is used; with a
          zero total duration the estimate is ``0.0``.

    Args:
        result: Transcription dict with ``'text'`` and ``'segments'``.
        phrases: Iterable of phrase strings; empty phrases are skipped.

    Returns:
        A list of dicts with keys ``'phrase'``, ``'context'`` (up to 100
        characters either side of the match), ``'approximate_time'``,
        ``'formatted_time'`` (``MM:SS``) and ``'exact_match'`` (the text as
        it appears in the transcript).
    """
    import bisect

    matches = []
    full_text = result['text']
    segments = result['segments']

    # Character offset at which each segment's text begins in the transcript.
    offsets = []
    cursor = 0
    for segment in segments:
        offsets.append(cursor)
        cursor += len(segment['text'])
    aligned = bool(segments) and ''.join(seg['text'] for seg in segments) == full_text

    # Fallback estimate, computed once instead of once per match.
    if not segments:
        chars_per_second = 1
    else:
        total_duration = segments[-1]['end']
        # None marks "no usable duration" and avoids a division by zero.
        chars_per_second = len(full_text) / total_duration if total_duration else None

    for phrase in phrases:
        if not phrase:
            # An empty pattern would "match" at every character position.
            continue

        pattern = re.compile(re.escape(phrase), re.IGNORECASE)

        for match in pattern.finditer(full_text):
            char_position = match.start()

            if aligned:
                # The last segment whose offset is <= char_position contains it.
                owner = segments[bisect.bisect_right(offsets, char_position) - 1]
                approx_time = owner['start']
            elif chars_per_second is None:
                approx_time = 0.0
            else:
                approx_time = char_position / chars_per_second

            start_context = max(0, match.start() - 100)
            end_context = min(len(full_text), match.end() + 100)

            matches.append({
                'phrase': phrase,
                'context': full_text[start_context:end_context],
                'approximate_time': approx_time,
                'formatted_time': f"{int(approx_time // 60):02d}:{int(approx_time % 60):02d}",
                'exact_match': match.group(),
            })

    return matches

# Step 8: Extract and analyze topics
def extract_topics_by_length(result, min_words=10, max_words=50):
    """Collect the segments whose word count lies in ``[min_words, max_words]``.

    Words are counted by splitting the segment text on whitespace. Each kept
    segment is reported as a dict with its stripped text, word count, start
    and end times, duration, and the start time formatted as ``MM:SS``.
    """
    selected = []

    for seg in result['segments']:
        n_words = len(seg['text'].split())

        # Guard clause: skip anything outside the requested length band.
        if n_words < min_words or n_words > max_words:
            continue

        minutes = int(seg['start'] // 60)
        seconds = int(seg['start'] % 60)

        entry = {
            'text': seg['text'].strip(),
            'word_count': n_words,
            'start_time': seg['start'],
            'end_time': seg['end'],
            'duration': seg['end'] - seg['start'],
            'formatted_time': f"{minutes:02d}:{seconds:02d}",
        }
        selected.append(entry)

    return selected

# Step 9: Save results to files
def save_results(results, base_filename="video_analysis", output_dir=None):
    """Write the transcript, keyword matches and a summary to disk.

    Three UTF-8 files are created, all prefixed with ``base_filename``:
    ``*_transcript.txt`` (full transcript), ``*_keywords.json`` (the keyword
    match list) and ``*_summary.json`` (counts plus the analysis timestamp).

    Args:
        results: Dict with ``'full_transcript'``, ``'keyword_matches'``,
            ``'video_file'`` and ``'segments'``; ``'topics'`` is optional.
        base_filename: Prefix shared by the three output files.
        output_dir: Directory to write into. ``None`` (the default) keeps the
            original destination, ``/content/drive/MyDrive``. The directory
            must already exist.

    Returns:
        None. The names of the written files are printed.
    """
    if output_dir is None:
        output_dir = '/content/drive/MyDrive'
        destination = 'Google Drive'  # keeps the default message unchanged
    else:
        output_dir = os.fspath(output_dir)
        destination = output_dir

    transcript_name = f"{base_filename}_transcript.txt"
    keywords_name = f"{base_filename}_keywords.json"
    summary_name = f"{base_filename}_summary.json"

    with open(os.path.join(output_dir, transcript_name), 'w', encoding='utf-8') as f:
        f.write(results['full_transcript'])

    with open(os.path.join(output_dir, keywords_name), 'w', encoding='utf-8') as f:
        json.dump(results['keyword_matches'], f, indent=2, ensure_ascii=False)

    summary = {
        'video_file': results['video_file'],
        'analysis_date': datetime.now().isoformat(),
        'transcript_length': len(results['full_transcript']),
        'total_segments': len(results['segments']),
        'keyword_matches_count': len(results['keyword_matches']),
        'topics_found': len(results.get('topics', [])),
    }

    with open(os.path.join(output_dir, summary_name), 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)

    print(f"Results saved to {destination}:")
    print(f"- {transcript_name}")
    print(f"- {keywords_name}")
    print(f"- {summary_name}")

# Step 10: Main processing function
def process_local_video(video_path, keywords, phrases=None, model_size="base", language=None):
    """Run the whole pipeline on one local media file.

    Transcribes the file, searches the transcript for keywords and (when
    given) phrases, extracts topic-sized segments, and bundles everything
    into one dict.

    Args:
        video_path: Path of the video or audio file.
        keywords: Keywords handed to ``search_keywords_with_context``.
        phrases: Optional phrases handed to ``search_phrases_in_segments``;
            the phrase search is skipped when this is empty or ``None``.
        model_size: Whisper model name forwarded to ``transcribe_local_file``.
        language: Language forwarded to ``transcribe_local_file``; ``None``
            keeps the previous behaviour of letting Whisper detect it.

    Returns:
        ``None`` when transcription produced no result; otherwise a dict with
        ``'video_file'``, ``'full_transcript'``, ``'segments'``,
        ``'keyword_matches'``, ``'phrase_matches'``, ``'topics'`` and
        ``'detected_language'`` (``'unknown'`` when the transcription result
        has no ``'language'`` entry).
    """
    print(f"Processing local video: {video_path}")

    result = transcribe_local_file(video_path, model_size, language)
    if not result:
        return None

    keyword_matches = search_keywords_with_context(result, keywords)

    # The phrase search is optional; an empty list keeps the result shape stable.
    phrase_matches = search_phrases_in_segments(result, phrases) if phrases else []

    topics = extract_topics_by_length(result)

    return {
        'video_file': os.path.basename(video_path),
        'full_transcript': result['text'],
        'segments': result['segments'],
        'keyword_matches': keyword_matches,
        'phrase_matches': phrase_matches,
        'topics': topics,
        'detected_language': result.get('language', 'unknown'),
    }

# Step 11: Usage example
if __name__ == "__main__":
    # Show the media available on Drive so the right path can be chosen below.
    print("Scanning Google Drive for video files...")
    list_drive_files()

    # Recording to analyse - UPDATE THIS PATH to your actual video.
    video_path = "/content/drive/MyDrive/videoplayback.mp4"

    # Single terms to look for in each transcript segment.
    keywords_to_search = [
        "real estate",
        "property investment",
        "affordable housing",
        "commercial property",
        "rental yield",
        "capital gains",
        "title deed",
        "leasehold",
        "freehold",
        "REIT",
        "project finance",
        "joint venture",
        "real estate law",
        "zoning regulations",
        "building permit",
    ]

    # Longer phrases to look for in the full transcript (optional).
    phrases_to_search = [
        "real estate investment",
        "affordable housing project",
        "commercial property market",
        "rental income strategy",
        "property title deed",
        "leasehold land rights",
        "freehold property ownership",
        "joint venture agreement",
        "real estate financing",
        "public private partnership",
        "zoning and planning laws",
        "building code compliance",
        "capital gains tax",
        "real estate regulatory framework",
        "infrastructure-led development",
    ]

    print(f"\nProcessing video: {video_path}")
    # model_size options: tiny, base, small, medium, large
    results = process_local_video(
        video_path,
        keywords_to_search,
        phrases_to_search,
        model_size="base",
    )

    if not results:
        print("❌ Failed to process video")
    else:
        # Headline figures.
        print("\n✅ Analysis Complete!")
        print(f"Video: {results['video_file']}")
        print(f"Language detected: {results['detected_language']}")
        print(f"Transcript length: {len(results['full_transcript'])} characters")
        print(f"Total segments: {len(results['segments'])}")

        # Keyword hits, capped at the first 10.
        print(f"\n🔍 Found {len(results['keyword_matches'])} keyword matches:")
        for match in results['keyword_matches'][:10]:
            print(f"  ⏰ {match['formatted_time']} - '{match['keyword']}'")
            print(f"     Context: {match['text'][:100]}...")

        # Phrase hits, capped at the first 5; the section is omitted when empty.
        if results['phrase_matches']:
            print(f"\n📝 Found {len(results['phrase_matches'])} phrase matches:")
            for match in results['phrase_matches'][:5]:
                print(f"  ⏰ {match['formatted_time']} - '{match['phrase']}'")

        # Topic-sized segments, capped at the first 5.
        print(f"\n💡 Found {len(results['topics'])} potential topics:")
        for topic in results['topics'][:5]:
            print(f"  ⏰ {topic['formatted_time']} - {topic['text'][:80]}...")

        # Persist transcript, keyword matches and summary.
        save_results(results, "video_analysis")

# Step 12: Helper functions for specific searches
def search_technical_terms(result):
    """Run the keyword search with a fixed list of property and legal terms.

    Thin wrapper: forwards ``result`` and the list below to
    ``search_keywords_with_context`` and returns its matches unchanged.
    """
    return search_keywords_with_context(
        result,
        [
            "title deed",
            "lease agreement",
            "freehold",
            "leasehold",
            "zoning",
            "easement",
            "land use",
            "conveyancing",
            "due diligence",
            "real estate appraisal",
            "escrow",
            "capital gains",
            "mortgage",
            "stamp duty",
            "building permit",
            "property tax",
            "joint venture",
            "valuation",
            "real estate financing",
        ],
    )

def search_business_terms(result):
    """Run the keyword search with a fixed list of general business terms.

    Thin wrapper: forwards ``result`` and the term list to
    ``search_keywords_with_context`` and returns its matches unchanged.
    """
    # All terms are single words, so one whitespace split rebuilds the list.
    vocabulary = (
        "revenue profit loss investment ROI KPI metrics "
        "strategy market customer client sales marketing "
        "budget cost price value growth competition"
    ).split()

    return search_keywords_with_context(result, vocabulary)

# Quick setup instructions.
# This print sits at module level, so it runs every time the cell executes,
# after the __main__ block above has finished (or failed).
print("""
🚀 QUICK SETUP INSTRUCTIONS:

1. Upload your video to Google Drive
2. Run the drive.mount() cell to connect to Google Drive
3. Update the video_path variable with your actual video path
4. Customize keywords_to_search with your specific terms
5. Run the main processing function
6. Check the results and saved files in your Google Drive

📝 Supported formats: MP4, AVI, MKV, MOV, WMV, FLV, WebM, M4V, MP3, WAV, M4A, FLAC, AAC

⚡ Model options (speed vs accuracy):
- tiny: Fastest, least accurate
- base: Good balance (recommended)
- small: Better accuracy
- medium: High accuracy, slower
- large: Best accuracy, slowest
""")

Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 803.2/803.2 kB 14.0 MB/s eta 0:00:00
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... done
  Created wheel for openai-whisper: filename=openai_whisper-20250625-py3-none-any.whl size=803979 sha256=50a6d7f8441f462d5aaab63481716344ac0617a3ef3b51e2441d74e52f2338ef
  Stored in directory: /root/.cache/pip/wheels/61/d2/20/09ec9bef734d1

100%|███████████████████████████████████████| 139M/139M [00:16<00:00, 8.72MiB/s]


Transcribing: videoplayback.mp4

✅ Analysis Complete!
Video: videoplayback.mp4
Language detected: en
Transcript length: 8507 characters
Total segments: 187

🔍 Found 0 keyword matches:

💡 Found 74 potential topics:
  ⏰ 00:00 - Because I think right now, if I was to quit rapping and I say I want to go to th...
  ⏰ 00:15 - Why would you think you could make it to the NBA?...
  ⏰ 00:24 - Without a doubt you will see me on the UFC in the near future....
  ⏰ 01:06 - Breaks the world record for the fastest time to run a mile....
  ⏰ 01:14 - At the time, everyone believed that four minutes was the barrier....
Results saved to Google Drive:
- video_analysis_transcript.txt
- video_analysis_keywords.json
- video_analysis_summary.json

🚀 QUICK SETUP INSTRUCTIONS:

1. Upload your video to Google Drive
2. Run the drive.mount() cell to connect to Google Drive
3. Update the video_path variable with your actual video path
4. Customize keywords_to_search with your specific terms
5. Run the main processi