Copyright (c) 2025 Christian Oechler

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 

## Importing the necessary Python packages

In [None]:
import json
import re
from pathlib import Path
from typing import Optional, Tuple

import cv2
import torch
from PIL import Image
from tqdm.notebook import tqdm
from transformers import AutoProcessor, AutoModelForCausalLM

## Definition of the OCR machine

In [None]:
class OCRMachine():
    """Wrapper around the Florence-2 model that runs its ``<OCR>`` task on frames.

    The device and dtype are chosen once in ``__init__`` and stored on the
    instance, so that ``do_ocr`` always moves its inputs to the device and
    dtype the model was loaded with.
    """

    MODEL_NAME = "microsoft/Florence-2-base"
    CACHE_DIR = "models/"
    OCR_TASK = "<OCR>"

    def __init__(self):
        """Load the model and its processor, on the GPU when CUDA is available."""
        gpu_available = torch.cuda.is_available()
        self.device = "cuda:0" if gpu_available else "cpu"
        # Half precision is only used on the GPU; the CPU path uses float32.
        self.torch_dtype = torch.float16 if gpu_available else torch.float32

        if not gpu_available:
            print("ACHTUNG: Keine Beschleunigung durch eine Grafikkarte verfügbar!")

        # NOTE(review): trust_remote_code=True executes Python code shipped
        # with the model repository; consider pinning a revision.
        self.model = AutoModelForCausalLM.from_pretrained(
            self.MODEL_NAME,
            torch_dtype=self.torch_dtype,
            trust_remote_code=True,
            cache_dir=self.CACHE_DIR,
        ).to(self.device)

        self.processor = AutoProcessor.from_pretrained(
            self.MODEL_NAME,
            trust_remote_code=True,
            cache_dir=self.CACHE_DIR,
        )

    def do_ocr(self, frame) -> str:
        """Run OCR on a single frame and return the recognized text.

        Args:
            frame: Image array in BGR channel order (the OpenCV default).

        Returns:
            The value stored under the ``"<OCR>"`` key of the processor's
            post-processed generation result.
        """
        # Convert the image from BGR (OpenCV default) to RGB (PIL default)
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Build a PIL image from the array
        image = Image.fromarray(image_rgb)

        inputs = self.processor(
            text=self.OCR_TASK, images=image, return_tensors="pt"
        ).to(self.device, self.torch_dtype)

        generated_ids = self.model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=4096,
            num_beams=3,
            do_sample=False,
        )

        # Special tokens are kept because the post-processing step receives
        # the raw generated text.
        generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

        output = self.processor.post_process_generation(
            generated_text,
            task=self.OCR_TASK,
            image_size=(image.width, image.height),
        )

        return output[self.OCR_TASK]

## Definition of the helper functions

In [None]:
def load_processed_data(filname):
    """Load the previously saved metadata from a JSON file.

    Args:
        filname: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or an empty dict when the file cannot be
        read (missing, not accessible) or does not contain valid JSON.

    Only I/O and decoding failures are treated as "no data yet"; other
    errors (for example a path argument of the wrong type) propagate so
    that programming mistakes are not silently hidden.
    """
    try:
        with open(filname, 'r', encoding="utf-8") as json_file:
            return json.load(json_file)
    except (OSError, ValueError):
        # OSError: file missing or unreadable.
        # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
        return {}

In [None]:
def extract_date_text(machine, frame_part, verbose=False) -> Tuple[str, str]:
    """Read the date and time from a cropped frame region via OCR.

    Args:
        machine: Object providing ``do_ocr(frame)`` that returns the text.
        frame_part: The frame region that is passed to ``machine.do_ocr``.
        verbose: When true, the raw OCR text is printed.

    Returns:
        A ``(date, time)`` tuple of strings, where ``date`` has the form
        ``NN-NN-NNNN`` and ``time`` the form ``NN:NN:NN``.

    Raises:
        ValueError: If the OCR text does not start with a date and a time
            separated by a single whitespace character or hyphen.
    """
    text = machine.do_ocr(frame_part)

    if verbose:
        print(text)

    pattern = r"(\d{2}-\d{2}-\d{4})[\s-](\d{2}:\d{2}:\d{2})"

    # re.match anchors at the start of the text, so leading noise is rejected.
    match = re.match(pattern, text)

    if match is None:
        raise ValueError(f'Es konnten keine Daten aus dem Datums-Text extrahiert werden. Der Text lautete: {text}')

    # The pattern has exactly two groups, so a successful match always
    # provides both the date and the time.
    date, time = match.groups()

    return date, time

In [None]:
def extract_cam_text(machine, frame_part, verbose=False):
    """Return the OCR text of the frame region that shows the camera label.

    The text produced by ``machine.do_ocr`` is returned unchanged; it is
    additionally printed when ``verbose`` is true.
    """
    recognized = machine.do_ocr(frame_part)

    if not verbose:
        return recognized

    print(recognized)
    return recognized

In [None]:
def get_video_metadata(ocr_machine, path_to_videofile, verbose=False) -> Optional[Tuple[str, str, str]]:
    """Extract date, time and camera label from the text overlays of a video.

    One frame per second of video is sampled (at least the first frame).
    For each sampled frame the date region and the camera region are
    cropped and passed to the OCR helpers; the first frame for which both
    succeed determines the result.

    Args:
        ocr_machine: OCR engine passed on to ``extract_date_text`` and
            ``extract_cam_text``.
        path_to_videofile: Path of the video, as ``str`` or path object.
        verbose: When true, the file name and the raw OCR texts are printed.

    Returns:
        ``(video_date, video_time, video_camera)`` on success. ``None`` when
        the video cannot be opened or no sampled frame yields the metadata;
        in that case the error is printed instead of being raised.
    """
    video_file = str(path_to_videofile)

    if verbose:
        print(f'Folgende Datei wird bearbeitet: {video_file}')

    cap = None

    try:
        cap = cv2.VideoCapture(video_file)

        # Check this first: an unopened capture reports 0 fps, which would
        # otherwise surface as a misleading division-by-zero error.
        if not cap.isOpened():
            raise OSError("Das Video konnte nicht geöffnet werden.")

        frames_per_second = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # One frame index per full second; always try the first frame so
        # that videos shorter than one second (or without a usable fps
        # value) are not skipped entirely.
        if frames_per_second > 0:
            seconds_to_sample = max(1, int(total_frames / frames_per_second))
            list_of_frame_indices = [frames_per_second * i for i in range(seconds_to_sample)]
        else:
            list_of_frame_indices = [0]

        for frame_index in list_of_frame_indices:
            try:
                # Set the current frame position
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)

                # Read the frame
                ret, frame = cap.read()

                # Guard to ensure the frame was read correctly
                if not ret:
                    continue

                # The width/height ratio tells landscape from portrait videos
                height, width, _ = frame.shape
                image_ratio = width / height

                # Crop the regions that contain the date and the camera label.
                # NOTE(review): the pixel offsets presumably assume 1920x1080
                # (landscape) and 1080x1920 (portrait) frames -- confirm.
                if image_ratio > 1:
                    cropped_frame_date = frame[:150, 1300:]
                    cropped_frame_cam = frame[950:, 0:300]
                else:
                    cropped_frame_date = frame[0:250, 400:]
                    cropped_frame_cam = frame[1700:, :370]

                video_date, video_time = extract_date_text(ocr_machine, cropped_frame_date, verbose)
                video_camera = extract_cam_text(ocr_machine, cropped_frame_cam, verbose)

            except Exception:
                # Best effort: an unreadable frame is skipped and the next
                # sampled frame is tried.
                continue

            return video_date, video_time, video_camera

        raise ValueError("In keinem der geprüften Frames konnten die Metadaten erkannt werden.")

    except Exception as e:
        # Exception (not BaseException) so that KeyboardInterrupt still
        # interrupts the processing.
        print(f'Es ist ein Fehler beim Bearbeiten der Datei "{video_file}" aufgetreten: ', end="")
        print(e)
        return None

    finally:
        # Always free the capture, also when no frame could be processed.
        if cap is not None:
            cap.release()

## Extraction of the metadata from the video file content

### Set up the system

In [None]:
# Root directory whose sub-directories contain the video files to process.
MAIN_DIRECTORY_VIDEO_FILES = "videos"

In [None]:
# Create the OCR engine once; the same instance is reused for every video.
machine = OCRMachine()

### Start the extraction process of all videos

In [None]:
# Walk all video files, extract their metadata via OCR and persist the
# results after every newly processed video so that an aborted run can be
# resumed. Videos that fail are listed in "error.txt".
main_directory_path = Path(MAIN_DIRECTORY_VIDEO_FILES)

# Collect the paths of all videos; entries of the main directory that are
# not directories (stray files) are ignored instead of crashing iterdir().
files = [
    video_file_path
    for video_directory in main_directory_path.iterdir()
    if video_directory.is_dir()
    for video_file_path in video_directory.iterdir()
]

file_dictornary = load_processed_data("data/data.json")

# Make sure the output directory exists before the first save.
Path("data").mkdir(parents=True, exist_ok=True)

with tqdm(files, desc="Fortschritt", unit="Datei") as pbar:
    for i, videofile in enumerate(pbar):
        pbar.set_postfix_str(f"{videofile.name}")

        # Already processed in an earlier run: nothing to do, nothing to save.
        if str(videofile) in file_dictornary:
            continue

        try:
            # Get the unique video id from the filename (not from the
            # directory part of the path)
            match = re.search(r'\((\d+)\)', videofile.name)

            # Guard which raises an exception if the id could not be extracted
            if match is None:
                raise ValueError(f'Es konnte die eindeutige ID des Videos {videofile} nicht ermittelt werden')

            video_id = match.group(1)
            video_metadata = get_video_metadata(machine, videofile)

            # get_video_metadata reports failures by returning None
            if video_metadata is None:
                raise ValueError(f'Es konnten keine Metadaten aus dem Video {videofile} extrahiert werden')

            video_date, video_time, video_camera = video_metadata

            metadata = {"video_id": video_id, "video_date": video_date, "video_time": video_time, "video_camera": video_camera}

            file_dictornary[str(videofile)] = metadata

        except Exception as e:
            # Exception (not BaseException) so that the run can still be
            # interrupted from the keyboard / the notebook.
            print(f'Es ist ein Fehler beim Bearbeiten der Datei "{videofile}" aufgetreten.')
            print(e)

            with open("error.txt", 'a', encoding="utf-8") as file:
                file.write(f'{videofile} \n')

            # Nothing changed, so there is nothing to save.
            continue

        # Save the current state plus a rolling backup (one backup file per
        # block of 250 videos).
        for target in ('data/data.json', f'data/data.json.{i // 250}.backup'):
            with open(target, 'w', encoding="utf-8") as json_file:
                json.dump(file_dictornary, json_file, indent=4)

In [None]:
# Ad-hoc check: run the extraction on a single video with verbose output.
get_video_metadata(machine, "videos/154000~154400/Wiederhergestellt_mp4_Datei(154377).mp4", verbose=True)