# Processing Videos with Spark

This notebook focuses on converting video formats and on scaling that process.

In [None]:
%pip install ffmpeg
%restart_python


## Setup and Config

We will set up the catalog and schema names and the locations of the files.


In [None]:
import os

# Names that make up the two /Volumes paths used by the rest of the notebook.
catalog, schema = 'brian_ml_dev', 'image_processing'
raw_videos, destination_folder = 'raw_data', 'processed_video'

# Both locations share the /Volumes/<catalog>/<schema> prefix and differ only
# in their last path component.
raw_path = '/'.join(('/Volumes', catalog, schema, raw_videos))
processed_path = '/'.join(('/Volumes', catalog, schema, destination_folder))

# List what is currently sitting in the raw location.
file_list = os.listdir(raw_path)
print(f'Files Available: {file_list}:')

# Distributing the processing - Setup Paths

To distribute the processing, we need to:
- convert the file metadata into a Spark DataFrame with source / destination / optional-flags columns

The optional flags depend on how we structure our UDF. If we want to set custom options per file, we would add an optional-flags column holding a dict (or a similar structure) that we can feed into the UDF.

In [None]:
# Build one (source, destination) pair per entry in the raw listing.
#
# splitext() strips only the final extension, so a name such as "a.b.svo"
# keeps its full stem "a.b" instead of being cut at the first dot.
filenames = [os.path.splitext(x)[0] for x in file_list]

# The source path must use the file's real name and extension; only the
# destination is forced to ".mp4".
full_path = [os.path.join(raw_path, x) for x in file_list]
dest_paths = [os.path.join(processed_path, stem + '.mp4') for stem in filenames]

print(len(dest_paths))

In [None]:
from pyspark.sql.types import StructType, StructField, StringType

# Column layout of the work list: where each video is read from ("src") and
# where the processed copy is written to ("dest"). Both are nullable strings.
# NOTE: this rebinds `schema`, which the config cell set to the schema name string.
schema = StructType(
    [StructField(column, StringType(), True) for column in ("src", "dest")]
)

# zip() pairs every source path with its destination path: one row per video.
sourcing_df = spark.createDataFrame(list(zip(full_path, dest_paths)), schema=schema)
display(sourcing_df)

In [None]:
from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import ArrayType, FloatType, IntegerType, BinaryType, BooleanType
import pandas as pd

def check_file_exists(file_path:str) -> bool:
    """
    Report whether ``file_path`` points at an existing file.

    Returns True only when the path exists and is a file. A missing path, or
    a path that exists but is not a file (for example a directory), gives False.
    """
    # isfile() is already False for paths that do not exist, so one call
    # covers both the "missing" and the "exists but not a file" cases.
    return os.path.isfile(file_path)

# pandas UDF wrapper around check_file_exists: receives a batch of paths as a
# pandas Series and returns one boolean per path.
@pandas_udf(BooleanType())
def check_file_exists_udf(paths: pd.Series) -> pd.Series:
    """Apply check_file_exists to every path in the batch."""
    return paths.apply(check_file_exists)

# Backward-compatible alias for the original (misspelled) name.
check_file_esists_udf = check_file_exists_udf


# Flag destinations that already exist; the processing step filters on this column.
file_check = sourcing_df.withColumn('file_exists', check_file_exists_udf(col('dest')))
display(file_check)

# Testing out processing function

It is always best to test a function first before we roll it out at scale on a Spark cluster.

In [None]:
import subprocess

# Single-file trial run outside of Spark, using the first entry of the raw listing.
source = f"{raw_path}/{file_list[0]}"
dest_path = '/local_disk0/tt2/20241029-024247-3.mp4'

# ffmpeg does not create missing output folders, so make sure the target
# directory exists before running it.
os.makedirs(os.path.dirname(dest_path), exist_ok=True)

# -y        overwrite an existing output, so the cell can be re-run
# -nostdin  never wait for interactive input on standard input
# crop=1920:1200:0:0 keeps a 1920x1200 window anchored at the top-left corner.
command = ['ffmpeg', '-y', '-nostdin', '-i', source, '-filter:v', 'crop=1920:1200:0:0', dest_path]

# check=True raises CalledProcessError if ffmpeg exits with a non-zero status;
# stdout/stderr are captured as text on the returned CompletedProcess.
text = subprocess.run(command, check=True, capture_output=True, text=True)

# Distributing File Processing

Now that we know the right ffmpeg command, we can distribute it.

In [None]:
from pyspark.sql.functions import udf

import subprocess
import tempfile
import shutil
import os

def process_video(src: str, dest: str,
                  video_filter: str = 'crop=1920:1200:0:0',
                  tmp_root: str = '/local_disk0') -> dict:
    """
    Run ffmpeg on ``src`` and copy the result to ``dest``.

    ffmpeg writes its output into a temporary directory first; only when it
    succeeds is the finished file copied to ``dest``. Failures are never
    raised: they are reported through the returned dict so that one bad
    input does not abort the whole job.

    Args:
        src: Path of the input video handed to ``ffmpeg -i``.
        dest: Path the processed file is copied to.
        video_filter: Value passed to ffmpeg's ``-filter:v`` option.
        tmp_root: Directory under which the temporary working directory is
            created. If it does not exist, the system default temp location
            is used instead.

    Returns:
        dict with the keys ``command_args`` (the command as one string),
        ``return_code``, ``stdout`` and ``stderr``. ``return_code`` is
        ffmpeg's exit status, or -1 when the failure happened outside of
        ffmpeg (ffmpeg could not be started, the copy failed, bad arguments).
    """
    # The output path is appended once the temporary directory exists.
    command = ['ffmpeg', '-i', src, '-filter:v', video_filter]

    try:
        # Fall back to the default temp location when tmp_root is unavailable.
        scratch_root = tmp_root if tmp_root and os.path.isdir(tmp_root) else None

        with tempfile.TemporaryDirectory(dir=scratch_root) as tmp_dir:
            tmp_output_path = os.path.join(tmp_dir, "output.mp4")  # Adjust filename if needed
            command.append(tmp_output_path)

            # Give ffmpeg an empty stdin so it cannot wait for, or consume,
            # input that belongs to the calling process.
            command_result = subprocess.run(
                command,
                stdin=subprocess.DEVNULL,
                check=True,
                capture_output=True,
                text=True,
            )

            # Copy the processed file to its final destination before the
            # temporary directory is removed.
            shutil.copy(tmp_output_path, dest)

            return {
                'command_args': ' '.join(str(arg) for arg in command_result.args),
                'return_code': command_result.returncode,
                'stdout': command_result.stdout,
                'stderr': command_result.stderr
            }

    except subprocess.CalledProcessError as e:
        # ffmpeg ran but exited with a non-zero status: keep everything it printed.
        return {
            'command_args': ' '.join(str(arg) for arg in command),
            'return_code': e.returncode,
            'stdout': e.stdout or '',
            'stderr': e.stderr or ''
        }
    except Exception as ex:
        # Anything else (ffmpeg missing, copy failure, null path, ...).
        # str() keeps the join safe even if an argument is None.
        return {
            'command_args': ' '.join(str(arg) for arg in command),
            'return_code': -1,
            'stdout': '',
            'stderr': str(ex)
        }

# To turn a Python function into a Spark udf, its return type has to be
# declared in PySpark terms: a struct with one field per key of the dict
# returned by process_video.
schema = StructType([
    StructField("command_args", StringType(), True),
    StructField("return_code", IntegerType(), True),
    StructField("stdout", StringType(), True),
    StructField("stderr", StringType(), True)
])

# Register and wrap the python function into a udf wrapper.
# process_video runs ffmpeg and writes files, so it is marked nondeterministic:
# Spark treats udfs as deterministic by default and may then invoke them more
# times than they appear in the query.
process_video_udf = udf(process_video, schema).asNondeterministic()

In [None]:
# Only process videos whose destination file does not exist yet.
df = (
    file_check
    .filter(~col('file_exists'))
    .withColumn("video_processing", process_video_udf(col('src'), col('dest')))
)

# show() prints the rows itself and returns None, so it must not be wrapped
# in display(). This action is what actually triggers the ffmpeg runs.
df.select("video_processing.*").show(truncate=False)