## Imports & Installs

In [None]:
# Mount Drive & navigate to folder
# Colab-only: mounts the user's Google Drive at /content/drive so the project
# files stored there become visible on the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

# You may need to change the line below to the location of the _AMPPD folder if
# the _AMPPD folder isn't in the 'root' of your Drive ('My Drive')
%cd '/content/drive/My Drive/_AMPPD TEAM SHARED FOLDER'

# Second hop is relative to the shared folder entered above. Every relative path
# used later in this notebook ("temp/...", "../Clips/...") is resolved from here.
%cd 'MGMs/Video OCR/Tesseract'

In [None]:
# Install tesseract
!pip install pytesseract
!sudo apt install tesseract-ocr
!sudo apt install libtesseract-dev

In [None]:
# Checking tesseract version
# Sanity check for the install cell: the call is the last expression of the
# cell, so the notebook displays whatever version pytesseract reports.
import pytesseract
pytesseract.get_tesseract_version()

In [None]:
# Python imports
try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract
import os
import json

from datetime import timedelta
from decimal import Decimal

In [None]:
# sample tesseract output
# Smoke test: run OCR on one previously extracted frame and print the raw
# dictionary returned by pytesseract, to show which fields are available.
from pytesseract import Output

sample_frame_path = "temp/little500.mp4/frame_00001.jpg"
if os.path.exists(sample_frame_path):
    # Context manager closes the image file handle once OCR is done.
    with Image.open(sample_frame_path) as sample_image:
        result = pytesseract.image_to_data(sample_image, output_type=Output.DICT)
    print(result)
else:
    # The frames are written by the "Save frames" cell further down, so on a
    # fresh run this file does not exist yet; skip instead of stopping Run All.
    print("Skipping sample OCR: " + sample_frame_path + " not found (run the 'Save frames' cell first)")

## Generate samples for testing

In [None]:
# Make samples from full videos for testing

# Commands that were used to cut some of the clips out of the full-length
# videos (kept for reference; -ss is the start offset, -t the clip length):
#!ffmpeg -ss 00:8:00.0 -i "../../../media/IU Archives/Little 500 Variety Video/access/Little 500 Variety Video.mp4" -c copy -t 00:05:00.0 "little500.mp4"
#!ffmpeg -ss 00:00:00.0 -i "../Day_of_desperation-first_10min.mp4" -c copy -t 00:05:00.0 "dod.mp4"
#!ffmpeg -ss 00:15:00.0 -i "../West_side_story-first_30min.mp4" -c copy -t 00:05:00.0 "wss.mp4"

# One entry per test clip:
#   filename        - name of the clip file
#   sample_start    - offset of the clip inside its source video, in seconds
#   sample_duration - length of the clip, in seconds
samples = [
    dict(filename="little500.mp4", sample_start=480, sample_duration=300),
    dict(filename="dod.mp4", sample_start=0, sample_duration=300),
    dict(filename="wss.mp4", sample_start=900, sample_duration=300),
    dict(filename="little_500_end_credits.mp4", sample_start=1644, sample_duration=52),
    dict(filename="astin_patten_transparencies.mp4", sample_start=2047, sample_duration=472),
]

## Save frames

For each video in the samples, save 2 frames every second as `.jpg` in a temporary directory.

In [None]:
#clear old images
!rm -rf "temp"/*

# Save 2 frames a second
# Could probably do this with cv2 as well. Not sure how to save every nth frame
# https://www.geeksforgeeks.org/python-program-extract-frames-using-opencv/
import time

for s in samples:
  start = time.time()
  name = s["filename"]
  !mkdir "temp/{name}"
  !ffmpeg -i "../Clips/{name}" -an -vf fps=2 "temp/{name}/frame_%05d.jpg" 
  print("Finished " + name + " in " + str(time.time()-start) + "s")


## Utility Functions

In [None]:
# UTIL FUNCTIONS
def getDimensions(path):
  dim = !ffprobe -v error -select_streams v:0 -show_entries stream=width,height -of csv=s=x:p=0 "{path}"
  print(path)
  print(dim)
  return dim[0].split("x")


def getFramerate(path):
  """Return the frame rate of the first video stream of `path`.

  ffprobe reports r_frame_rate as a fraction such as "30/1" or "30000/1001".
  The fraction is evaluated (previously only the numerator was returned, so
  "30000/1001" came back as 30000). Whole-number rates are returned as int,
  fractional rates as float. An undefined rate (zero denominator) yields 0.
  The value is also printed.
  """
  fr = !ffprobe -v error -select_streams v:0 -show_entries stream=r_frame_rate -of default=nokey=1:noprint_wrappers=1  "{path}"
  numerator_text, _, denominator_text = fr[0].strip().partition('/')
  numerator = int(numerator_text)
  # A bare number without "/" is treated as "<number>/1".
  denominator = int(denominator_text) if denominator_text else 1
  if denominator == 0:
    framerate = 0
  elif numerator % denominator == 0:
    framerate = numerator // denominator
  else:
    framerate = numerator / denominator
  print(framerate)
  return framerate

def getNumFrames(path):
  nf = !ffprobe -v error -select_streams v:0 -show_entries stream=nb_frames -of default=nokey=1:noprint_wrappers=1 "{path}"
  print(int(nf[0]))
  return int(nf[0])
  

## Run Tesseract & get JSON outputs

For every sample, run Tesseract OCR on every saved frame. Write the results to a JSON file.

In [None]:
# RUN TESSERACT & GET JSON OUTPUT
from pytesseract import Output
import time

# Seconds between two consecutive saved frames (frames were extracted at 2 per second).
FRAME_INTERVAL = 0.5

for s in samples: # for all our sample videos

  print("Running: " + s["filename"])
  script_start = time.time()

  directory = "temp/" + s["filename"] # folder holding the frames saved for this clip

  output_name = s["filename"] + "-ocr.json" # name an output file for our final json

  # Get some stats on the video
  dim = getDimensions("../Clips/"+s['filename'])
  framerate = getFramerate("../Clips/"+s['filename'])
  numFrames = getNumFrames("../Clips/"+s['filename'])
  width = int(dim[0])
  height = int(dim[1])

  # Establish the basic fields of our output schema
  output = {
      "media": {
          "filename": s["filename"],
          "duration": str(s["sample_duration"]),
          "framerate": framerate,
          "numFrames": numFrames,
          "resolution": {
              "width": width,
              "height": height
          }
      },
      "frames": []
  }

  # Only the extracted frame images; any other file in the folder would make
  # Image.open fail and would shift the frame index used for the timestamps.
  frame_files = sorted(f for f in os.listdir(directory) if f.lower().endswith(".jpg"))

  for num, img in enumerate(frame_files): # for every saved frame
    # Timestamp relative to the source video: the clip's offset plus half a
    # second per saved frame. (The old `start_time =+ (.5*num)` parsed as
    # `start_time = +(.5*num)` and silently dropped the clip offset.)
    start_time = s["sample_start"] + FRAME_INTERVAL * num
    frame = {
        "start": str(start_time),
        "boundingBoxes": []
    }

    # Run OCR; the context manager closes the image file once it has been read
    with Image.open(directory+"/"+img) as frame_image:
      result = pytesseract.image_to_data(frame_image, output_type=Output.DICT)

    # For every result, make a box & add it to the list of boxes for this frame
    for i in range(len(result["text"])):
      if result["text"][i].strip(): # if the text isn't empty/whitespace
        left = result["left"][i]
        top = result["top"][i]
        box = {
            "text": result["text"][i],
            "score": {
                "type":"confidence",
                "scoreValue": result["conf"][i]
            },
            # relative coords: pixel positions divided by the video resolution
            "vertices": {
                "xmin": left/width,
                "ymin": top/height,
                "xmax": (left + result["width"][i])/width,
                "ymax": (top + result["height"][i])/height
            }
        }
        frame["boundingBoxes"].append(box)

    # save frame if it had text
    if len(frame["boundingBoxes"]) > 0:
      output["frames"].append(frame)

  with open(output_name, 'w') as outfile:
    json.dump(output, outfile)
  print("Finished " + output_name + " in " + str(time.time()-script_start) + "s")

## Visualize Results

In [None]:
# VISUALIZE RANDOM RESULT
# Draws the OCR boxes of one randomly chosen frame on top of the matching video frame.

import random
import cv2
from matplotlib import pyplot as plt
from matplotlib import patches, text
import json

# Sample to choose from
sample = samples[0]#random.randrange(0,2)]

# Open JSON file
with open(sample["filename"] + "-ocr.json", 'r') as infile:
  data = json.load(infile)

# Only frames that contained text are stored, so the list can be empty
if not data["frames"]:
  raise ValueError("No OCR frames to visualize in " + sample["filename"] + "-ocr.json")

# Select a random frame to show
n = random.randrange(0, len(data["frames"]))
frame = data["frames"][n]
print(frame)


# Prep plot
fig, ax = plt.subplots(figsize=(20,12))


# Capture video @ start time
# "start" is stored as a string in the JSON, so convert it before doing arithmetic
frame_time = float(frame["start"]) - sample["sample_start"] # adjusting for the fact not all samples start @ 0
vidcap = cv2.VideoCapture("../Clips/"+sample["filename"])
print(frame_time)
vidcap.set(cv2.CAP_PROP_POS_MSEC, frame_time*1000 + 240) # FIX: why is this 240 adjustment needed?
success,image = vidcap.read()
vidcap.release() # free the video file handle once the frame has been read

# Pixel size used to scale the relative box coordinates
img_width = data["media"]["resolution"]["width"]
img_height = data["media"]["resolution"]["height"]

# Plot image
if success:
    img_height, img_width = image.shape[:2] # scale boxes to the image actually shown
    ax.imshow(image[...,::-1]) # bgr to rgb
else:
    # No image to show: still give the axes image-like extents (origin top-left)
    ax.set_xlim(0, img_width)
    ax.set_ylim(img_height, 0)


# Plot OCR results
# "vertices" holds relative xmin/ymin/xmax/ymax values, so multiply by the
# image size to get pixel coordinates
for box in frame["boundingBoxes"]:
  vertices = box["vertices"]
  x = vertices["xmin"] * img_width
  y = vertices["ymin"] * img_height
  w = (vertices["xmax"] - vertices["xmin"]) * img_width
  h = (vertices["ymax"] - vertices["ymin"]) * img_height

  rect = patches.Rectangle((x,y),w,h, linewidth=2, edgecolor='r', facecolor='none')
  ax.add_patch(rect)
  ax.text(x-2,y-2, box["text"], fontsize=18, color='r')


plt.show()

