# imports

In [90]:
import glob, os, sys, io, skvideo.io, argparse, math, datetime, ffmpy, shutil, wikipedia
from google.cloud import videointelligence_v1p1beta1 as videointelligence
from google.cloud import vision
from google.cloud import storage
from google.cloud.vision import types
from google.oauth2 import service_account
from PIL import Image, ImageDraw
import numpy as np
import deepdish as dd

In [2]:
# credential path must be set outside function
# Builds the service-account credentials object once, at notebook level, from a
# JSON key file on disk. extract_features() reads this global when it creates
# its Video Intelligence client.
credentials = service_account.Credentials.from_service_account_file('../../google-credentials/memory-dynamics-73f025c2a094.json')

# feature extraction with google cloud video intelligence api

In [10]:
def extract_features(movie_to_process, bucket_name):
    """Annotate a video held in Google Cloud Storage with the Video Intelligence API.

    Requests label detection (shot- and frame-level) and speech transcription
    (language code 'en-GB') for the object ``gs://<bucket_name>/<movie_to_process>``
    and blocks until the long-running operation has finished.

    Parameters
    ----------
    movie_to_process : str
        Object name of the video inside the bucket, e.g. 'sherlock-mini-test.mp4'.
    bucket_name : str
        Name of the Cloud Storage bucket, e.g. 'sherlock_movie'.

    Returns
    -------
    The value returned by ``operation.result`` for the annotation request.

    Notes
    -----
    Relies on the notebook-level ``credentials`` object for authentication.
    """
    # Build the input URI from the arguments instead of a hard-coded clip, so
    # the function processes the video the caller asked for.
    # ('sherlock-mini-test.mp4', 'sherlock_movie') still resolves to
    # 'gs://sherlock_movie/sherlock-mini-test.mp4'.
    input_uri = 'gs://{}/{}'.format(bucket_name, movie_to_process)

    video_client = videointelligence.VideoIntelligenceServiceClient(credentials=credentials)

    # features to extract
    features = [
        videointelligence.enums.Feature.LABEL_DETECTION,
        videointelligence.enums.Feature.SPEECH_TRANSCRIPTION,
    ]

    # request shot-level and frame-level labels
    label_mode = videointelligence.enums.LabelDetectionMode.SHOT_AND_FRAME_MODE
    label_config = videointelligence.types.LabelDetectionConfig(label_detection_mode=label_mode)

    # set language context to British English FOR SHERLOCK
    speech_config = videointelligence.types.SpeechTranscriptionConfig(language_code='en-GB')

    # pass label detection and speech transcription configs to the video context
    context = videointelligence.types.VideoContext(
        label_detection_config=label_config,
        speech_transcription_config=speech_config,
    )

    # start the annotation request
    operation = video_client.annotate_video(input_uri, features=features, video_context=context)
    print('processing video')

    # wait for the operation to complete, giving up after 5000 seconds
    result = operation.result(timeout=5000)
    print('finished processing video')

    return result
    

In [11]:
# smoke test: annotate the short Sherlock clip stored in the 'sherlock_movie' bucket
test_result = extract_features(
    movie_to_process='sherlock-mini-test.mp4',
    bucket_name='sherlock_movie',
)


processing video

finished processing video


In [91]:
# Write the annotation result returned by extract_features to disk with deepdish.
# NOTE(review): the target path has no file extension - confirm this is intended.
dd.io.save('../data/gcloud outputs/sherlock-mini-test', test_result)

In [None]:
# movie_to_process is sherlock_video.mp4
# bucket is sherlock_movie

# where do I want to output to?

annotation_results {
  input_uri: "/sherlock_movie/sherlock-mini-test.mp4"
  segment_label_annotations {
    entity {
      entity_id: "/m/09kqc"
      description: "humour"
      language_code: "en-US"
    }
    category_entities {
      entity_id: "/m/01g317"
      description: "person"
      language_code: "en-US"
    }
    segments {
      segment {
        start_time_offset {
        }
        end_time_offset {
          seconds: 45
          nanos: 40000000
        }
      }
      confidence: 0.4066888689994812
    }
  }
  segment_label_annotations {
    entity {
      entity_id: "/m/0f2f9"
      description: "television program"
      language_code: "en-US"
    }
    segments {
      segment {
        start_time_offset {
        }
        end_time_offset {
          seconds: 45
          nanos: 40000000
        }
      }
      confidence: 0.6629244089126587
    }
  }
  shot_label_annotations {
    entity {
      entity_id: "/m/083mg"
      description: "walking"
      language_c

# scene details

In [None]:
# needs work

# characters on screen (analogous to name - all)

### get frames with people in them

In [83]:
def get_person_frames(result):
    """Return sorted, de-duplicated time offsets (seconds) of 'person' frames.

    Looks at the frame-level label annotations of the first annotation result.
    For every annotation that has a category entity described as 'person', the
    time offset of that annotation's first frame is collected
    (``seconds + nanos / 1e9``).

    Parameters
    ----------
    result : annotation response exposing
        ``annotation_results[0].frame_label_annotations``.

    Returns
    -------
    list of float
        Unique time offsets in ascending order.
    """
    annotations = result.annotation_results[0].frame_label_annotations
    unique_offsets = set()
    for annotation in annotations:
        shows_person = any(entity.description == 'person'
                           for entity in annotation.category_entities)
        if not shows_person:
            continue
        # only the first frame listed for the annotation is used
        first_offset = annotation.frames[0].time_offset
        unique_offsets.add(first_offset.seconds + first_offset.nanos / 1e9)
    return sorted(unique_offsets)

In [85]:
# test: list the person-frame time offsets (in seconds) found in the mini-clip annotations
get_person_frames(test_result)

[0.733627,
 4.660952,
 5.696047,
 8.995336,
 16.049651,
 20.269384,
 36.923075,
 42.104165,
 43.253441]

### extract image frames

In [86]:
# get image from video
def extract_image_from_video(video_input, name_output, time_stamp):
    """Grab a single frame from a video file by running ffmpeg.

    Runs ``ffmpeg -i <video_input> -ss <time_stamp> -frames:v 1 <name_output>``.
    The command is passed as an argument list (no shell), so paths containing
    spaces or shell metacharacters are handled safely.

    Parameters
    ----------
    video_input : str
        Path of the video to read.
    name_output : str
        Path of the image file to write.
    time_stamp : str or number
        Position to seek to; converted with ``str()``, so the float offsets
        returned by get_person_frames can be passed directly.

    Returns
    -------
    int or str
        ffmpeg's exit code (0 on success), or the string
        'Error in extract_image_from_video' if the process could not be
        started (e.g. ffmpeg is not installed) or an argument is invalid.
    """
    # function-scope import: subprocess is not in the notebook's import cell
    import subprocess

    command = [
        'ffmpeg',
        '-i', str(video_input),
        '-ss', str(time_stamp),
        '-frames:v', '1',
        str(name_output),
    ]
    try:
        # return code is 0 if successful
        return subprocess.call(command)
    except (OSError, ValueError):
        return('Error in extract_image_from_video')

# crop around faces in extracted frames
def crop_image(input_image, output_image, start_x, start_y, width, height, output_dir='../images/'):
    """Crop a padded region around a bounding box and save it as a PNG.

    The crop region is the box ``(start_x, start_y, width, height)`` extended
    on every side by half the box width / height (rounded up), leaving buffer
    space around the face. The crop box is not limited to the image bounds.

    Parameters
    ----------
    input_image : str
        Image to open with ``Image.open``.
    output_image : str
        Output file name without extension; '.png' is appended.
    start_x, start_y : number
        Left / upper coordinate of the bounding box.
    width, height : number
        Size of the bounding box.
    output_dir : str, optional
        Directory the cropped image is written to. Defaults to '../images/',
        the location that was previously hard-coded.

    Returns
    -------
    None
    """
    # crop with buffer space for mistakes & for google vision to handle
    left = int(start_x - np.ceil(width/2))
    upper = int(start_y - np.ceil(height/2))
    right = int(start_x + width + np.ceil(width/2))
    lower = int(start_y + height + np.ceil(height/2))

    # PIL expects (left, upper, right, lower)
    box = (left, upper, right, lower)

    # save inside the context manager so the source image is still open
    with Image.open(input_image) as input_img:
        output_img = input_img.crop(box)
        output_img.save(os.path.join(output_dir, output_image + '.png'))
    return
    
# pass cropped images to google vision
def detect_face(face_file, max_results=4):
    """Run Google Cloud Vision face detection on an open image file.

    Parameters
    ----------
    face_file : file object
        Open image file; its entire content is read and sent to the API.
        Presumably opened in binary mode - confirm against the caller.
    max_results : int, optional
        Maximum number of faces to request from the API (default 4).

    Returns
    -------
    The ``face_annotations`` of the face-detection response.
    """
    client = vision.ImageAnnotatorClient()
    content = face_file.read()
    image = types.Image(content=content)

    # forward max_results, which was previously accepted but never used
    response = client.face_detection(image=image, max_results=max_results)
    return response.face_annotations

def highlight_faces(image, faces):
    """Outline each detected face and return the face boxes.

    A closed green polygon is drawn around every face on an in-memory copy of
    the image. The annotated image is not saved or returned; only the box
    list is.

    Parameters
    ----------
    image : image to open with ``Image.open``.
    faces : iterable of face annotations exposing ``bounding_poly.vertices``
        (each vertex having ``x`` and ``y``).

    Returns
    -------
    list of [x, y, width, height]
        One entry per face: x / y come from vertex 0, width from the x
        distance between vertices 0 and 1, and height from the y distance
        between vertices 0 and 3.
    """
    # the original initialised 'faces_boxes' but appended to 'boxed_faces',
    # which raised NameError on the first face
    boxed_faces = []
    im = Image.open(image)
    draw = ImageDraw.Draw(im)

    for face in faces:
        box = [(vertex.x, vertex.y)
               for vertex in face.bounding_poly.vertices]
        # repeat the first vertex to close the polygon
        draw.line(box + [box[0]], width=5, fill='#00ff00')
        boxed_faces.append([box[0][0], box[0][1], box[1][0] - box[0][0], box[3][1] - box[0][1]])
    return (boxed_faces)

In [None]:
# NOTE(review): extract_image_from_video is defined with three required positional
# parameters (video_input, name_output, time_stamp); this call passes none of them.
extract_image_from_video()

TypeError: Required argument 'command' (pos 1) not found

# camera angle

# indoor vs outdoor

# character speaking

# location

# music presence

# name - focus

# words on screen