# imports

In [38]:
import glob, os, sys, io, skvideo.io, argparse, math, datetime, ffmpy, shutil, wikipedia, pandas
import subprocess
from google.cloud import videointelligence_v1p1beta1 as videointelligence
from google.cloud import vision
from google.cloud import storage
from google.cloud.vision import types
from google.oauth2 import service_account
from PIL import Image, ImageDraw
import numpy as np
import deepdish as dd

In [61]:
# credential path must be set outside function
# The key file location can be overridden with the GOOGLE_APPLICATION_CREDENTIALS
# environment variable; when it is not set, the original relative path is used.
credentials_path = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS', '../../google-credentials/memory-dynamics-73f025c2a094.json')
credentials = service_account.Credentials.from_service_account_file(credentials_path)

# feature extraction with google cloud video intelligence api

In [2]:
def extract_features(movie_to_process, bucket_name):
    """Request label detection and speech transcription for a video stored in a bucket.

    Parameters
    ----------
    movie_to_process : str
        Object name of the video inside the bucket, e.g. 'sherlock-mini-test.mp4'.
        Include any folder prefix (e.g. 'video_processing/...') in this name.
    bucket_name : str
        Name of the Google Cloud Storage bucket, e.g. 'sherlock_movie'.

    Returns
    -------
    The value returned by ``operation.result(...)`` for the annotation request.
    Labels are requested in SHOT_AND_FRAME_MODE and speech is transcribed
    with the 'en-GB' language code.

    Notes
    -----
    Uses the module-level ``credentials`` object. The call blocks until the
    operation finishes or the 50000 second timeout passed to ``result`` expires.
    """

    # path to the video input file, built from the arguments
    # (this used to be hard-coded to the test clip, ignoring both parameters)
    input_uri = 'gs://' + bucket_name + '/' + movie_to_process

    video_client = videointelligence.VideoIntelligenceServiceClient(credentials=credentials)

    # request extracted features
    features = [videointelligence.enums.Feature.LABEL_DETECTION, videointelligence.enums.Feature.SPEECH_TRANSCRIPTION]

    # request shot-level and frame_level labels, assign to label detection config
    label_mode = videointelligence.enums.LabelDetectionMode.SHOT_AND_FRAME_MODE
    label_config = videointelligence.types.LabelDetectionConfig(label_detection_mode=label_mode)

    # set language context to British english FOR SHERLOCK
    speech_config = videointelligence.types.SpeechTranscriptionConfig(language_code='en-GB')

    # pass label detection and speech transcription configs to context
    context = videointelligence.types.VideoContext(label_detection_config=label_config, speech_transcription_config=speech_config)

    # define analysis from parameters
    operation = video_client.annotate_video(input_uri, features=features, video_context=context)
    print('processing video')

    # wait (up to the timeout, in seconds) for the operation to finish
    result = operation.result(timeout=50000)
    print('finished processing video')

    return result
    

In [None]:
# test: run extract_features on 'sherlock-mini-test.mp4' from the 'sherlock_movie' bucket
# and keep the response in test_result for the cells below
test_result = extract_features('sherlock-mini-test.mp4','sherlock_movie')

In [47]:
from google.protobuf.json_format import MessageToDict, MessageToJson

In [48]:
# convert the whole response with the protobuf json_format helper MessageToJson
asjson = MessageToJson(test_result)

In [58]:
#test_result.annotation_results[0]

In [54]:
# convert only the first annotation result with MessageToDict so it can be inspected
asdict = MessageToDict(test_result.annotation_results[0])

In [56]:
# display the converted annotation result
asdict

{'frameLabelAnnotations': [{'entity': {'description': 'photograph',
    'entityId': '/m/068jd',
    'languageCode': 'en-US'},
   'frames': [{'confidence': 0.9491167664527893, 'timeOffset': '0.733627s'},
    {'confidence': 0.9492044448852539, 'timeOffset': '1.798869s'},
    {'confidence': 0.9538643956184387, 'timeOffset': '5.696047s'},
    {'confidence': 0.9483444094657898, 'timeOffset': '11.892254s'},
    {'confidence': 0.9477034211158752, 'timeOffset': '20.269384s'},
    {'confidence': 0.9528136849403381, 'timeOffset': '21.271281s'},
    {'confidence': 0.9494990706443787, 'timeOffset': '22.144528s'},
    {'confidence': 0.9504711627960205, 'timeOffset': '24.396657s'}]},
  {'entity': {'description': 'tourism',
    'entityId': '/m/07bxq',
    'languageCode': 'en-US'},
   'frames': [{'confidence': 0.4922598600387573, 'timeOffset': '0.733627s'},
    {'confidence': 0.515313982963562, 'timeOffset': '1.798869s'},
    {'confidence': 0.5465244650840759, 'timeOffset': '2.808807s'},
    {'confide

In [46]:
# check which type a single annotation result has
type(test_result.annotation_results[0])

google.cloud.videointelligence_v1p1beta1.types.VideoAnnotationResults

In [None]:
# store the response with deepdish so it can be loaded again later
# without calling extract_features a second time
dd.io.save('../data/gcloud outputs/sherlock-mini-test', test_result)

## load in previously returned results

In [6]:
# reload the stored response from disk into test_result
test_result = dd.io.load('../data/gcloud outputs/sherlock-mini-test')

In [None]:
# movie_to_process is sherlock_video.mp4
# bucket is sherlock_movie

# where do I want to output to?

# scene details

In [None]:
# needs work

# characters on screen (analogous to name - all)

### get frames with people in them

In [5]:
def get_person_frames(result):
    """Return the sorted, de-duplicated time offsets (seconds) of 'person' frame labels.

    Looks at the frame label annotations of the first annotation result. For
    every label that has a category entity described as 'person', the time
    offset of that label's FIRST frame is collected (seconds + nanos / 1e9).

    Parameters
    ----------
    result : object with ``annotation_results[0].frame_label_annotations``

    Returns
    -------
    list of float, ascending, without duplicates
    """
    annotations = result.annotation_results[0].frame_label_annotations
    offsets = set()
    for annotation in annotations:
        has_person_category = any(
            entity.description == 'person'
            for entity in annotation.category_entities
        )
        if not has_person_category:
            continue
        # only the first frame of the label is used
        first_offset = annotation.frames[0].time_offset
        offsets.add(first_offset.seconds + first_offset.nanos / 1e9)
    return sorted(offsets)

In [7]:
# test: collect the 'person' frame offsets from the stored response
person_frames = get_person_frames(test_result)

In [8]:
# display the collected offsets (seconds)
person_frames

[0.733627,
 4.660952,
 5.696047,
 8.995336,
 16.049651,
 20.269384,
 36.923075,
 42.104165,
 43.253441]

### extract image frames

In [9]:
# helper functions called by the main function (get_names)

# clear previous tmp folders before starting
def cleanup():
    """Delete the temporary working folder '../images/tmp/' and everything inside it.

    Does nothing when the folder does not exist (e.g. on the very first run),
    so it is safe to call at the start of every run.

    Raises
    ------
    ValueError
        If the folder exists but could not be removed.
    """
    tmp_dir = '../images/tmp/'
    if not os.path.isdir(tmp_dir):
        # nothing left over from a previous run
        return None
    try:
        # rmtree removes the sub-folders and stills as well; os.rmdir only
        # works on an empty directory and therefore failed on a used tmp folder
        shutil.rmtree(tmp_dir)
    except OSError as err:
        raise ValueError('error in cleanup: could not remove tmp folder ({})'.format(err))


# get image from video
def extract_image_from_video(video_input, name_output, time_stamp):
    """Grab a single frame from a video with ffmpeg and write it to name_output.

    Parameters
    ----------
    video_input : str
        Path or URL of the video handed to ffmpeg's ``-i`` option.
    name_output : str
        Path of the image file ffmpeg should write.
    time_stamp : str
        Position passed to ffmpeg's ``-ss`` option (converted with ``str``).

    Returns
    -------
    int or str
        ffmpeg's exit code (0 means success), or the string
        'Error in extract_image_from_video' when ffmpeg could not be started.
    """
    # An argument list (no shell) keeps paths with spaces intact and prevents
    # anything inside the arguments from being interpreted as a shell command.
    command = [
        'ffmpeg',
        '-i', str(video_input),
        '-ss', str(time_stamp),
        '-frames:v', '1',
        str(name_output),
    ]
    try:
        # exit code; should be 0 if successful
        return subprocess.call(command)
    except (OSError, ValueError):
        # e.g. ffmpeg is not installed / not on PATH
        return 'Error in extract_image_from_video'


# crop around faces in extracted frames
def crop_image(input_image, output_image, start_x, start_y, width, height):
    """Cut a padded box around a face out of a frame and save it as a PNG.

    The box described by (start_x, start_y, width, height) is enlarged on every
    side by half of its width / height (rounded up). The resulting coordinates
    are not clamped to the image bounds.

    Parameters
    ----------
    input_image : path of the frame to open
    output_image : name (without extension) of the file written below '../images/'
    start_x, start_y : left / upper corner of the face box
    width, height : size of the face box

    Returns
    -------
    str
        Path of the saved crop: '../images/' + output_image + '.png'.
    """
    frame = Image.open(input_image)

    # padding: half the face size on each side, as slack for detection
    # mistakes and to give google vision some context around the face
    pad_x = np.ceil(width/2)
    pad_y = np.ceil(height/2)

    left = int(start_x - pad_x)
    upper = int(start_y - pad_y)
    right = int(start_x + width + pad_x)
    lower = int(start_y + height + pad_y)

    destination = '../images/' + output_image + '.png'
    frame.crop((left, upper, right, lower)).save(destination)
    return destination
    

# pass an image to google vision face detection
def detect_face(face_file, max_results=4):
    """Run Google Vision face detection on an open image file.

    Parameters
    ----------
    face_file : binary file object; its content is read with ``read()``
    max_results : accepted for compatibility with callers, currently not used

    Returns
    -------
    The ``face_annotations`` of the face detection response.
    """
    vision_client = vision.ImageAnnotatorClient(credentials=credentials)
    image_bytes = face_file.read()
    response = vision_client.face_detection(image=types.Image(content=image_bytes))

    # the annotations carry the face coordinates
    return response.face_annotations


def highlight_faces(image, faces):
    """Outline every detected face on the image and return the face boxes.

    The outlines are drawn on an in-memory copy only; nothing is written
    back to disk.

    Parameters
    ----------
    image : path (or file object) of the image to open
    faces : iterable of face annotations with ``bounding_poly.vertices``

    Returns
    -------
    list of [x, y, width, height]
        One entry per face, where (x, y) is the first vertex, width is the
        x distance to the second vertex and height the y distance to the
        fourth vertex.
    """
    # the original initialised 'faces_boxes' but appended to / returned
    # 'boxed_faces', which raised a NameError on every call
    boxed_faces = []
    with Image.open(image) as im:
        draw = ImageDraw.Draw(im)

        for face in faces:
            box = [(vertex.x, vertex.y)
                   for vertex in face.bounding_poly.vertices]
            # close the polygon by returning to the first vertex
            draw.line(box + [box[0]], width=5, fill='#00ff00')
            boxed_faces.append([box[0][0], box[0][1], box[1][0] - box[0][0], box[3][1] - box[0][1]])
    return boxed_faces


# use google web detection client to do image search with face
def annotate(path):
    """Run Google Vision web detection on an image.

    Parameters
    ----------
    path : str
        Either a remote location (starting with 'http' or 'gs:'), which is
        passed on as an image URI, or a local file path whose bytes are read
        and sent.

    Returns
    -------
    The ``web_detection`` part of the response.
    """
    vision_client = vision.ImageAnnotatorClient(credentials=credentials)

    is_remote = path.startswith(('http', 'gs:'))
    if is_remote:
        image = types.Image()
        image.source.image_uri = path
    else:
        with io.open(path, 'rb') as image_file:
            image = types.Image(content=image_file.read())

    return vision_client.web_detection(image=image).web_detection


def report(annotations, max_report=5):
    """Print the leading web entities of a web detection result and return their descriptions.

    Parameters
    ----------
    annotations : object with a ``web_entities`` sequence; every entity has
        ``score`` and ``description``
    max_report : stop after this many entities (at least one entity is always
        reported when any exist)

    Returns
    -------
    list
        Descriptions of the reported entities, in order; empty when there are
        no web entities.
    """
    names = []
    entities = annotations.web_entities

    if not entities:
        return names

    print ('{} Web entities found: '.format(len(entities)))
    for seen, entity in enumerate(entities, start=1):
        print('Score      : {}'.format(entity.score))
        print('Description: {}'.format(entity.description))
        names.append(entity.description)
        if seen >= max_report:
            break
    return names


In [66]:
# MAIN FUNCTION
def get_names(movie_to_process, bucket_name, timestamps_to_pull):
    """Pull stills from a video, find the faces in them and look the faces up on the web.

    For every timestamp (plus positions 0.15 s before / after neighbouring
    timestamps) a still is extracted with ffmpeg, faces are detected, each
    face is cropped with a margin and sent to web detection. Each crop is
    also copied to '../images/<candidate names joined by _>.png'.

    Parameters
    ----------
    movie_to_process : str
        Object name of the video inside the bucket.
    bucket_name : str
        Bucket name; the video is read from
        'https://storage.googleapis.com/<bucket_name>/<movie_to_process>'.
    timestamps_to_pull : sequence of numbers
        Positions in seconds at which stills are extracted.

    Returns
    -------
    dict
        Maps the path of every cropped face image to the list of candidate
        names reported for it. Empty when there were no timestamps or no faces.
    """
    video_location = 'https://storage.googleapis.com/' + bucket_name + '/' + movie_to_process
    max_results = 3
    names_by_face = {}

    # also try .15 seconds on either side of the pulled frame for a better shot at a result
    timestamps_to_pull = list(timestamps_to_pull)
    timestamps_to_pull_tmp = (timestamps_to_pull
                              + [i + 0.15 for i in timestamps_to_pull[:-1]]
                              + [i - 0.15 for i in timestamps_to_pull[1:]])

    if len(timestamps_to_pull_tmp) == 0:
        return names_by_face

    # create directory structure (no error when it already exists)
    images_root = '../images/'
    filepath = images_root + 'tmp/'
    for subfolder in ('', 'faces_found', 'text_found', 'face_images'):
        os.makedirs(os.path.join(filepath, subfolder), exist_ok=True)

    # make stills
    for cnt, ttp in enumerate(timestamps_to_pull_tmp):
        # get the still image at that timestamp
        time_stamp = str(datetime.timedelta(seconds=ttp))
        still_name = 'still_' + str(cnt)
        filePathAndName = filepath + still_name + '.png'
        ret = extract_image_from_video(video_input=video_location, name_output=filePathAndName, time_stamp=time_stamp)

        # skip this timestamp instead of crashing on a missing file
        if ret != 0 or not os.path.isfile(filePathAndName):
            print('could not extract a still at {} (ffmpeg returned {})'.format(time_stamp, ret))
            continue

        # find face on still image; the file only needs to be open for the upload
        with open(filePathAndName, 'rb') as image:
            faces = detect_face(image, max_results)
        print('Found {} face{}'.format(len(faces), '' if len(faces) == 1 else 's'))

        print('Searching web detection for a face {}'.format(filePathAndName))
        faces_boxes = highlight_faces(filePathAndName, faces)
        print('faces_boxes:', faces_boxes)

        for count, face_box in enumerate(faces_boxes):
            # crop_image itself prepends '../images/' and appends '.png', so
            # only the part relative to the images folder is passed here
            crop_name = 'tmp/faces_found/' + still_name + '_' + str(count) + '_faces'
            saved_name = crop_image(filePathAndName, crop_name, face_box[0], face_box[1], face_box[2], face_box[3])

            # get candidate names for the face from web detection
            potential_names = report(annotate(saved_name), 5)
            print('potential_names: ', potential_names)
            names_by_face[saved_name] = potential_names

            # file the crop under its candidate names; path separators inside a
            # name would otherwise point into directories that do not exist
            safe_names = [str(name).replace('/', '-').replace(os.sep, '-') for name in potential_names if name]
            if not safe_names:
                continue
            new_name = images_root + '_'.join(safe_names) + '.png'
            shutil.copy(saved_name, new_name)

    return names_by_face


### putting it all together

In [67]:
# video object and bucket to process
movie_name = 'sherlock-mini-test.mp4'
bucket_name = 'sherlock_movie'

# clean up from last run
cleanup()

# The two steps below are commented out, so person_frames must already exist
# in the kernel (it is computed in the earlier get_person_frames test cell).
# test_result = extract_features(movie_name, bucket_name)
# person_frames = get_person_frames(test_result)
get_names(movie_name, bucket_name, person_frames)

print('finished!')

PermissionDenied: 403 Cloud Vision API has not been used in project 269332579786 before or it is disabled. Enable it by visiting https://console.developers.google.com/apis/api/vision.googleapis.com/overview?project=269332579786 then retry. If you enabled this API recently, wait a few minutes for the action to propagate to our systems and retry.

# camera angle

# indoor vs outdoor

# character speaking

# location

# music presence

# name - focus

# words on screen