# Ad Video Annotation

Create a root directory with a suitable name, create a sub directory named `script` inside it, and add the following code to that folder.

__Importing the necessary libraries__

In [None]:
import sys
import numpy as np
import cv2
import os
import os.path
import moviepy.editor as mp
from pydub import AudioSegment
import pytesseract
from PIL import Image
import pandas as pd
import speech_recognition as sr
import wx
import re
from imageai.Detection import VideoObjectDetection
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from ibm_watson import SpeechToTextV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
import cv2
import time
import argparse
import numpy as np

In [None]:
# Report the versions of the main libraries so a run can be reproduced.
# The import cell only pulls SpeechToTextV1 out of ibm_watson, so the package
# itself has to be imported here before its __version__ can be read.
import ibm_watson

print("VERSIONS OF LIBRARIES USED")
print("Numpy : " + np.__version__)
print("OpenCV : " + cv2.__version__)
print("Pandas : " + pd.__version__)
print("PIL : " + Image.__version__)
print("Speech Recognition : " + sr.__version__)
print("wx : " + wx.__version__)
print("ibm watson : " + ibm_watson.__version__)

__Getting the root folder__

In [None]:
# The notebook is expected to run from the "script" sub directory, so the
# project root is the parent of the current working directory.
script_path = os.path.abspath(os.getcwd())
root_folder = os.path.abspath(
    os.path.join(script_path, os.pardir)
)
print("The root folder is: " + root_folder)

__Check whether the Video_ad folder is present in the root directory; if it is not, create it. In both cases, ask the user to paste the videos into that folder__

__Video_ad : Name of the folder with input videos__

In [None]:
# Folder that holds the input advertisement videos. os.path.join picks the
# right separator for the platform instead of a hard-coded backslash.
video_ad = os.path.join(root_folder, "Video_ad")

if os.path.exists(video_ad):
    print("The folder Video_ad already exists. Please paste the input videos in this folder")
else:
    # Create first, report afterwards, so the message is only shown on success.
    os.makedirs(video_ad)
    print("The folder Video_ad is created. Please paste the input videos in this folder")

__Creating directories for extracted audio, extracted frames and processed frames__

Function to create sub directories

In [None]:
def create_dir(sub_dir):
    """Create the directory ``sub_dir`` (and missing parents) if it is absent.

    Prints whether the folder was created or was already there.

    Args:
        sub_dir: Path of the directory to ensure.
    """
    if os.path.exists(sub_dir):
        print("The folder {} already exists.".format(sub_dir))
        return
    # exist_ok guards against the folder appearing between the check above and
    # this call; the success message is printed only once creation has worked.
    os.makedirs(sub_dir, exist_ok=True)
    print("The folder {} is created.".format(sub_dir))

In [None]:
# Working directories used by the rest of the notebook. They are built with
# os.path.join so the path separator matches the platform.
extracted_audio = os.path.join(root_folder, "extracted_audio")
raw_audio = os.path.join(root_folder, "raw_audio")
extracted_frames = os.path.join(root_folder, "extracted_frames")
processed_frames = os.path.join(root_folder, "processed_frames")
grayscale_frames = os.path.join(processed_frames, "grayscale_frames")
scaled_frames = os.path.join(processed_frames, "scaled_frames")
speech_text = os.path.join(root_folder, "speech_text")
# NOTE: a function with this same name is defined further down the notebook
# and rebinds `obj_detection`; after that cell runs this path is no longer
# reachable through the name.
obj_detection = os.path.join(root_folder, "object_detection")

# processed_frames is created before its two sub directories.
for _working_dir in (
    extracted_audio,
    extracted_frames,
    processed_frames,
    grayscale_frames,
    scaled_frames,
    raw_audio,
    speech_text,
    obj_detection,
):
    create_dir(_working_dir)

__Check for videos in the input folder and cross check the format__

Separate the video filename and video extension from the input video folder

In [None]:
def filename_ext(video_path):
    """Split a path into its name part and its extension.

    Returns:
        A ``(filename, extension)`` tuple as produced by ``os.path.splitext``;
        the extension keeps its leading dot and is empty when there is none.
    """
    stem, suffix = os.path.splitext(video_path)
    return stem, suffix

Add the paths of all valid videos to a list

In [None]:
def valid_video(video_dir):
    """List the video files in ``video_dir`` that have a supported extension.

    Files with any other extension are reported on stdout and skipped. The
    extension comparison is case-insensitive.

    Args:
        video_dir: Directory to scan (not recursive).

    Returns:
        A list of ``(full_path, filename_without_extension)`` tuples.
    """
    # Every entry needs its leading dot because os.path.splitext returns the
    # extension with the dot; without it ".avi" files could never match.
    valid_video_extensions = (".mp4", ".avi")

    video_path_list = []
    for file in os.listdir(video_dir):
        filename, extension = os.path.splitext(file)
        if extension.lower() not in valid_video_extensions:
            print("{} does not have a valid extension".format(file))
            continue
        video_path_list.append((os.path.join(video_dir, file), filename))
        print(filename)
    return video_path_list

In [None]:
# Collect a (full path, name without extension) pair for every file in the
# Video_ad folder that valid_video accepts.
video_path_list = valid_video(video_ad)

__Extract Images from the individual videos__

Function to convert frames to grayscale

In [None]:
def grayscale(image):
    """Return ``image`` converted with OpenCV's BGR-to-gray colour conversion."""
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

Function to scale the frames

In [None]:
def scaled_images(image):
    """Shrink very large frames to 1200x1200; return smaller ones unchanged.

    The width and height are printed for every frame. A frame is resized only
    when its pixel count exceeds 2120 * 1590; the resize target is fixed, so
    the aspect ratio is not preserved.

    Args:
        image: Image array whose first two dimensions are height and width.

    Returns:
        The resized image, or the input object itself when no resize is needed.
    """
    # Slice the shape so colour images with a third (channel) dimension are
    # accepted as well as single-channel ones.
    height, width = image.shape[:2]
    print('width:  ', width)
    print('height: ', height)
    if height * width > 2120 * 1590:
        image = cv2.resize(image, (1200, 1200))
        print(image.shape)
    return image

In [None]:
def extract_images(video_path, filename):
    """Save one frame per second of a video as raw, grayscale and scaled JPEGs.

    The capture is positioned at 0 s, 1 s, 2 s, ... and the frame read at each
    position is written three times: unchanged into ``extracted_frames``,
    converted by ``grayscale`` into ``grayscale_frames`` and then passed
    through ``scaled_images`` into ``scaled_frames``. Each destination gets a
    sub folder named after the video.

    Args:
        video_path: Path of the video file to read.
        filename: Video name without extension; used for the sub folders and
            as the prefix of every image file name.
    """
    image_suffix = '.jpg'

    output_folder = os.path.join(extracted_frames, filename)
    output_folder_grayscale = os.path.join(grayscale_frames, filename)
    output_folder_scaled = os.path.join(scaled_frames, filename)

    # The folders are the same for every frame, so create them once.
    for folder in (output_folder, output_folder_grayscale, output_folder_scaled):
        create_dir(folder)

    video_capture = cv2.VideoCapture(video_path)
    try:
        count = 0
        while True:
            # Seek to the start of second `count` and read the frame there.
            video_capture.set(cv2.CAP_PROP_POS_MSEC, count * 1000)
            success, image = video_capture.read()
            # Stop explicitly when no frame is delivered, rather than relying
            # on a later comparison happening to see None.
            if not success or image is None:
                break

            frame_id = str(count) + image_suffix
            output = os.path.join(output_folder, filename + "frame" + frame_id)
            output_grayscale = os.path.join(
                output_folder_grayscale, filename + "gray_scale_frame" + frame_id)
            output_scaled = os.path.join(
                output_folder_scaled, filename + "scaled_grayscale_frame" + frame_id)

            grayscale_image = grayscale(image)
            scaled_image = scaled_images(grayscale_image)

            cv2.imwrite(output, image)
            cv2.imwrite(output_grayscale, grayscale_image)
            cv2.imwrite(output_scaled, scaled_image)
            count += 1
    finally:
        # Always hand the file back, even if a frame fails to process.
        video_capture.release()

__Extract the audio from the video file__

Function for normalization of peak volume

In [None]:
def match_target_amplitude(sound, target_dBFS):
    """Return ``sound`` with gain applied so its dBFS equals ``target_dBFS``.

    The gain is the difference between the target and ``sound.dBFS`` and is
    applied through ``sound.apply_gain``.
    """
    gain_needed = target_dBFS - sound.dBFS
    normalized = sound.apply_gain(gain_needed)
    return normalized

Function to end the audio rendering session

In [None]:
def close_clip(clip):
    """Best-effort release of the readers held by a video clip.

    Closes ``clip.reader`` and, when the clip has audio, the audio reader's
    process, then drops those attributes from the clip. Any failure is
    reported on stdout and otherwise ignored so that cleanup never aborts the
    pipeline.

    Args:
        clip: Clip object exposing ``reader`` and ``audio`` attributes.
    """
    try:
        clip.reader.close()
        del clip.reader
        if clip.audio is not None:
            clip.audio.reader.close_proc()
            del clip.audio
    except Exception as exc:
        # sys.exc_clear() only exists on Python 2; on Python 3 calling it here
        # raised AttributeError and turned every cleanup failure into a crash.
        print("Could not close the clip cleanly: {}".format(exc))

Function to extract and write the audio files

In [None]:
def audio_extraction(video_path, filename):
    """Extract a video's audio track and save raw and processed WAV copies.

    The raw track is written to ``raw_audio/<filename>/<filename>.wav``. It is
    then reloaded, normalised to -20 dBFS with ``match_target_amplitude``,
    reduced to one channel and exported to
    ``extracted_audio/<filename>.wav``.

    Videos without an audio track are reported and skipped.

    Args:
        video_path: Path of the video file.
        filename: Video name without extension; used for the output names.
    """
    audio_suffix = '.wav'

    # Filename, folder and path to be saved
    output_audio_filename = filename + audio_suffix
    output_audio_folder = os.path.join(raw_audio, filename)
    create_dir(output_audio_folder)
    output_file = os.path.join(output_audio_folder, output_audio_filename)

    video_clip = mp.VideoFileClip(video_path)
    try:
        if video_clip.audio is None:
            # Nothing to write; calling write_audiofile on None would crash.
            print("{} has no audio track; skipping audio extraction".format(filename))
            return
        video_clip.audio.write_audiofile(output_file)
    finally:
        # Release the readers whether or not writing succeeded.
        close_clip(video_clip)

    audio = AudioSegment.from_file(output_file, "wav")
    normalized_audio = match_target_amplitude(audio, -20.0)
    channel_audio = normalized_audio.set_channels(1)

    processed_file = os.path.join(extracted_audio, output_audio_filename)
    channel_audio.export(processed_file, format="wav")
    

__Extracting the frames and audio from the ad__

In [None]:
# Run frame extraction and then audio extraction for every accepted video.
# Each entry of video_path_list is a (full path, name without extension) pair.
for file in video_path_list:
    file_path, filename = file
    extract_images(file_path, filename)
    audio_extraction(file_path, filename)

__OCR Implementation__

Function to find out valid images 

In [None]:
def valid_image(image_dir):
    """Collect the image files of ``image_dir`` that have an accepted extension.

    Accepted extensions (compared case-insensitively) are .jpg, .jpeg, .png,
    .tif and .tiff. Every other entry is reported on stdout and left out.

    Returns:
        A list of ``(full_path, filename_without_extension)`` tuples.
    """
    accepted = [ext.lower() for ext in (".jpg", ".jpeg", ".png", ".tif", ".tiff")]
    found = []

    for entry in os.listdir(image_dir):
        stem, suffix = filename_ext(entry)
        if suffix.lower() in accepted:
            found.append((os.path.join(image_dir, entry), stem))
            print(stem)
        else:
            print("{} does not have a valid extension".format(entry))
    return found

Defining a function to carry out OCR on the images extracted from the video

In [None]:
def ocr(image_path):
    """Run Tesseract OCR on one image file and return the recognised text.

    Args:
        image_path: Path of the image to read.

    Returns:
        The string returned by ``pytesseract.image_to_string``.
    """
    # Use the standard Windows install location when it is present; otherwise
    # keep pytesseract's own setting so an executable on PATH can still work.
    tesseract_exe = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
    if os.path.isfile(tesseract_exe):
        pytesseract.pytesseract.tesseract_cmd = tesseract_exe

    # The context manager closes the image file as soon as OCR has finished.
    with Image.open(image_path) as image:
        extracted_text = pytesseract.image_to_string(image)
    return extracted_text

Function to calculate the width and height of the extracted text

In [None]:
def dimension_cal(text, fontname, fontsize):
    """Measure ``text`` when rendered in a bold font of the given face and size.

    Args:
        text: String to measure.
        fontname: Face name handed to ``wx.Font``.
        fontsize: Point size handed to ``wx.Font``.

    Returns:
        The value of ``wx.ScreenDC.GetTextExtent`` for ``text``; callers read
        index 0 as the width and index 1 as the height.
    """
    # NOTE(review): a new wx.App is constructed on every call; it is held in a
    # local so it stays alive while the device context is in use.
    app = wx.App()
    screen_dc = wx.ScreenDC()
    bold_font = wx.Font(
        fontsize,
        wx.FONTFAMILY_DEFAULT,
        wx.FONTSTYLE_NORMAL,
        wx.FONTWEIGHT_BOLD,
        False,
        faceName=fontname,
    )
    screen_dc.SetFont(bold_font)
    return screen_dc.GetTextExtent(text)

Create a directory named OCRtext to store the csv files of text extracted from each video file

In [None]:
# Folder that receives one sub folder of OCR CSV files per video. Joined with
# os.path.join so the separator matches the platform.
ocrtext = os.path.join(root_folder, "OCRtext")

#Calling the function to create a directory
create_dir(ocrtext)

Function to carry out the OCR process on one image and return the rows (one per extracted line of text) that are later written to that image's CSV file

In [None]:
def ocr_exec(image_path, filename):
    """OCR one image and describe every non-blank text line it contains.

    Args:
        image_path: Path of the image to process.
        filename: Image name (without extension) recorded in each row.

    Returns:
        A list of rows ``[line_no, filename, text, width, height]``. Line
        numbers start at 1 and count only non-blank lines; width and height
        come from ``dimension_cal`` for Calibri 11.
    """
    extracted_text = ocr(image_path)

    # Split in memory on the same line endings a text-mode file read would
    # recognise. The earlier round trip through temp_text.txt left a file in
    # the working directory and a read handle that was never closed.
    lines = re.split(r'\r\n|\r|\n', extracted_text)

    CSV_dict = []
    count = 1
    for line in lines:
        line = line.strip()
        if not line:
            # Whitespace-only lines carry no text and are not numbered.
            continue
        text_size = dimension_cal(line, "Calibri", 11)
        word_width = text_size[0]
        word_height = text_size[1]

        CSV_dict.append([count, filename, line, word_width, word_height])
        count += 1

    return CSV_dict

Executing the processes and functions required for extracting text and storing it to the OCRtext directory.

In [None]:
# Run OCR on every scaled frame and store one CSV per frame, grouped in a
# folder named after the video the frame came from.
column_names = ["Line_No", "File_name", "Text", "Word_width", "Word_height"]

for video_folder_name in os.listdir(scaled_frames):
    video_sort = os.path.join(scaled_frames, video_folder_name)
    # Only per-video folders hold frames; a stray file here cannot be listed.
    if not os.path.isdir(video_sort):
        continue

    #Checking whether the image formats are valid using the function valid_image
    image_path_list = valid_image(video_sort)

    #Creating folder for storing the csv file according to video names
    folder_path = os.path.join(ocrtext, video_folder_name)
    create_dir(folder_path)

    #Going through each image in a scaled images folder of each video
    # (distinct loop names, so the outer loop variable is not overwritten)
    for file_path, filename in image_path_list:
        csv_dict = ocr_exec(file_path, filename)
        ocr_df = pd.DataFrame.from_records(csv_dict, columns=column_names)
        csv_name = os.path.join(folder_path, filename + "_ocr_text.csv")
        ocr_df.to_csv(csv_name, sep=',', index=False)

In [None]:
 def sp_txt(extracted_audio):
     """Transcribe every audio file in a folder with Google speech recognition.

     Files that cannot be transcribed are reported on stdout and skipped.

     Args:
         extracted_audio: Directory containing the audio files.

     Returns:
         A list of ``[path, text]`` pairs, one per transcribed file. The list
         is also printed before it is returned.
     """
     recognizer = sr.Recognizer()
     transcripts = []
     for file in os.listdir(extracted_audio):
         path = os.path.join(extracted_audio, file)

         with sr.AudioFile(path) as source:
             # record() reads the whole file. listen(source, True) treated True
             # as a one-second timeout and returned only the first phrase.
             audio = recognizer.record(source)

         try:
             print("Transcribing")
             text = recognizer.recognize_google(audio)
         except Exception as exc:
             # Stay best-effort, but let KeyboardInterrupt/SystemExit through
             # and say what went wrong.
             print("Error")
             print("Could not transcribe {}: {!r}".format(path, exc))
             continue

         transcripts.append([path, text])
         print(text)
     print(transcripts)
     return transcripts

In [None]:
# Transcribe the processed audio files and store all transcripts in a single
# CSV. The File_name column holds the path that sp_txt recorded for each file.
sp_txt_data = sp_txt(extracted_audio)
column_names = ["File_name", "Text"]
ocr_df = pd.DataFrame.from_records(sp_txt_data, columns=column_names)
# os.path.join keeps the separator correct on every platform.
csv_name = os.path.join(speech_text, "speech_text.csv")
ocr_df.to_csv(csv_name, sep=',', index=False)

In [None]:
# Build the ImageAI video detector: YOLOv3 model type, with the model file
# "yolo.h5" read from the current working directory.
execution_path = os.getcwd()
yolo_model_file = os.path.join(execution_path, "yolo.h5")

detector = VideoObjectDetection()
detector.setModelTypeAsYOLOv3()
detector.setModelPath(yolo_model_file)
detector.loadModel()

In [None]:
def obj_detection(input_path, output_path):
    """Run the module-level ``detector`` over every file in ``input_path``.

    Each file is passed to ``detector.detectObjectsFromVideo`` with an output
    path of the same name inside ``output_path``; the value returned by the
    detector is printed.

    NOTE: defining this function rebinds the module-level name
    ``obj_detection``, which earlier in the notebook held the output folder
    path.

    Args:
        input_path: Directory containing the videos to process.
        output_path: Directory for the detector's output files.
    """
    for entry in os.listdir(input_path):
        source_video = os.path.join(input_path, entry)
        target_video = os.path.join(output_path, entry)
        video_path = detector.detectObjectsFromVideo(
            input_file_path=source_video,
            output_file_path=target_video,
            frames_per_second=29,
            log_progress=True,
        )
        print(video_path)

In [None]:
# By this point the name obj_detection refers to the function, not to the
# output folder path, so the folder is spelled out again instead of passing
# the function object as the output path.
obj_detection(video_ad, os.path.join(root_folder, "object_detection"))

In [None]:
# Rank three sample transcripts by fuzzy similarity to the tag line.
given_tag = 'Energy lives here'
extracted = [
    'Enamel is the strong white outer layer',
    'this is electricity this is this is chamber ko thats me this is something is researching and external users to capita carbon emissions powerplay reducing CO2 emissions were also producing energy lives here',
    'enamel is the strong wind or reliability with surface the thing is really important sentences is to make sure that the name of things strong and resilient for lifetime the more that we can strengthen and recorded in that used the word efficiency that they really want to recommend for strong and to strengthen and we had in the name of the has the investigation it can I give their patients the protection that they need and the virus',
]
print(process.extract(given_tag, extracted))

### Speech To Text : IBM Watson

In [None]:
def WriteTextFetchedFromFramesToCSV(audio_folder):
    """Transcribe the WAV files of a folder with IBM Watson and save CSVs.

    Every ``.wav`` file in ``audio_folder`` is sent to the Watson
    Speech to Text service. Its transcripts are fuzzy-matched against the
    module-level ``given_tag`` (the match list is printed) and written to
    ``<audio_folder>/SPEECHTEXT/<name>_speechText.csv`` with the columns
    LineNo, FileName, Transcript and Confidence. Files that yield no
    transcript produce no CSV.

    Args:
        audio_folder: Directory containing the audio files.
    """
    # SECURITY: a service credential is embedded in this notebook. It should
    # be rotated and removed; the environment variables below take precedence
    # so the literal can be dropped without touching the code again.
    apikey = os.environ.get(
        "IBM_WATSON_STT_APIKEY", "Jy87VR2jwgfVfD0iGh68YNbCgQBi3enVE9Hb_Nmh0VkD")
    url = os.environ.get(
        "IBM_WATSON_STT_URL",
        "https://api.eu-gb.speech-to-text.watson.cloud.ibm.com/instances/1882cc8e-4e5c-48a0-a63c-3658d1a88817")

    # The request below always declares audio/wav, so only .wav files qualify.
    valid_audio_extensions = (".wav",)

    # Use the notebook's own helpers (create_dir, filename_ext); the helper
    # names called here before are not defined anywhere in the notebook.
    stt_folder = os.path.join(audio_folder, "SPEECHTEXT")
    create_dir(stt_folder)

    authenticator = IAMAuthenticator(apikey)
    stt = SpeechToTextV1(authenticator=authenticator)
    stt.set_service_url(url)

    col_names = ["LineNo", "FileName", "Transcript", "Confidence"]
    for file in os.listdir(audio_folder):
        basefilename, file_extension = filename_ext(file)
        if file_extension.lower() not in valid_audio_extensions:
            continue

        with open(os.path.join(audio_folder, file), 'rb') as source:
            res = stt.recognize(audio=source, content_type="audio/wav", model="en-US_NarrowbandModel", continuous=True).get_result()

        CSV_data = []
        transcripted_text = []
        # A response without a 'results' entry is treated as "no transcript".
        for line_no, result in enumerate(res.get('results') or [], start=1):
            best_alternative = result.get('alternatives')[0]
            transcript = best_alternative.get('transcript')
            confidence = best_alternative.get('confidence')
            transcripted_text.append(transcript)
            CSV_data.append([line_no, basefilename, transcript, confidence])

        print(process.extract(given_tag, transcripted_text))
        if not CSV_data:
            continue

        df_SpeechText = pd.DataFrame.from_records(CSV_data, columns=col_names)
        df_SpeechText.to_csv(os.path.join(stt_folder, basefilename + "_speechText.csv"), sep=',', index=False)

In [None]:
# Transcribe the audio files in the extracted_audio folder with IBM Watson.
WriteTextFetchedFromFramesToCSV(extracted_audio)

### Object Detection : YOLO

In [None]:
def yolov3(yolo_weights, yolo_cfg, coco_names):
    """Load a network with OpenCV's dnn module together with its class names.

    Args:
        yolo_weights: Path of the weights file.
        yolo_cfg: Path of the network configuration file.
        coco_names: Path of a text file with one class name per line.

    Returns:
        A ``(net, classes, output_layers)`` tuple: the loaded network, the
        list of class names and the names of the unconnected output layers.
    """
    net = cv2.dnn.readNet(yolo_weights, yolo_cfg)

    # Close the names file as soon as it has been read.
    with open(coco_names) as names_file:
        classes = names_file.read().strip().split("\n")

    layer_names = net.getLayerNames()
    # getUnconnectedOutLayers returns an Nx1 array in some OpenCV releases and
    # a flat array in others; flattening makes both forms work. The ids are
    # 1-based, hence the "- 1".
    out_layer_ids = np.array(net.getUnconnectedOutLayers()).flatten()
    output_layers = [layer_names[layer_id - 1] for layer_id in out_layer_ids]

    return net, classes, output_layers

#### This function performs object detection and keeps only the detections whose confidence exceeds the given threshold (0.5 in this notebook)

In [None]:
def perform_detection(net, img, output_layers, w, h, confidence_threshold):
    """Run the network on one image and collect the confident detections.

    The image is turned into a 416x416 blob scaled by 1/255 with the red and
    blue channels swapped. For every detection row, entries from index 5
    onward are read as per-class scores and the first four entries, scaled by
    ``[w, h, w, h]``, as centre x, centre y, width and height.

    Args:
        net: Network returned by ``cv2.dnn.readNet``.
        img: Image to analyse.
        output_layers: Names of the layers to forward to.
        w: Image width used to scale the box coordinates.
        h: Image height used to scale the box coordinates.
        confidence_threshold: Detections must score strictly above this value.

    Returns:
        ``(boxes, confidences, class_ids)`` as three parallel lists; each box
        is ``[top_left_x, top_left_y, width, height]``.
    """
    net.setInput(cv2.dnn.blobFromImage(img, 1 / 255., (416, 416), swapRB=True, crop=False))

    boxes, confidences, class_ids = [], [], []

    for layer_output in net.forward(output_layers):
        for candidate in layer_output:
            class_scores = candidate[5:]
            best_class = np.argmax(class_scores)
            best_score = class_scores[best_class]

            # Skip anything that does not clear the threshold.
            if not best_score > confidence_threshold:
                continue

            center_x, center_y, width, height = [
                int(value) for value in candidate[0:4] * [w, h, w, h]
            ]

            # Convert the centre point into the top-left corner of the box.
            boxes.append([
                int(center_x - (width / 2)),
                int(center_y - (height / 2)),
                width,
                height,
            ])
            confidences.append(float(best_score))
            class_ids.append(best_class)

    return boxes, confidences, class_ids


#### This function draws the boxes around the detected objects

In [None]:
def draw_boxes(boxes, confidences, class_ids, classes, img, colors, confidence_threshold, NMS_threshold):
    """Draw the boxes that survive non-maximum suppression and show the image.

    Each kept box is drawn on ``img`` with a "<class name>: <confidence>"
    label above it, and the image is displayed in the "Detection" window.

    Args:
        boxes: Boxes as ``[x, y, width, height]``.
        confidences: Confidence of each box.
        class_ids: Class index of each box.
        classes: Class names, indexed by class id.
        img: Image to draw on (modified in place).
        colors: One colour per class, indexed by class id.
        confidence_threshold: Score threshold passed to ``cv2.dnn.NMSBoxes``.
        NMS_threshold: Overlap threshold passed to ``cv2.dnn.NMSBoxes``.
    """
    indexes = cv2.dnn.NMSBoxes(boxes, confidences, confidence_threshold, NMS_threshold)

    FONT = cv2.FONT_HERSHEY_SIMPLEX

    if len(indexes) > 0:
        # NMSBoxes returns an Nx1 or a flat array depending on the OpenCV
        # release; flattening handles both.
        for i in np.array(indexes).flatten():
            x, y, w, h = boxes[i]

            # The colour table has one row per class, so look it up by class
            # id. Indexing by box number gave the same class different colours
            # and could run past the end of the table.
            color = [float(channel) for channel in colors[class_ids[i]]]
            cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)

            text = "{}: {:.4f}".format(classes[class_ids[i]], confidences[i])
            cv2.putText(img, text, (x, y - 5), FONT, 0.5, color, 2)

    cv2.imshow("Detection", img)

In [None]:
def detection_video_file(webcam, video_path, yolo_weights, yolo_cfg, coco_names, confidence_threshold, nms_threshold):
    """Detect objects frame by frame in a video file or camera stream.

    Every frame is passed through ``perform_detection`` and ``draw_boxes``.
    Processing stops when no more frames can be read or when "q" is pressed.

    Args:
        webcam: When truthy, read from capture device 0 instead of the file.
        video_path: Path of the video file (used when ``webcam`` is falsy).
        yolo_weights: Path of the weights file.
        yolo_cfg: Path of the network configuration file.
        coco_names: Path of the class-name file.
        confidence_threshold: Minimum confidence for a detection.
        nms_threshold: Overlap threshold for non-maximum suppression.
    """
    net, classes, output_layers = yolov3(yolo_weights, yolo_cfg, coco_names)
    colors = np.random.uniform(0, 255, size=(len(classes), 3))

    if webcam:
        video = cv2.VideoCapture(0)
        time.sleep(2.0)
    else:
        video = cv2.VideoCapture(video_path)

    try:
        while True:
            ret, image = video.read()
            # At the end of the video (or on a read error) there is no frame;
            # stop instead of failing on image.shape.
            if not ret or image is None:
                break

            h, w = image.shape[:2]
            boxes, confidences, class_ids = perform_detection(net, image, output_layers, w, h, confidence_threshold)
            draw_boxes(boxes, confidences, class_ids, classes, image, colors, confidence_threshold, nms_threshold)

            key = cv2.waitKey(1) & 0xFF
            if key == ord("q"):
                break
    finally:
        # Release the capture even if detection raises.
        video.release()

In [None]:
# Settings shared by every detection run.
confidence_threshold = 0.5
nms_threshold = 0.5
yolo_weights = "yolov3.weights"
yolo_cfg = "yolov3.cfg"
coco_names = "coco_names.txt"
webcam = 0  # falsy, so frames are read from the video file, not a camera
video_path = ""

# Run detection on every file found in the Video_ad folder.
for file in os.listdir(video_ad):
    video_path = os.path.join(video_ad, file)
    detection_video_file(
        webcam=webcam,
        video_path=video_path,
        yolo_weights=yolo_weights,
        yolo_cfg=yolo_cfg,
        coco_names=coco_names,
        confidence_threshold=confidence_threshold,
        nms_threshold=nms_threshold,
    )