In [1]:
import subprocess
import sys
import os
import json
import io
import termcolor
import re
from tqdm import tqdm
from youtube_helpers import get_hash, getsize
from utils import extract_audio_part_segment
from filters import Pipeline, OverlappingSubtitlesRemover, SubtitleCaptionTextFilter, SubtitleMerger,\
    CaptionLengthFilter, CaptionRegexMatcher, CaptionDurationFilter, CaptionLeaveOnlyAlphaNumCharacters, CaptionNormalizer
from youtube_helpers import load_all_subtitles

class RESULT:
    """Integer status codes; one of them is stored under the "result" key of the log entry."""

    GOOGLE_TEST_NOT_PASSED, OK = 0, 1


# Matches strings built only from the listed characters: ASCII letters, digits,
# whitespace, common punctuation and typographic quotes/dashes.
good_chars_regexp = re.compile(
    r"^[A-Za-z0-9\,\.\-\?\"\'\’\!\“\s\;\:\“\”\–\‘\’\’\/\\]+$",
    flags=re.IGNORECASE,
)

# Filter stages in the order they are handed to Pipeline; what each stage does
# is defined in the `filters` module.
_pipeline_stages = [
    OverlappingSubtitlesRemover(),
    SubtitleCaptionTextFilter(),
    CaptionNormalizer(),
    CaptionRegexMatcher(good_chars_regexp),
    CaptionLengthFilter(min_length=5),
    CaptionLeaveOnlyAlphaNumCharacters(),
    SubtitleMerger(max_len_merged_sec=10),
    CaptionDurationFilter(min_length=1, max_length=20.0),
]
pipeline = Pipeline(_pipeline_stages)

def _write_clips(video_file, clip_name, subtitles, target_dir):
    """Cut one wav per subtitle and append its text to the transcript.

    For every subtitle with a non-empty "original_phrase" a clip named
    ``<clip_name>_<n>.wav`` is written to ``<target_dir>/wav`` and a line
    ``<clip_name>_<n> <text>`` is appended to ``<target_dir>/Transcript.txt``.

    Args:
        video_file: path of the source video passed to extract_audio_part_segment.
        clip_name: prefix used for the generated clip names.
        subtitles: iterable of dicts with "original_phrase", "ts_start" and
            "ts_end" keys (the filtered pipeline output).
        target_dir: directory receiving ``Transcript.txt`` and ``wav/``.

    Raises:
        RuntimeError: if the transcript or the wav is missing after extraction,
            or the wav is not larger than 4 KiB.
    """
    target_txt_file = os.path.join(target_dir, "Transcript.txt")
    wav_file_dir = os.path.join(target_dir, "wav")
    os.makedirs(wav_file_dir, exist_ok=True)

    count = 0
    for t in tqdm(subtitles):
        text = t["original_phrase"]
        if len(text) == 0:
            continue

        # Advance the index for every non-empty subtitle, including the ones
        # skipped below.  Incrementing only after a successful extraction made
        # every later subtitle reuse the name of the first existing clip, so a
        # re-run could never resume past it.
        count += 1
        audio_name = clip_name + '_' + str(count)
        target_wav_file = os.path.join(wav_file_dir, audio_name + ".wav")

        if os.path.exists(target_wav_file) and os.path.exists(target_txt_file):
            continue  # clip already produced by an earlier run

        extract_audio_part_segment(video_file, t["ts_start"], t["ts_end"], target_wav_file)

        with io.open(target_txt_file, "a", encoding='utf-8') as f:
            f.write('\n' + audio_name + ' ' + text)

        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if not (os.path.exists(target_txt_file) and os.path.exists(target_wav_file)
                and getsize(target_wav_file) > 4 * 1024):
            raise RuntimeError("{} not created".format(target_wav_file))


def main(video_file, target_dir):
    """Split one video into per-subtitle wav clips plus a transcript.

    The subtitle (``.en.vtt``) and info (``.info.json``) files are expected
    next to ``video_file`` under the same stem.  One JSON line describing the
    run is appended to ``./log.json`` whether or not processing succeeded.

    Args:
        video_file: path to the video file.
        target_dir: output directory for ``Transcript.txt`` and ``wav/``.

    Returns:
        The RESULT code that was written to the log entry.
    """
    # Swap only the real extension: str.replace('.mp4', ...) would also rewrite
    # '.mp4' inside directory names and left non-mp4 paths unchanged.
    stem = os.path.splitext(video_file)[0]
    clip_name = os.path.basename(stem)
    subtitle_file = stem + '.en.vtt'
    info_file = stem + '.info.json'
    overall_info = {"sub_file": subtitle_file, "info": info_file}

    result = RESULT.OK
    try:
        if not os.path.exists(subtitle_file) or not os.path.exists(info_file):
            termcolor.cprint("Subtitle file or Info files do not exist. {}".format(video_file), color="red")
            raise Exception("Subtitle file or Info files do not exist.")

        # The info file is parsed so a corrupt file aborts the run early; its
        # contents are not used by the remaining steps.
        with open(info_file) as f:
            json.load(f)

        print("Parsing subtitle")
        subtitles = load_all_subtitles(subtitle_file)
        print(len(subtitles))
        pipeline_input = {
            'subtitles': subtitles,
            'video_file': video_file
        }
        overall_info["num_subtitles"] = len(subtitles)
        termcolor.cprint("Got {} candidates".format(len(subtitles)), color="yellow")

        filtered_subtitles = pipeline(pipeline_input)["subtitles"]
        termcolor.cprint("Writing {} samples".format(len(filtered_subtitles)), color="cyan")

        _write_clips(video_file, clip_name, filtered_subtitles, target_dir)
    except Exception as e:
        # Previously the log always recorded RESULT.OK.  RESULT offers no
        # dedicated failure code, so the only non-OK value is stored and the
        # actual reason goes into "error".
        result = RESULT.GOOGLE_TEST_NOT_PASSED
        overall_info["error"] = str(e)
        termcolor.cprint(str(e), color="red")
    finally:
        overall_info["result"] = result
        # Open the log only for the write so the handle is always closed.
        with open("./log.json", "a") as log_file:
            log_file.write(json.dumps(overall_info) + "\n")
        # The video, subtitle and info files are intentionally left in place.
    return result


if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit("Usage: {} <video_file> <target_dir>".format(sys.argv[0]))
    main(sys.argv[1], sys.argv[2])

In [3]:
class RESULT:
    """Integer status codes; one of them is stored under the "result" key of the log entry."""

    GOOGLE_TEST_NOT_PASSED, OK = 0, 1


# Matches strings built only from the listed characters: ASCII letters, digits,
# whitespace, common punctuation and typographic quotes/dashes.
good_chars_regexp = re.compile(
    r"^[A-Za-z0-9\,\.\-\?\"\'\’\!\“\s\;\:\“\”\–\‘\’\’\/\\]+$",
    flags=re.IGNORECASE,
)

# Filter stages in the order they are handed to Pipeline; what each stage does
# is defined in the `filters` module.
_pipeline_stages = [
    OverlappingSubtitlesRemover(),
    SubtitleCaptionTextFilter(),
    CaptionNormalizer(),
    CaptionRegexMatcher(good_chars_regexp),
    CaptionLengthFilter(min_length=5),
    CaptionLeaveOnlyAlphaNumCharacters(),
    SubtitleMerger(max_len_merged_sec=10),
    CaptionDurationFilter(min_length=1, max_length=20.0),
]
pipeline = Pipeline(_pipeline_stages)

In [12]:
def _write_clips(video_file, clip_name, subtitles, target_dir):
    """Cut one wav per subtitle and append its text to the transcript.

    For every subtitle with a non-empty "original_phrase" a clip named
    ``<clip_name>_<n>.wav`` is written to ``<target_dir>/wav`` and a line
    ``<clip_name>_<n> <text>`` is appended to ``<target_dir>/Transcript.txt``.

    Args:
        video_file: path of the source video passed to extract_audio_part_segment.
        clip_name: prefix used for the generated clip names.
        subtitles: iterable of dicts with "original_phrase", "ts_start" and
            "ts_end" keys (the filtered pipeline output).
        target_dir: directory receiving ``Transcript.txt`` and ``wav/``.

    Raises:
        RuntimeError: if the transcript or the wav is missing after extraction,
            or the wav is not larger than 4 KiB.
    """
    target_txt_file = os.path.join(target_dir, "Transcript.txt")
    wav_file_dir = os.path.join(target_dir, "wav")
    os.makedirs(wav_file_dir, exist_ok=True)

    count = 0
    for t in tqdm(subtitles):
        text = t["original_phrase"]
        if len(text) == 0:
            continue

        # Advance the index for every non-empty subtitle, including the ones
        # skipped below.  Incrementing only after a successful extraction made
        # every later subtitle reuse the name of the first existing clip, so a
        # re-run could never resume past it.
        count += 1
        audio_name = clip_name + '_' + str(count)
        target_wav_file = os.path.join(wav_file_dir, audio_name + ".wav")

        if os.path.exists(target_wav_file) and os.path.exists(target_txt_file):
            continue  # clip already produced by an earlier run

        extract_audio_part_segment(video_file, t["ts_start"], t["ts_end"], target_wav_file)

        with io.open(target_txt_file, "a", encoding='utf-8') as f:
            f.write('\n' + audio_name + ' ' + text)

        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if not (os.path.exists(target_txt_file) and os.path.exists(target_wav_file)
                and getsize(target_wav_file) > 4 * 1024):
            raise RuntimeError("{} not created".format(target_wav_file))


def main(video_file, target_dir):
    """Split one video into per-subtitle wav clips plus a transcript.

    The subtitle (``.en.vtt``) and info (``.info.json``) files are expected
    next to ``video_file`` under the same stem.  One JSON line describing the
    run is appended to ``./log.json`` whether or not processing succeeded.

    Args:
        video_file: path to the video file.
        target_dir: output directory for ``Transcript.txt`` and ``wav/``.

    Returns:
        The RESULT code that was written to the log entry.
    """
    # Swap only the real extension: str.replace('.mp4', ...) would also rewrite
    # '.mp4' inside directory names and left non-mp4 paths unchanged.
    stem = os.path.splitext(video_file)[0]
    clip_name = os.path.basename(stem)
    subtitle_file = stem + '.en.vtt'
    info_file = stem + '.info.json'
    overall_info = {"sub_file": subtitle_file, "info": info_file}

    result = RESULT.OK
    try:
        if not os.path.exists(subtitle_file) or not os.path.exists(info_file):
            termcolor.cprint("Subtitle file or Info files do not exist. {}".format(video_file), color="red")
            raise Exception("Subtitle file or Info files do not exist.")

        # The info file is parsed so a corrupt file aborts the run early; its
        # contents are not used by the remaining steps.
        with open(info_file) as f:
            json.load(f)

        print("Parsing subtitle")
        subtitles = load_all_subtitles(subtitle_file)
        print(len(subtitles))
        pipeline_input = {
            'subtitles': subtitles,
            'video_file': video_file
        }
        overall_info["num_subtitles"] = len(subtitles)
        termcolor.cprint("Got {} candidates".format(len(subtitles)), color="yellow")

        filtered_subtitles = pipeline(pipeline_input)["subtitles"]
        termcolor.cprint("Writing {} samples".format(len(filtered_subtitles)), color="cyan")

        _write_clips(video_file, clip_name, filtered_subtitles, target_dir)
    except Exception as e:
        # Previously the log always recorded RESULT.OK.  RESULT offers no
        # dedicated failure code, so the only non-OK value is stored and the
        # actual reason goes into "error".
        result = RESULT.GOOGLE_TEST_NOT_PASSED
        overall_info["error"] = str(e)
        termcolor.cprint(str(e), color="red")
    finally:
        overall_info["result"] = result
        # Open the log only for the write so the handle is always closed.
        with open("./log.json", "a") as log_file:
            log_file.write(json.dumps(overall_info) + "\n")
        # The video, subtitle and info files are intentionally left in place.
    return result


if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit("Usage: {} <video_file> <target_dir>".format(sys.argv[0]))
    main(sys.argv[1], sys.argv[2])




  0%|          | 0/122 [00:00<?, ?it/s][A[A[A


  1%|          | 1/122 [00:00<00:18,  6.53it/s][A[A[A

Parsing subtitle
175
[33mGot 175 candidates[0m
[36mWriting 122 samples[0m





  2%|▏         | 2/122 [00:00<00:19,  6.29it/s][A[A[A


  2%|▏         | 3/122 [00:00<00:19,  6.20it/s][A[A[A


  3%|▎         | 4/122 [00:00<00:20,  5.75it/s][A[A[A


  4%|▍         | 5/122 [00:00<00:21,  5.38it/s][A[A[A


  5%|▍         | 6/122 [00:01<00:23,  5.00it/s][A[A[A


  6%|▌         | 7/122 [00:01<00:24,  4.74it/s][A[A[A


  7%|▋         | 8/122 [00:01<00:25,  4.48it/s][A[A[A


  7%|▋         | 9/122 [00:01<00:27,  4.14it/s][A[A[A


  8%|▊         | 10/122 [00:02<00:29,  3.80it/s][A[A[A


  9%|▉         | 11/122 [00:02<00:31,  3.52it/s][A[A[A


 10%|▉         | 12/122 [00:02<00:33,  3.26it/s][A[A[A


 11%|█         | 13/122 [00:03<00:36,  3.01it/s][A[A[A


 11%|█▏        | 14/122 [00:03<00:37,  2.90it/s][A[A[A


 12%|█▏        | 15/122 [00:04<00:38,  2.81it/s][A[A[A


 13%|█▎        | 16/122 [00:04<00:40,  2.62it/s][A[A[A


 14%|█▍        | 17/122 [00:04<00:41,  2.56it/s][A[A[A


 15%|█▍        | 18/122 [00:05<00:42,  2.46it