In [None]:
import json
import os
from concurrent.futures import ThreadPoolExecutor

import blingfire
import pandas as pd
import spacy
from tqdm import tqdm
from youtube_transcript_api import YouTubeTranscriptApi

### Transcript fetcher

In [None]:
# Load the video metadata table. It must provide a 'key' column: the later
# cells pass each key to the transcript API and join the results back on it.
# NOTE(review): this reads from 'data/' while the filtered output is written
# under '../data/' — confirm the intended working directory.
data = pd.read_csv("data/dvlog.csv")

# Function to fetch transcript for a video key
def fetch_transcript(key):
    """Download the transcript for one video and flatten it to a single string.

    Args:
        key: Video identifier handed to YouTubeTranscriptApi.get_transcript.

    Returns:
        A (key, value) tuple. On success, value is a one-element list holding
        the 'text' fields of all transcript segments joined with single
        spaces. If anything raises while fetching or joining, value is None.

    NOTE(review): every Exception is swallowed, including programming errors
    (e.g. an AttributeError after a library API change), so a systematic
    failure would show up only as all-None results — confirm this is intended.
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(key)
        joined = " ".join(segment['text'] for segment in segments)
    except Exception:
        # Best effort: a video without a retrievable transcript is reported as None.
        return key, None
    return key, [joined]
        

# Fetch the transcripts concurrently on a pool of up to 10 worker threads,
# with a progress bar sized to the number of rows.
video_keys = data['key']
with ThreadPoolExecutor(max_workers=10) as executor:
    fetched = executor.map(fetch_transcript, video_keys)
    results = list(tqdm(fetched, total=len(data)))

# Index the (key, transcript) pairs by key.
transcript_dict = dict(results)

# Attach each video's transcript to its row; keys whose fetch failed map to None.
data['text'] = video_keys.map(transcript_dict)



### Drop missing transcripts


In [None]:
# Keep only the rows whose transcript was fetched. Restricting dropna to the
# 'text' column avoids discarding videos that merely have a gap in some other
# metadata column.
postdata = data.dropna(subset=['text'])

# Persist one JSON record per line (JSON Lines).
# NOTE(review): the sentence-splitting cells below read 'postdata.json' with
# json.load, which expects a single JSON document rather than JSON Lines at
# this path — confirm which file they are meant to consume.
postdata.to_json('../data/postdatalines.json', orient='records', lines=True)

### Sentence splitting
##### blingfire implementation

In [None]:
def process_json_file(filename):
    """Split every record's transcript into sentences with blingfire and save the result.

    Reads `filename` as a single JSON document holding a list of records, adds
    a 'sentences' list to each record (built from its 'text' entry with
    `cut_sentences`), and writes the augmented records, indented, to a file in
    the same directory as the input whose name is the input's base name
    prefixed with 'blingfire'.

    Args:
        filename: Path to the input JSON file. Each record must have a 'text'
            entry that is either a string or a list of strings.

    Returns:
        None. The result is written to disk.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for record in data:
        record_text = record['text']
        # 'text' may be a list of transcript strings or a bare string;
        # treat both uniformly as a list of pieces.
        pieces = record_text if isinstance(record_text, list) else [record_text]
        sentences = []
        for piece in pieces:
            sentences.extend(cut_sentences(piece))
        record['sentences'] = sentences

    # Prefix only the base name: concatenating 'blingfire' with the whole path
    # would produce an invalid location such as 'blingfire../data/x.json'.
    directory, basename = os.path.split(filename)
    output_path = os.path.join(directory, 'blingfire' + basename)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)

def cut_sentences(x):
    """Split text into sentences with blingfire.

    The input is stripped of surrounding whitespace and passed to
    blingfire.text_to_sentences; the string it returns is then split on
    newlines, so the result is a list of strings.
    """
    segmented = blingfire.text_to_sentences(x.strip())
    return segmented.split("\n")



In [None]:
# Input for the sentence-splitting step; process_json_file loads it with
# json.load, so it must be a single JSON document (a list of records).
# NOTE(review): the filtering cell writes JSON Lines to
# '../data/postdatalines.json', not this file — confirm where 'postdata.json'
# comes from.
filename = 'postdata.json'

process_json_file(filename)

##### spaCy implementation

In [None]:
# spaCy English pipelines that can be selected for sentence splitting.
small, large, trf = "en_core_web_sm", "en_core_web_lg", "en_core_web_trf"

# Active pipeline; process_json_file also uses this name as the stem of its output file.
model = large
nlp = spacy.load(model)

def process_json_file(filename):
    """Split every record's transcript into sentences with spaCy and save the result.

    Loads `filename` as one JSON document (a list of records), stores under
    each record's 'sentences' key the sentences that `process_transcript`
    extracts from its 'text' entry, and dumps the records, indented, to
    '<model>.json', where `model` is the module-level pipeline name.

    Args:
        filename: Path to the input JSON file. Each record's 'text' entry is
            either a string or a list of strings.

    Returns:
        None. The result is written to disk.
    """
    with open(filename, 'r') as source:
        records = json.load(source)

    for entry in tqdm(records):
        raw_text = entry['text']
        if isinstance(raw_text, list):
            # Several transcript pieces: concatenate their sentences in order.
            entry['sentences'] = [
                sentence
                for piece in raw_text
                for sentence in process_transcript(piece)
            ]
        else:
            entry['sentences'] = process_transcript(raw_text)

    with open(model+'.json', 'w') as target:
        json.dump(records, target, indent=4)

def process_transcript(transcript):
    """Return the sentences spaCy finds in a transcript.

    The transcript is lower-cased and stripped of surrounding whitespace
    before it is run through the module-level `nlp` pipeline.

    Args:
        transcript: Transcript text as a single string.

    Returns:
        A list with the whitespace-stripped text of each sentence in the
        parsed document, in document order.
    """
    normalized = transcript.lower().strip()
    return [span.text.strip() for span in nlp(normalized).sents]

# Run the spaCy sentence splitter on `filename`, which is assigned in an earlier cell.
process_json_file(filename)


In [None]:
# NOTE(review): the previous cell already ends with process_json_file(filename)
# for this same file, so running the notebook top to bottom repeats the spaCy
# pass and rewrites the same output — confirm whether this cell is still needed.
filename = 'postdata.json'

process_json_file(filename)