In [2]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from youtube_transcript_api import YouTubeTranscriptApi
from tqdm import tqdm
from sklearn.model_selection import train_test_split

### Transcript fetcher

In [3]:
# Load the video table from CSV. Its 'key' column supplies the video keys that
# the transcript-fetching code in this cell looks up.
data = pd.read_csv("../data/dvlog.csv")

# Function to fetch transcript for a video key
def fetch_transcript(key):
    """Fetch the transcript for one video key and flatten it to a single string.

    Args:
        key: Video identifier passed to ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        A ``(key, value)`` tuple. ``value`` is a one-element list holding the
        space-joined ``'text'`` fields of the transcript entries, or ``None``
        when fetching or joining raised any ``Exception``.
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(key)
        joined = " ".join(segment['text'] for segment in segments)
    except Exception:
        # Best-effort: every failure is reported as a missing transcript so a
        # single bad key does not abort the whole batch.
        return key, None
    return key, [joined]
        

# Fetch transcripts concurrently on a pool of 10 worker threads. executor.map
# yields results in input order, and tqdm reports progress as they are consumed.
with ThreadPoolExecutor(max_workers=10) as executor:
    pending = executor.map(fetch_transcript, data['key'])
    progress = tqdm(pending, total=len(data))
    results = list(progress)

# Index the (key, transcript) pairs by video key.
transcript_dict = dict(results)

# Attach each row's transcript as a 'text' column (missing where the fetch failed).
data['text'] = data['key'].map(transcript_dict)



100%|██████████| 961/961 [03:03<00:00,  5.23it/s]


### Drop missing transcripts


In [6]:
# Keep only rows with no missing values. dropna() with no arguments removes a
# row if ANY column is NaN, not only the 'text' column.
# NOTE(review): if the intent is to drop only rows without a transcript,
# dropna(subset=['text']) is the narrower filter — confirm the other columns
# are fully populated before changing it.
postdata = data.dropna()

# NOTE(review): this export is disabled, yet the following cell reads
# '../data/postdatalines.json'; on a fresh kernel that file must already exist.
# postdata.to_json('../data/postdatalines.json',orient='records', lines=True)

800

In [10]:
# Reload the cleaned transcripts from disk and report how many rows were read.
postdata = pd.read_json('../data/postdatalines.json', orient='records', lines=True)
print(len(postdata))

# 50/50 split; the fixed random_state keeps the partition reproducible for a
# given input row order.
train_set, test_set = train_test_split(postdata, test_size=0.5, random_state=99)

# Label each half with .assign(), which returns a new frame instead of writing
# into the frames handed back by train_test_split (avoids pandas'
# SettingWithCopyWarning on chained assignment).
train_set = train_set.assign(split='train')
test_set = test_set.assign(split='test')

# Stack the train rows followed by the test rows and export as JSON lines.
full_set = pd.concat([train_set, test_set])

full_set.to_json('../data/splitData/postdataLinesSplit.json', orient='records', lines=True)



797


In [25]:
# Names of the vector representations; each one has a '<name>Vectors.json'
# file under ../data/vectorData/.
options = [
    'w2vec',
    'DictionaryBased',
    'TFIDF500',
    'TFIDFEX500',
    'TFIDFIN500',
    'blingfire',
    'reweightedblingfire',
    'nltk',
    'reweightednltk',
    'spacysm',
    'reweightedspacysm',
    'spacylg',
    'reweightedspacylg',
    'spacytrf',
    'reweightedspacytrf',
]

for name in options:
    # Build the path once so the read and the write cannot drift apart.
    vector_path = f'../data/vectorData/{name}Vectors.json'
    postdata = pd.read_json(vector_path, orient='records', lines=True)

    # Same split parameters as the transcript split (test_size=0.5, random_state=99).
    train_set, test_set = train_test_split(postdata, test_size=0.5, random_state=99)

    # .assign() returns new frames rather than writing into the frames returned
    # by train_test_split (avoids pandas' SettingWithCopyWarning).
    train_set = train_set.assign(split='train')
    test_set = test_set.assign(split='test')

    total = pd.concat([train_set, test_set])
    print(name)
    print(len(postdata))
    display(total['split'].value_counts())

    # NOTE(review): the input file is overwritten in place, with rows reordered
    # (train rows first, then test rows). Because the split is positional,
    # re-running this cell on its own output will likely assign rows to
    # different halves — consider writing to a separate path.
    total.to_json(vector_path, orient='records', lines=True)

w2vec
797


split
test     399
train    398
Name: count, dtype: int64

DictionaryBased
797


split
test     399
train    398
Name: count, dtype: int64

TFIDF500
797


split
test     399
train    398
Name: count, dtype: int64

TFIDFEX500
797


split
test     399
train    398
Name: count, dtype: int64

TFIDFIN500
797


split
test     399
train    398
Name: count, dtype: int64

blingfire
797


split
test     399
train    398
Name: count, dtype: int64

reweightedblingfire
797


split
test     399
train    398
Name: count, dtype: int64

nltk
797


split
test     399
train    398
Name: count, dtype: int64

reweightednltk
797


split
test     399
train    398
Name: count, dtype: int64

spacysm
797


split
test     399
train    398
Name: count, dtype: int64

reweightedspacysm
797


split
test     399
train    398
Name: count, dtype: int64

spacylg
797


split
test     399
train    398
Name: count, dtype: int64

reweightedspacylg
797


split
test     399
train    398
Name: count, dtype: int64

spacytrf
797


split
test     399
train    398
Name: count, dtype: int64

reweightedspacytrf
797


split
test     399
train    398
Name: count, dtype: int64