### Transcript downloader

In [6]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from youtube_transcript_api import YouTubeTranscriptApi
from tqdm import tqdm

# Read the D-Vlog metadata table; later cells look up its 'key' column
# to decide which transcripts to download.
data = pd.read_csv(filepath_or_buffer="PsySym/data/datastoreOZP/dvlog.csv")

def fetch_transcript(key):
    """Download the transcript of one video and flatten it to a single string.

    Every caption segment's ``'text'`` gets a trailing ``'.'`` and the
    segments are joined with single spaces.

    Args:
        key: Video key passed to ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        ``(key, [transcript])`` on success (the transcript string is wrapped
        in a one-element list), or ``(key, None)`` if anything raises while
        fetching or assembling the transcript.
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(key)
        sentences = (segment['text'] + '.' for segment in segments)
        return key, [" ".join(sentences)]
    except Exception:
        # Best-effort download: any failure is reported to the caller as a
        # missing transcript rather than aborting the whole batch.
        return key, None

# Fan the downloads out over a pool of 10 worker threads. executor.map yields
# results in the same order as data['key']; tqdm wraps that iterator to show
# progress against the number of rows.
with ThreadPoolExecutor(max_workers=10) as executor:
    pending = executor.map(fetch_transcript, data['key'])
    progress = tqdm(pending, total=len(data))
    results = list(progress)

# Index the (key, transcript) pairs by video key.
transcript_dict = {key: transcript for key, transcript in results}

# Attach each row's transcript (or None when the download failed) as 'text'.
data['text'] = data['key'].map(transcript_dict)



100%|██████████| 961/961 [02:57<00:00,  5.43it/s]


### Formatting / Preprocessing


In [7]:
# Build the export table: one record per video whose transcript download
# succeeded, each tagged with the full list of candidate conditions.
ALL_DISEASES = ['depression', 'anxiety', 'autism', 'adhd', 'schizophrenia', 'bipolar', 'ocd', 'ptsd', 'eating']

# Failed downloads are stored as None in data['text']; notna() drops those
# rows (and any NaN) without an explicit Python-level row loop.
has_transcript = data['text'].notna()

# Named `ids` rather than `id` so the builtin id() is not shadowed.
ids = data.loc[has_transcript, 'video_id'].tolist()
posts = data.loc[has_transcript, 'text'].tolist()
# Give every record its own copy of the label list (no shared mutable list).
diseases = [list(ALL_DISEASES) for _ in posts]

jsondata = pd.DataFrame({'id': ids, 'posts': posts, 'diseases': diseases})

# Save the result as JSON Lines (one record per line). The directory uses the
# same 'PsySym' casing as the CSV path the data was loaded from, so the write
# also works on case-sensitive filesystems.
jsondata.to_json("PsySym/data/datastoreOZP/dvlog_wtext.json", orient='records', lines=True)