In [None]:
import os
import re
from pathlib import Path

import numpy as np
import pandas as pd
import requests
import tensorflow as tf

In [None]:
# Extract the generated video ids and labels from the raw dataset
def preprocess_video_record(video_raw_record):
    """Read one TFRecord file and return the video ids and labels it contains.

    Args:
        video_raw_record: Path of a TFRecord file whose records are serialized
            ``tf.train.Example`` protos with an ``id`` bytes feature and a
            ``labels`` int64 feature.

    Returns:
        A ``(vid_ids, labels)`` tuple of two parallel lists: ``vid_ids[i]`` is
        the UTF-8 decoded ``id`` of the i-th record and ``labels[i]`` is the
        list of integer label indices stored in that same record.
    """
    # ``tf.python_io`` only exists in TensorFlow 1.x; TensorFlow 2.x exposes the
    # same record iterator under ``tf.compat.v1.io``.
    try:
        record_iterator = tf.python_io.tf_record_iterator
    except AttributeError:
        record_iterator = tf.compat.v1.io.tf_record_iterator

    vid_ids = []
    labels = []
    for example in record_iterator(video_raw_record):
        features = tf.train.Example.FromString(example).features.feature
        # The id is stored as a single bytes value.
        vid_ids.append(features['id'].bytes_list.value[0].decode(encoding='UTF-8'))
        labels.append(list(features['labels'].int64_list.value))
    return vid_ids, labels

In [None]:
# Build the URI that is used to look up the real video id
def construct_uri(video_id):
    """Return the lookup path for ``video_id`` in the form ``/AB/ABCD.js``.

    The first path segment is the first two characters of ``video_id``; the
    second is the full id followed by ``.js``.
    """
    return "/{0}/{1}.js".format(video_id[:2], video_id)

In [None]:
# Use the uri to get the real video id
def get_youtube_video_id(url, timeout=10):
    """Fetch a lookup URL and extract the real YouTube video id from its body.

    Args:
        url: Full lookup URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        The second quoted value of the ``i("...","...");`` statement found in
        the response body, or ``None`` when the request fails, the status code
        is not 200, or the body does not contain such a statement.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat an unreachable or timed-out lookup like any other failed
        # lookup so one bad request does not abort the caller's loop.
        return None
    if response.status_code != 200:
        return None
    match = re.search(r'i\("(\w+)","([\w\-_]+)"\);', response.text)
    return match.group(2) if match else None

In [None]:
# Use the YouTube Data API and video id to get the titles and descriptions
def get_video_details(video_id, api_key, timeout=10):
    """Fetch the title and description of a video from the YouTube Data API.

    Args:
        video_id: YouTube video id to look up.
        api_key: YouTube Data API key sent with the request.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        A ``(title, description)`` tuple taken from the first returned item's
        ``snippet``, or ``(None, None)`` when the response lists no items.

    Raises:
        requests.HTTPError: If the API answers with an error status (for
            example an invalid key or an exhausted quota).
    """
    response = requests.get(
        "https://www.googleapis.com/youtube/v3/videos",
        # Let requests URL-encode the values instead of splicing them into
        # the URL by hand.
        params={"id": video_id, "key": api_key, "part": "snippet"},
        timeout=timeout,
    )
    # Error payloads carry an "error" object and no "items"; surface them as a
    # descriptive HTTPError instead of an opaque KeyError further down.
    response.raise_for_status()
    items = response.json().get("items")
    if not items:
        return None, None
    snippet = items[0]["snippet"]
    return snippet["title"], snippet["description"]

In [None]:
def get_description_and_title(file, targets, api_key, data):
    """Collect titles and descriptions of one record file's videos per target label.

    For every target, the videos of ``file`` carrying a label that the
    module-level ``idx_label_dict`` maps to that target are resolved to their
    real YouTube ids and looked up through the YouTube Data API.

    Args:
        file: Path of the TFRecord file, passed to ``preprocess_video_record``.
        targets: Iterable of label names to collect, processed in order.
        api_key: YouTube Data API key passed to ``get_video_details``.
        data: Dict with a ``{'titles': [...], 'descriptions': [...]}`` entry
            for every target; it is extended in place.

    Returns:
        The same ``data`` object that was passed in.
    """
    vid_ids, labels = preprocess_video_record(file)
    # Keep each video id paired with its own labels. Filtering the ids alone
    # would shift them against the label list and attribute labels to the
    # wrong videos for every target after the first one.
    remaining = list(zip(vid_ids, labels))
    for target in targets:
        matching_vid_ids = []
        unmatched = []
        # find the data with the target label
        for vid_id, label_list in remaining:
            if any(idx_label_dict.get(label) == target for label in label_list):
                matching_vid_ids.append(vid_id)
            else:
                unmatched.append((vid_id, label_list))
        # avoid duplicate data: a video matched here is not offered to later targets
        remaining = unmatched

        for video_id in matching_vid_ids:
            url = "http://data.yt8m.org/2/j/i" + construct_uri(video_id)
            youtube_video_id = get_youtube_video_id(url)
            if not youtube_video_id:
                continue
            title, description = get_video_details(youtube_video_id, api_key)
            # Filter out invalid (missing or empty) titles and descriptions
            if title and description:
                data[target]['titles'].append(title)
                data[target]['descriptions'].append(description)
    return data

In [None]:
# Traverse the data files
def traverse_dir_with_pathlib(directory):
    """Recursively collect every file below ``directory``.

    Directories themselves are skipped so that only entries that can be opened
    as files are collected.

    Args:
        directory: Root directory to search.

    Returns:
        The list of file paths (as strings) found by this call. Each path is
        also appended to the module-level ``paths`` list, which must already
        exist when the function is called.
    """
    found = []
    for file_path in Path(directory).rglob('*'):
        if not file_path.is_file():
            continue
        found.append(str(file_path))
        paths.append(found[-1])
    return found

In [None]:
# Load the vocabulary CSV that lists every label index together with its label name
vacabulary = r"C:\Users\Yurio\Downloads\vocabulary (1).csv"
df = pd.read_csv(vacabulary)

# Keep only the 'Index' and 'Vertical1' columns, then build the
# {Index: Vertical1} lookup that get_description_and_title uses to match a
# video's label indices against the target names.
idx_to_label = df.loc[:, ['Index', 'Vertical1']]
idx_label_dict = idx_to_label.set_index('Index')['Vertical1'].to_dict()

In [None]:
# processed_files: record files the crawl loop has already handled (checked and
# extended by that loop). processed_data: not used by any visible code; it is
# only mentioned in a commented-out line of the crawl cell.
processed_files, processed_data = [], []

In [None]:
paths = []
# targets = ["Sports", "Games", "Arts & Entertainment"]
# Target labels
targets = ["Pets & Animals", "Food & Drink", "Autos & Vehicles"]
# Stop collecting for a target once it has this many titles
MAX_TITLES_PER_TARGET = 3000
# API key for the YouTube Data API. It is read from the environment so that no
# credential is stored in the notebook; set YOUTUBE_API_KEY before running.
api_key = os.environ.get("YOUTUBE_API_KEY")
if not api_key:
    raise RuntimeError("Set the YOUTUBE_API_KEY environment variable to a YouTube Data API key")

directory_path = r"D:\CS\pythonProject\youtube8m\0"
traverse_dir_with_pathlib(directory_path)
# Store the collected titles and descriptions per target label
ds = {target: {'titles': [], 'descriptions': []} for target in targets}
# ds = processed_data
for file in paths:
    # Nothing left to collect once every target has reached its quota
    if len(targets) == 0:
        break
    if file in processed_files:
        continue
    processed_files.append(file)
    print(file)
    ds = get_description_and_title(file, targets, api_key, ds)
    # Iterate over a snapshot: removing from the list that is being iterated
    # would skip the entry right after the removed one.
    for target in list(targets):
        if len(ds[target]['titles']) >= MAX_TITLES_PER_TARGET:
            targets.remove(target)
        print(target + ":" + str(len(ds[target]['titles'])))

D:\CS\pythonProject\youtube8m\0\trainCN.tfrecord
Pets & Animals:35
Food & Drink:38
Autos & Vehicles:117
D:\CS\pythonProject\youtube8m\0\trainCV.tfrecord
Pets & Animals:62
Food & Drink:73
Autos & Vehicles:220
D:\CS\pythonProject\youtube8m\0\trainCZ.tfrecord
Pets & Animals:97
Food & Drink:105
Autos & Vehicles:312
D:\CS\pythonProject\youtube8m\0\traindg.tfrecord
Pets & Animals:130
Food & Drink:135
Autos & Vehicles:408
D:\CS\pythonProject\youtube8m\0\traindH.tfrecord
Pets & Animals:157
Food & Drink:164
Autos & Vehicles:525
D:\CS\pythonProject\youtube8m\0\traine0.tfrecord
Pets & Animals:187
Food & Drink:201
Autos & Vehicles:624
D:\CS\pythonProject\youtube8m\0\trainek.tfrecord
Pets & Animals:227
Food & Drink:237
Autos & Vehicles:707
D:\CS\pythonProject\youtube8m\0\trainfc.tfrecord
Pets & Animals:261
Food & Drink:258
Autos & Vehicles:822
D:\CS\pythonProject\youtube8m\0\trainFE.tfrecord
Pets & Animals:296
Food & Drink:293
Autos & Vehicles:915
D:\CS\pythonProject\youtube8m\0\trainFF.tfrecord
Pe

In [None]:
import json

# Destination of the collected titles and descriptions
file_path = "data_new_4.json"

# Serialize the ``ds`` dictionary to the file as JSON indented by four spaces
with Path(file_path).open("w") as json_file:
    json.dump(ds, json_file, indent=4)