In [1]:
%pip install requests

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import sys
import json
import requests
import time

In [3]:
API_KEY = "AIzaSyDlrOVHZ8stkQCS52a1qf2f06zrvwnVPGw"

In [4]:
base_url = "https://youtube.googleapis.com/youtube/v3/"
headers = {'Accept': 'application/json'}

In [5]:
def read_json_from_file(directory, filename):
    with open(os.path.join(directory, filename)) as json_file:
        data = json.load(json_file)
        
    return data

In [6]:
def write_json_to_file(directory, data, filename=None):
    if not os.path.exists(directory):
        os.makedirs(directory)
        
    if (filename is None):
        filename = '{0}.json'.format(time.time())

    with open(os.path.join(directory, filename), 'w') as f:
        json.dump(data, f, ensure_ascii=False)

In [7]:
def write_video_ids_to_url_file(directory, video_ids):
    if not os.path.exists(directory):
        os.makedirs(directory)

    filename = '{0}.csv'.format(time.time())
    
    file_path = os.path.join(directory, filename)
    
    with open(file_path, 'w') as f:
        for item in video_ids:
            f.write("http://www.youtube.com/watch?v={0}".format(item))
            f.write("\r\n")
            
    return file_path

In [8]:
def search(keyword, next_page_token):
    part = "snippet"
    max_results = 50
    res_type = "video"
    
    url = '{0}search?part={1}&maxResults={2}&type={3}&q={4}&key={5}'.format(
        base_url, 
        part,
        max_results, 
        res_type,
        keyword, 
        API_KEY
    )
    
    if (next_page_token is not None):
        url += '&pageToken={0}'.format(next_page_token)
    
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        return json.loads(response.content.decode('utf-8'))
    else:
        print("HTTP not OK: {0}".format(response.content))
        return None 

In [9]:
def get_video_ids(search_data):
    
#     Initialize list
    video_ids = set()

#     Parse JSON
    for search_result in search_data["items"]:
        video_ids.add(search_result["id"]["videoId"]);

    return video_ids

In [10]:
def get_video_details(video_ids):
    
    csv_video_ids = ",".join(video_ids);
    
    part = "snippet,contentDetails"
    url = "{0}videos?part={1}&id={2}&key={3}".format(
        base_url,
        part,
        csv_video_ids,
        API_KEY
    )
    
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        return json.loads(response.content.decode('utf-8'))["items"]
    else:
        print("HTTP not OK: {0}".format(response.content))
        return None

In [11]:
def youtube_downloader(url_file_path):
    # Invoke external YouTube downloader with YouTube URL file
    bashCommand = "youtube-dl -a {0} -j > output_video_traces.temp".format(url_file_path)

    print("Running youtube-dl: {0}".format(bashCommand))

    !{bashCommand}
    
    # Open output_video_details file and make a proper array out of it
    with open("output_video_traces.temp") as temp_file:
        json_strings = temp_file.readlines()

        json_array = []

        for json_str in json_strings:
            json_array.append(json.loads(json_str))

        # Save json array to disk
        write_json_to_file("traces", json_array)

In [12]:
def do_work(search_term, page_limit):
    
    # Keep track of what (unique) video IDs we have found
    video_ids_set = set(read_json_from_file("cache", "found_video_ids.json"))
    # Count previously found videos
    initial_video_ids_set_size = len(video_ids_set)
    
    next_page_token = None
    
    for i in range(page_limit):
        
        # Search YouTube API
        search_result = search(search_term, next_page_token)
        # Save search result
        write_json_to_file("searches", search_result)
        
        # Parse search result to get video ids
        found_video_ids = get_video_ids(search_result)
        
        # Find which found IDs are not duplicates
        unique_video_ids = found_video_ids.difference(video_ids_set)
        
        # Obtain video details (including projection method)
        video_details = get_video_details(unique_video_ids)
        # Save video details
        #write_json_to_file("video_details", list(unique_video_ids))
        
        # Find which of our video IDs refer to 360 videos
        for video_details_item in video_details:
            if (video_details_item['contentDetails']['projection'] != "360"):
                # This is not a 360 video; remove it from the set
                unique_video_ids.remove(video_details_item['id'])
        
        # We now have a local set (unique_video_ids) which contains unique 360 video IDs
        
        # Check if our set is not empty
        if (len(unique_video_ids) != 0):
            # Save video IDs as URLs to hand to external YouTube Downloader
            url_file_path = write_video_ids_to_url_file("videos", unique_video_ids)

            # Add found video IDs to our set (no duplicates)
            video_ids_set.update(unique_video_ids)

            # Download video traces for the URLs found
            youtube_downloader(url_file_path)
        
            print("Processed {0} out of {1} search pages. Found and stored {2} video traces so far.".format(
                i + 1,
                page_limit,
                len(video_ids_set) - initial_video_ids_set_size))
        
        # Check if there is a next page available
        if ('nextPageToken' in search_result):
            # Update next_page_token for subsequent searches
            next_page_token = search_result['nextPageToken']
        elif (i < page_limit - 1):
            # There are no more search results for us after this
            print("No more search results to explore for this search term: \"{0}\". Exiting...".format(search_term))
            break
        
    print("Done. Processed and stored a total of {0} new 360-degree video traces. (Total cached: {1})".format(
        len(video_ids_set) - initial_video_ids_set_size,
         len(video_ids_set)))
    
    return video_ids_set

In [21]:
processed_video_ids = do_work("360 sports", 1000)

Running youtube-dl: youtube-dl -a videos/1607429747.592078.csv -j > output_video_traces.temp
Processed 3 out of 1000 search pages. Found and stored 5 video traces so far.
Running youtube-dl: youtube-dl -a videos/1607429753.5197492.csv -j > output_video_traces.temp
Processed 4 out of 1000 search pages. Found and stored 7 video traces so far.
Running youtube-dl: youtube-dl -a videos/1607429757.93767.csv -j > output_video_traces.temp
Processed 5 out of 1000 search pages. Found and stored 10 video traces so far.
Running youtube-dl: youtube-dl -a videos/1607429762.793486.csv -j > output_video_traces.temp
Processed 6 out of 1000 search pages. Found and stored 12 video traces so far.
Running youtube-dl: youtube-dl -a videos/1607429769.434012.csv -j > output_video_traces.temp
Processed 8 out of 1000 search pages. Found and stored 15 video traces so far.
Running youtube-dl: youtube-dl -a videos/1607429774.36975.csv -j > output_video_traces.temp
Processed 9 out of 1000 search pages. Found and st

TypeError: 'NoneType' object is not iterable

In [None]:
# Store video IDs so that subsequent runs will avoid duplicates
write_json_to_file("cache", list(processed_video_ids), "found_video_ids.json")