# Packages

In [None]:
!pip install google-api-python-client
!pip install networkx



In [None]:
from googleapiclient.discovery import build
import json
import networkx as nx
import os
from collections import defaultdict

# Functions

In [None]:
def search_videos(query, max_results):
    videos = []
    next_page_token = None

    while len(videos) < max_results:
        remaining = max_results - len(videos)
        fetch_count = min(50, remaining)

        search_response = youtube.search().list(
            q=query,
            part='snippet',
            type='video',
            maxResults=fetch_count,
            pageToken=next_page_token
        ).execute()

        for item in search_response['items']:
            video_info = {
                'id': item['id']['videoId'],
                'title': item['snippet']['title'],
                'publishedAt': item['snippet']['publishedAt'],
                'channelId': item['snippet']['channelId'],
                'author': item['snippet']['channelTitle']
            }
            videos.append(video_info)

        next_page_token = search_response.get('nextPageToken')
        if not next_page_token:
            break

    return videos[:max_results]

In [None]:
def get_comments(video_id):
    comments = []

    def get_page(page_token=None):
        request = youtube.commentThreads().list(
            part='snippet,replies',
            videoId=video_id,
            maxResults=100,
            pageToken=page_token,
            textFormat='plainText'
        )
        return request.execute()

    response = get_page()
    while response:
        for item in response['items']:
            top_comment = item['snippet']['topLevelComment']['snippet']
            comment_data = {
                'commentId': item['id'],
                'text': top_comment['textDisplay'],
                'author': top_comment['authorDisplayName'],
                'publishedAt': top_comment['publishedAt'],
                'replies': []
            }

            if 'replies' in item:
                for reply in item['replies']['comments']:
                    reply_snippet = reply['snippet']
                    comment_data['replies'].append({
                        'commentId': reply['id'],
                        'text': reply_snippet['textDisplay'],
                        'author': reply_snippet['authorDisplayName'],
                        'publishedAt': reply_snippet['publishedAt'],
                    })

            comments.append(comment_data)

        if 'nextPageToken' in response:
            response = get_page(response['nextPageToken'])
        else:
            break

    return comments


In [None]:
def print_comment_tree(comments):
    for c in comments:
        print(f"{c['author']} said at {c['publishedAt']}: {c['text']}")
        for r in c['replies']:
            print(f"  ↳ {r['author']} replied at {r['publishedAt']}: {r['text']}")



In [None]:
def save_comment_tree_as_json(comments, filename='comment_tree.json'):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(comments, f, ensure_ascii=False, indent=2)

In [None]:
def fetch_replies_recursively(parent_id):
    replies = []
    next_page_token = None

    while True:
        request = youtube.comments().list(
            part='snippet',
            parentId=parent_id,
            textFormat='plainText',
            maxResults=100,
            pageToken=next_page_token
        )
        response = request.execute()

        for item in response.get('items', []):
            reply_snippet = item['snippet']
            reply_data = {
                'commentId': item['id'],
                'text': reply_snippet['textDisplay'],
                'author': reply_snippet['authorDisplayName'],
                'publishedAt': reply_snippet['publishedAt'],
                'replies': fetch_replies_recursively(item['id'])  # 🧠 recursion here!
            }
            replies.append(reply_data)

        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break

    return replies


In [44]:
from googleapiclient.errors import HttpError

def get_full_comment_tree(video_id):
    comments = []
    next_page_token = None

    try:
      while True:
          request = youtube.commentThreads().list(
              part='snippet',
              videoId=video_id,
              maxResults=100,
              textFormat='plainText',
              pageToken=next_page_token
          )
          response = request.execute()

          for item in response.get('items', []):
              top_comment_snippet = item['snippet']['topLevelComment']['snippet']
              comment_data = {
                  'commentId': item['id'],
                  'text': top_comment_snippet['textDisplay'],
                  'author': top_comment_snippet['authorDisplayName'],
                  'publishedAt': top_comment_snippet['publishedAt'],
                  'replies': fetch_replies_recursively(item['id'])
              }
              comments.append(comment_data)

          next_page_token = response.get('nextPageToken')
          if not next_page_token:
              break

    except HttpError as e:
      error_reason = e.error_details[0].get("reason", "") if e.error_details else ""
      if error_reason == "commentsDisabled":
          print(f"Skipping video '{video_id}' — comments are disabled.")
      else:
          print(f"HTTP error occurred for video '{video_id}': {e}")
    return comments


In [None]:
def get_max_depth(comments): #Not necessary depth is fixed at 2 in Youtube
    if not comments:
        return 0
    return 1 + max((get_max_depth(comment.get('replies', [])) for comment in comments), default=0)


# API

In [61]:
API_KEY=''
youtube = build('youtube', 'v3', developerKey=API_KEY)

# Query

In [60]:
query='parking bonaire valencia'
os.makedirs(query, exist_ok=True)
results_number=100

In [None]:
videos = search_videos('parking bonaire valencia', max_results=results_number)
with open(query+'/videos_data.json', 'w', encoding='utf-8') as f:
    json.dump(videos, f, ensure_ascii=False, indent=4)

# Comment Trees

In [36]:
query='parking bonaire valencia'
results_number=100

with open(query+'/videos_data.json', 'r') as f:
    data = json.load(f)
video_ids = [video['id'] for video in data]

In [64]:
output_folder = os.path.join(query,'comment_trees')
os.makedirs(output_folder, exist_ok=True)
json_files = [f for f in os.listdir(output_folder) if f.endswith('.json')]
json_ids = {os.path.splitext(f)[0] for f in json_files}
remaining_ids = [id_ for id_ in video_ids if id_ not in json_ids]

print(len(video_ids),video_ids)
print(len(json_ids),json_ids)
print(len(remaining_ids), remaining_ids)

100 ['p2JaYHVnJiA', '6BMKx4sQ2o8', '3nrMNFtgKOo', 'LRtSPJi4Byk', 'u4Tnc5Z3u-Y', '5Ml8j7L_4c8', 'IpnKlUAzkoQ', 'dP-fovsQUYc', '-1xTSrm3p_0', 'OaseDDoc4ZQ', 'JRheS8fDrWM', 'dgSM5iDeHzU', 'bLx6AErC2nQ', 'nfVyFlqr3Go', 'SSZfvpx4grQ', 'ivnS8ZE3AO4', 'YnDCuawdEDM', 'Fy6GnmVzChM', 'zzIzMlbZu3s', 'sKRRt855AwI', 'aPouPRfDNA4', 'rYEJUMFt0BI', '1oz3ndF76WA', 'mOnaqqncMx4', '-tzCe4R5beg', 'UozH6vKj_GU', 'LS1jakQlCQ8', 'qk9IjnHodmM', 'Jo6ZOV8NS3g', 'CHBrNCP1BIs', 'MZDtnUxTbUI', 'NOXfLILDlB0', 'n5PM0OWhHNY', 'FwRsnP2BEdI', 'IByXtKzZusE', '7cEX45Mstak', 'gx-46d9NpNY', '0_o_p_GjFD0', 'eSalGRSXG4Q', 'OaKyLCJ75WU', 's21f1Q_kyA0', 'd7Ybk89ZKVo', 'J1NGIwkRApA', '9l7rUQtW-9Y', '574Q_dyRNZM', 'krYkQ9Z6A1Y', 'GkRmMcMk8U0', 'aUZBuQJ3pKQ', 'BaCW1W2cU5g', 'DWvjDi1eDNQ', 'aUZBuQJ3pKQ', '3eC9MIYGsbI', 'UC0fAr1OGHk', 'Y51xfO35ENk', 'GosHhphzLTo', 'SzuClHgS0Ow', '9bcJt3mRCIo', 'py4Y0ac-0ks', 'cbYVznlbWqk', 'SJ576J3uULc', 'ta1W5lScquI', 'jVQ9PY6nmKc', 'mLqFm4gIuAc', '8nWSEWgaGlA', '6psSXZ0vReA', 'QKauHor_2Jw', 'yXKb

In [70]:
for element in remaining_ids:
  os.makedirs(output_folder, exist_ok=True)
  comment_tree =get_full_comment_tree(element)
  if comment_tree:
    save_comment_tree_as_json(comment_tree, filename=output_folder+'/'+element+'.json')
    print(element)
  else:
    print('empty: '+element)



empty: zzIzMlbZu3s
empty: IByXtKzZusE
Skipping video 'gx-46d9NpNY' — comments are disabled.
empty: gx-46d9NpNY
empty: cbYVznlbWqk
empty: jVQ9PY6nmKc
empty: hn9IpLpbkTA
empty: sxSJ-sWWlBw
empty: 2esskDJ8E_o
empty: EXEDXMjE6-U
empty: Oi0G7Lk80xE
empty: xQv4BeP6kmY
empty: sVjQzk7UpgA
empty: IFZ2EZcLsEg
empty: fha4HWsflnM


In [51]:
# Path to your JSON files
directory = output_folder

for filename in os.listdir(directory):
    if filename.endswith('.json'):
        filepath = os.path.join(directory, filename)
        try:
            with open(filepath, 'r') as f:
                content = f.read().strip()
                if not content or json.loads(content) in ({}, []):
                    os.remove(filepath)
                    print(f"Deleted empty/blank JSON file: {filename}")
        except (json.JSONDecodeError, OSError):
            print(f"Skipped unreadable or malformed file: {filename}")

Deleted empty/blank JSON file: sVjQzk7UpgA.json
Deleted empty/blank JSON file: ricGhCxX69s.json
Deleted empty/blank JSON file: hn9IpLpbkTA.json
Deleted empty/blank JSON file: jVQ9PY6nmKc.json
Deleted empty/blank JSON file: zzIzMlbZu3s.json
Deleted empty/blank JSON file: kWHWO4NCWac.json
Deleted empty/blank JSON file: vSI2Frq5aoA.json
Deleted empty/blank JSON file: GBCNh5W229Y.json
Deleted empty/blank JSON file: 84NorZ4KFiw.json
Deleted empty/blank JSON file: iUcnGXnQS6s.json
Deleted empty/blank JSON file: IFZ2EZcLsEg.json
Deleted empty/blank JSON file: IqAPFfIXTVQ.json
Deleted empty/blank JSON file: sxSJ-sWWlBw.json
Deleted empty/blank JSON file: Bp6UDhqeEe4.json
Deleted empty/blank JSON file: Oi0G7Lk80xE.json
Deleted empty/blank JSON file: gx-46d9NpNY.json
Deleted empty/blank JSON file: T7D-LlLFpDg.json
Deleted empty/blank JSON file: fha4HWsflnM.json
Deleted empty/blank JSON file: PoNSUbTCXE4.json
Deleted empty/blank JSON file: 5OGUwBDItGc.json
Deleted empty/blank JSON file: An2n874v-

# Graphs

In [71]:
# Undirected Bipartite graph generation where all authors are connected to the
# videos they have commented or responded on
B = nx.Graph()

authors = set()
edges =set()
input_path=os.path.join(query,'comment_trees')

for filename in os.listdir(input_path):
  if filename.endswith('.json'):
    video_id=filename[-16:-5]
    file_path = os.path.join(input_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
      try:
        comments = json.load(file)
        for comment in comments:
          authors.add(comment['author'])
          edges.add((video_id,comment['author']))

          for reply in comment['replies']:
            authors.add(reply['author'])
            edges.add((video_id,reply['author']))
      except json.JSONDecodeError:
                    print(f"Error decoding JSON in file {filename}")

B.add_nodes_from(video_ids, bipartite=0)
B.add_nodes_from(authors, bipartite=1)

B.add_edges_from(edges)
nx.write_graphml(B, query+"/graph_1.graphml")

In [72]:
# Directed bipartite graph generation where all authors are connected to the
# videos they on and to the other users they have responded to with strength
# equal to the number of comments/responses to that particular video/author
G = nx.DiGraph()

authors = set()
video_ids = set()

edge_type1_counts = defaultdict(int)
edge_type2_counts = defaultdict(int)
edge_type3_counts = defaultdict(int)

input_path = os.path.join(query, 'comment_trees')

for filename in os.listdir(input_path):
    if filename.endswith('.json'):
        video_id = filename[-16:-5]
        video_ids.add(video_id)
        file_path = os.path.join(input_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                comments = json.load(file)
                for comment in comments:
                    author = comment['author']
                    authors.add(author)

                    edge_type1_counts[(author, video_id)] += 1

                    for reply in comment.get('replies', []):
                        replier = reply['author']
                        authors.add(replier)

                        edge_type2_counts[(replier, video_id)] += 1

                        edge_type3_counts[(replier, author)] += 1
            except json.JSONDecodeError:
                print(f"Error decoding JSON in file {filename}")

# Add nodes
G.add_nodes_from(video_ids, type='video')
G.add_nodes_from(authors, type='author')

# Add Type 1 edges (non-directed, but we add them as edges with attribute 'type')
for (author, video), count in edge_type1_counts.items():
    G.add_edge(author, video, type='Comment', strength=count)

# Add Type 2 edges (directed)
for (replier, commenter), count in edge_type2_counts.items():
    G.add_edge(replier, commenter, type='Reply', strength=count)


# Add Type 3 edges (directed)
#for (replier, video), count in edge_type3_counts.items():
#    G.add_edge(replier, video, type='type3', strength=count)

nx.write_graphml(G, query+"/graph_2.graphml")

# Download

In [73]:
#Compress and download query
import shutil
from google.colab import files

shutil.make_archive(query, 'zip', query)
files.download(query+'.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>