# Packages

In [1]:
!pip install google-api-python-client
!pip install networkx



In [2]:
from googleapiclient.discovery import build
import json
import networkx as nx
import os
from collections import defaultdict

# Functions

In [3]:
def search_videos(query, max_results=5):
    request = youtube.search().list(
        q=query,
        part='snippet',
        type='video',
        maxResults=max_results
    )
    response = request.execute()
    videos = [
        {'videoId': item['id']['videoId'], 'title': item['snippet']['title']}
        for item in response['items']
    ]
    return videos


In [4]:
def get_comments(video_id):
    comments = []

    def get_page(page_token=None):
        request = youtube.commentThreads().list(
            part='snippet,replies',
            videoId=video_id,
            maxResults=100,
            pageToken=page_token,
            textFormat='plainText'
        )
        return request.execute()

    response = get_page()
    while response:
        for item in response['items']:
            top_comment = item['snippet']['topLevelComment']['snippet']
            comment_data = {
                'commentId': item['id'],
                'text': top_comment['textDisplay'],
                'author': top_comment['authorDisplayName'],
                'publishedAt': top_comment['publishedAt'],
                'replies': []
            }

            if 'replies' in item:
                for reply in item['replies']['comments']:
                    reply_snippet = reply['snippet']
                    comment_data['replies'].append({
                        'commentId': reply['id'],
                        'text': reply_snippet['textDisplay'],
                        'author': reply_snippet['authorDisplayName'],
                        'publishedAt': reply_snippet['publishedAt'],
                    })

            comments.append(comment_data)

        if 'nextPageToken' in response:
            response = get_page(response['nextPageToken'])
        else:
            break

    return comments


In [5]:
def print_comment_tree(comments):
    for c in comments:
        print(f"{c['author']} said at {c['publishedAt']}: {c['text']}")
        for r in c['replies']:
            print(f"  ↳ {r['author']} replied at {r['publishedAt']}: {r['text']}")



In [6]:
def save_comment_tree_as_json(comments, filename='comment_tree.json'):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(comments, f, ensure_ascii=False, indent=2)

In [7]:
def fetch_replies_recursively(parent_id):
    replies = []
    next_page_token = None

    while True:
        request = youtube.comments().list(
            part='snippet',
            parentId=parent_id,
            textFormat='plainText',
            maxResults=100,
            pageToken=next_page_token
        )
        response = request.execute()

        for item in response.get('items', []):
            reply_snippet = item['snippet']
            reply_data = {
                'commentId': item['id'],
                'text': reply_snippet['textDisplay'],
                'author': reply_snippet['authorDisplayName'],
                'publishedAt': reply_snippet['publishedAt'],
                'replies': fetch_replies_recursively(item['id'])  # 🧠 recursion here!
            }
            replies.append(reply_data)

        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break

    return replies


In [8]:
def get_full_comment_tree(video_id):
    comments = []
    next_page_token = None

    while True:
        request = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            maxResults=100,
            textFormat='plainText',
            pageToken=next_page_token
        )
        response = request.execute()

        for item in response.get('items', []):
            top_comment_snippet = item['snippet']['topLevelComment']['snippet']
            comment_data = {
                'commentId': item['id'],
                'text': top_comment_snippet['textDisplay'],
                'author': top_comment_snippet['authorDisplayName'],
                'publishedAt': top_comment_snippet['publishedAt'],
                'replies': fetch_replies_recursively(item['id'])
            }
            comments.append(comment_data)

        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break

    return comments


In [9]:
def get_max_depth(comments): #Not necessary depth is fixed at 2 in Youtube
    if not comments:
        return 0
    return 1 + max((get_max_depth(comment.get('replies', [])) for comment in comments), default=0)


# Query

In [19]:
API_KEY=query='parking bonaire valencia'
results_number=100

In [20]:
youtube = build('youtube', 'v3', developerKey=API_KEY)
videos = search_videos('parking bonaire valencia', max_results=results_number)
video_ids = [video['videoId'] for video in videos]

# Comment Trees

In [21]:
os.makedirs(query, exist_ok=True)
for element in video_ids:
  print(element)
  output_folder = os.path.join(query,'comment_trees')
  os.makedirs(output_folder, exist_ok=True)
  comment_tree =get_full_comment_tree(element)
  save_comment_tree_as_json(comment_tree, filename=output_folder+'/'+element+'.json')

p2JaYHVnJiA
6BMKx4sQ2o8
dgSM5iDeHzU
IpnKlUAzkoQ
3nrMNFtgKOo
LRtSPJi4Byk
dP-fovsQUYc
-1xTSrm3p_0
5Ml8j7L_4c8
OaseDDoc4ZQ
u4Tnc5Z3u-Y
JRheS8fDrWM
ivnS8ZE3AO4
sKRRt855AwI
Fy6GnmVzChM
aPouPRfDNA4
YnDCuawdEDM
bLx6AErC2nQ
SSZfvpx4grQ
-tzCe4R5beg
KnYxJtHgP5I
nfVyFlqr3Go
0_o_p_GjFD0
AUerKkWslP8
zzIzMlbZu3s
eSalGRSXG4Q
CHBrNCP1BIs




HttpError: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/comments?part=snippet&parentId=UgzOWM9eVWnoobjgbaZ4AaABAg.AARX_4CJ2xMAARkGEhu9Ch&textFormat=plainText&maxResults=100&key=AIzaSyAqceVKcmEXPp0ZEWZMfHsILgXMPMRDSW8&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">

# Graphs

In [23]:
# Undirected Bipartite graph generation where all authors are connected to the
# videos they have commented or responded on
B = nx.Graph()

authors = set()
edges =set()
input_path=os.path.join(query,'comment_trees')

for filename in os.listdir(input_path):
  if filename.endswith('.json'):
    video_id=filename[-16:-5]
    file_path = os.path.join(input_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
      try:
        comments = json.load(file)
        for comment in comments:
          authors.add(comment['author'])
          edges.add((video_id,comment['author']))

          for reply in comment['replies']:
            authors.add(reply['author'])
            edges.add((video_id,reply['author']))
      except json.JSONDecodeError:
                    print(f"Error decoding JSON in file {filename}")

B.add_nodes_from(video_ids, bipartite=0)
B.add_nodes_from(authors, bipartite=1)

B.add_edges_from(edges)
nx.write_graphml(B, query+"/graph_1.graphml")

In [27]:
# Directed bipartite graph generation where all authors are connected to the
# videos they on and to the other users they have responded to with strength
# equal to the number of comments/responses to that particular video/author
G = nx.DiGraph()

authors = set()
video_ids = set()

edge_type1_counts = defaultdict(int)
edge_type2_counts = defaultdict(int)
edge_type3_counts = defaultdict(int)

input_path = os.path.join(query, 'comment_trees')

for filename in os.listdir(input_path):
    if filename.endswith('.json'):
        video_id = filename[-16:-5]
        video_ids.add(video_id)
        file_path = os.path.join(input_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                comments = json.load(file)
                for comment in comments:
                    author = comment['author']
                    authors.add(author)

                    edge_type1_counts[(author, video_id)] += 1

                    for reply in comment.get('replies', []):
                        replier = reply['author']
                        authors.add(replier)

                        edge_type2_counts[(replier, video_id)] += 1

                        edge_type3_counts[(replier, author)] += 1
            except json.JSONDecodeError:
                print(f"Error decoding JSON in file {filename}")

# Add nodes
G.add_nodes_from(video_ids, type='video')
G.add_nodes_from(authors, type='author')

# Add Type 1 edges (non-directed, but we add them as edges with attribute 'type')
for (author, video), count in edge_type1_counts.items():
    G.add_edge(author, video, type='Comment', strength=count)

# Add Type 2 edges (directed)
for (replier, commenter), count in edge_type2_counts.items():
    G.add_edge(replier, commenter, type='Reply', strength=count)


# Add Type 3 edges (directed)
#for (replier, video), count in edge_type3_counts.items():
#    G.add_edge(replier, video, type='type3', strength=count)

nx.write_graphml(G, query+"/graph_2.graphml")

# Download

In [29]:
#Compress and download query
import shutil
from google.colab import files

shutil.make_archive(query, 'zip', query)
files.download(query+'.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>