In [73]:
!pip install google-api-python-client
!pip install networkx



In [72]:
from googleapiclient.discovery import build
import json
import networkx as nx
import os

In [None]:
def search_videos(query, max_results=5):
    """Search YouTube for videos matching ``query``.

    Uses the module-level ``youtube`` API client, which must exist before
    this function is called.

    Args:
        query: Free-text search phrase, sent as the ``q`` parameter.
        max_results: Value sent as ``maxResults`` (defaults to 5).

    Returns:
        A list of ``{'videoId': ..., 'title': ...}`` dicts, one per item of
        the response, in response order.
    """
    search_params = {
        'q': query,
        'part': 'snippet',
        'type': 'video',
        'maxResults': max_results,
    }
    payload = youtube.search().list(**search_params).execute()

    found = []
    for entry in payload['items']:
        found.append({
            'videoId': entry['id']['videoId'],
            'title': entry['snippet']['title'],
        })
    return found


In [None]:
def get_comments(video_id):
    """Collect the comment threads of a video, with their embedded replies.

    Pages through ``commentThreads().list`` on the module-level ``youtube``
    client (100 threads per page, plain-text formatting), following
    ``nextPageToken`` until a response no longer carries one.

    Args:
        video_id: ID of the video whose comment threads are requested.

    Returns:
        A list of dicts with keys ``commentId``, ``text``, ``author``,
        ``publishedAt`` and ``replies``. ``replies`` is a list of dicts with
        the first four keys, built from the ``replies.comments`` list embedded
        in the thread item (empty when the item has no ``replies`` key).
        NOTE(review): the embedded list may not contain every reply of a
        thread - confirm against the API documentation.
    """

    def _fetch(token=None):
        # One page of comment threads; token=None requests the first page.
        return youtube.commentThreads().list(
            part='snippet,replies',
            videoId=video_id,
            maxResults=100,
            pageToken=token,
            textFormat='plainText',
        ).execute()

    def _flatten(snippet, raw_id):
        # Keep only the fields of interest from a comment snippet.
        return {
            'commentId': raw_id,
            'text': snippet['textDisplay'],
            'author': snippet['authorDisplayName'],
            'publishedAt': snippet['publishedAt'],
        }

    threads = []
    page = _fetch()
    while page:
        for thread in page['items']:
            record = _flatten(
                thread['snippet']['topLevelComment']['snippet'], thread['id']
            )
            if 'replies' in thread:
                record['replies'] = [
                    _flatten(child['snippet'], child['id'])
                    for child in thread['replies']['comments']
                ]
            else:
                record['replies'] = []
            threads.append(record)

        # Stop as soon as the response carries no continuation token.
        if 'nextPageToken' not in page:
            break
        page = _fetch(page['nextPageToken'])

    return threads


In [None]:
def print_comment_tree(comments):
    """Print a two-level comment tree to standard output.

    Each top-level comment is printed on its own line, followed by one
    indented line (prefixed with an arrow) per entry of its ``replies`` list.

    Args:
        comments: Iterable of dicts with keys ``author``, ``publishedAt``,
            ``text`` and ``replies``; each reply needs the first three keys.
    """
    for thread in comments:
        headline = f"{thread['author']} said at {thread['publishedAt']}: {thread['text']}"
        print(headline)
        for answer in thread['replies']:
            reply_line = f"  ↳ {answer['author']} replied at {answer['publishedAt']}: {answer['text']}"
            print(reply_line)



In [None]:
def save_comment_tree_as_json(comments, filename='comment_tree.json'):
    """Write ``comments`` to ``filename`` as UTF-8 encoded, indented JSON.

    Non-ASCII characters are written as-is (not ``\\u`` escaped) and the
    output is indented by two spaces. An existing file is overwritten.

    Args:
        comments: JSON-serialisable comment tree.
        filename: Destination path (defaults to ``comment_tree.json``).
    """
    dump_options = {'ensure_ascii': False, 'indent': 2}
    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(comments, out_file, **dump_options)

In [None]:
def fetch_replies_recursively(parent_id):
    """Fetch every reply below ``parent_id``, descending into replies of replies.

    Pages through ``comments().list`` on the module-level ``youtube`` client
    (100 replies per page, plain-text formatting) and calls itself once for
    every reply found, so each returned reply carries its own ``replies`` list.

    NOTE(review): the comment on ``get_max_depth`` in this notebook says
    YouTube threads are at most two levels deep. If so, the nested call made
    for each reply costs one extra request and always yields an empty list -
    confirm before relying on this for large videos.

    Args:
        parent_id: ID of the comment whose replies are requested.

    Returns:
        A list of dicts with keys ``commentId``, ``text``, ``author``,
        ``publishedAt`` and ``replies`` (the recursively fetched children).
    """
    collected = []
    token = None
    has_more = True

    while has_more:
        page = youtube.comments().list(
            part='snippet',
            parentId=parent_id,
            textFormat='plainText',
            maxResults=100,
            pageToken=token,
        ).execute()

        for entry in page.get('items', []):
            body = entry['snippet']
            node = {
                'commentId': entry['id'],
                'text': body['textDisplay'],
                'author': body['authorDisplayName'],
                'publishedAt': body['publishedAt'],
            }
            # Descend before moving on to the next sibling (depth-first).
            node['replies'] = fetch_replies_recursively(entry['id'])
            collected.append(node)

        # A missing or empty token means this was the last page.
        token = page.get('nextPageToken')
        has_more = bool(token)

    return collected


In [None]:
def get_full_comment_tree(video_id):
    """Build the comment tree of a video, fetching replies separately.

    Pages through ``commentThreads().list`` on the module-level ``youtube``
    client (100 threads per page, plain-text formatting). For every thread,
    the replies are obtained by calling ``fetch_replies_recursively`` with
    the thread ID.

    Args:
        video_id: ID of the video whose comments are requested.

    Returns:
        A list of dicts with keys ``commentId``, ``text``, ``author``,
        ``publishedAt`` and ``replies``, in the order the threads arrive.
    """

    def _iter_pages():
        # Lazily yield response pages; the next page is only requested once
        # the caller has finished processing the current one.
        token = None
        while True:
            page = youtube.commentThreads().list(
                part='snippet',
                videoId=video_id,
                maxResults=100,
                textFormat='plainText',
                pageToken=token,
            ).execute()
            yield page
            token = page.get('nextPageToken')
            if not token:
                return

    tree = []
    for page in _iter_pages():
        for thread in page.get('items', []):
            head = thread['snippet']['topLevelComment']['snippet']
            tree.append({
                'commentId': thread['id'],
                'text': head['textDisplay'],
                'author': head['authorDisplayName'],
                'publishedAt': head['publishedAt'],
                'replies': fetch_replies_recursively(thread['id']),
            })
    return tree


In [71]:
def get_max_depth(comments):
    """Return the number of nesting levels in a comment tree.

    An empty (or falsy) list has depth 0; a list of comments without replies
    has depth 1; each further level of ``replies`` adds one.

    The original author notes this helper is not strictly necessary because
    the depth is fixed at 2 on YouTube.

    Args:
        comments: List of comment dicts; a missing ``replies`` key is treated
            as an empty list.

    Returns:
        The depth of the deepest branch, as an int.
    """
    if not comments:
        return 0

    deepest_child = 0
    for entry in comments:
        child_depth = get_max_depth(entry.get('replies', []))
        if child_depth > deepest_child:
            deepest_child = child_depth
    return deepest_child + 1


In [74]:
# The YouTube Data API key is read from the environment so the secret never
# lives in the notebook. Set it before starting Jupyter, e.g.
#   export YOUTUBE_API_KEY=<your key>
# SECURITY: the key that used to be hard-coded in this cell has been exposed
# and should be revoked/rotated.
API_KEY = os.environ.get('YOUTUBE_API_KEY', '')
if not API_KEY:
    print('WARNING: YOUTUBE_API_KEY is not set; YouTube API requests are expected to fail.')

# Search phrase; the same string is used as the name of the output folder.
query = 'parking bonaire valencia'

# Number of search results (videos) to process.
results_number = 3

In [85]:
# Build the YouTube Data API v3 client; the helper functions defined earlier
# read it through the global name `youtube`.
youtube = build('youtube', 'v3', developerKey=API_KEY)

# Search with the configured `query` (rather than a duplicated literal) so the
# videos always match the folder the results are stored in.
videos = search_videos(query, max_results=results_number)
video_ids = [video['videoId'] for video in videos]

In [96]:
# One JSON comment tree is written per video into <query>/comment_trees/.
# The folder is created once, up front; makedirs also creates the parent
# <query> folder, so no separate call is needed for it.
output_folder = os.path.join(query, 'comment_trees')
os.makedirs(output_folder, exist_ok=True)

for video_id in video_ids:
  print(video_id)  # progress indicator: one line per processed video
  comment_tree = get_full_comment_tree(video_id)
  save_comment_tree_as_json(
      comment_tree,
      filename=os.path.join(output_folder, video_id + '.json'),
  )

p2JaYHVnJiA
6BMKx4sQ2o8
dgSM5iDeHzU


In [108]:
# Build a bipartite graph linking each video to every author who commented
# on it or replied to a comment on it, from the JSON files saved on disk.
B = nx.Graph()

video_nodes = set()  # video ids, taken from the file names actually found
authors = set()      # author names as stored in the JSON files
edges = set()        # (video_id, author) pairs; the set removes duplicates
input_path = os.path.join(query, 'comment_trees')

for filename in os.listdir(input_path):
  # Files are named <video_id>.json; derive the id from the name itself
  # instead of assuming a fixed id length.
  video_id, extension = os.path.splitext(filename)
  if extension != '.json':
    continue

  # Every video with a file becomes a node, even if it has no comments or
  # its file cannot be parsed.
  video_nodes.add(video_id)

  file_path = os.path.join(input_path, filename)
  with open(file_path, 'r', encoding='utf-8') as file:
    try:
      comments = json.load(file)
    except json.JSONDecodeError:
      print(f"Error decoding JSON in file {filename}")
      continue

  for comment in comments:
    authors.add(comment['author'])
    edges.add((video_id, comment['author']))

    # Only the first level of replies is read here.
    for reply in comment['replies']:
      authors.add(reply['author'])
      edges.add((video_id, reply['author']))

# Tag both node sets so every node carries a `bipartite` attribute. Using the
# ids found on disk (not a variable left over from the search cell) keeps this
# cell runnable on its own and consistent with the edges added below.
B.add_nodes_from(video_nodes, bipartite=0)
B.add_nodes_from(authors, bipartite=1)

B.add_edges_from(edges)
nx.write_graphml(B, os.path.join(query, "graph_1.graphml"))

In [106]:
# Show every (video, author) edge currently stored in the graph B.
print(B.edges)

[('p2JaYHVnJiA', '@jorgemanuelcasanova133'), ('p2JaYHVnJiA', '@marianonunez1579'), ('p2JaYHVnJiA', '@PandolfoCarla'), ('p2JaYHVnJiA', '@juansanmartinfierro7813'), ('p2JaYHVnJiA', '@honneybearr'), ('p2JaYHVnJiA', '@paje7912'), ('p2JaYHVnJiA', '@alexandu7425'), ('p2JaYHVnJiA', '@YsbethPerez-xn4uy'), ('p2JaYHVnJiA', '@danielgarrido2012'), ('p2JaYHVnJiA', '@vicentpiquersanchis5186'), ('p2JaYHVnJiA', '@SilviadeDamianelpalas'), ('p2JaYHVnJiA', '@montsebarberacasajuana3623'), ('p2JaYHVnJiA', '@margaritasolis2170'), ('p2JaYHVnJiA', '@petrisraz1551'), ('p2JaYHVnJiA', '@Sonia-e3i6h'), ('p2JaYHVnJiA', '@ceciliamedinagarcia5425'), ('p2JaYHVnJiA', '@chusrodriguez1018'), ('p2JaYHVnJiA', '@artistrash'), ('p2JaYHVnJiA', '@pelusin2335'), ('p2JaYHVnJiA', '@mejoresmomentostwich'), ('p2JaYHVnJiA', '@mizukowa'), ('p2JaYHVnJiA', '@mamapipi4493'), ('p2JaYHVnJiA', '@RamonCervo'), ('p2JaYHVnJiA', '@jennarentz'), ('p2JaYHVnJiA', '@Mkwd-'), ('p2JaYHVnJiA', '@piruskaperez9963'), ('p2JaYHVnJiA', '@vasily92884'), (