<a href="https://colab.research.google.com/github/ahmedshahriar/youtube-comment-scraper/blob/main/Youtube_comment_parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download Comments from Youtube videos

GitHub Repo : [youtube-comment-scraper](https://github.com/ahmedshahriar/youtube-comment-scraper)

In [7]:
# !pip install lxml
# The script is based on https://github.com/egbertbouman/youtube-comment-downloader

import pandas as pd
import json
import os
import sys
import re
import time

import lxml.html
import requests


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_service_ajax'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'

SORT_BY_POPULAR = 0
SORT_BY_RECENT = 1

YT_CFG_RE = r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;'
YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;\s*(?:var\s+meta|</script|\n)'

FILE_NAME = 'ytb_comments.csv'

def regex_search(text, pattern, group=1, default=None):
    match = re.search(pattern, text)
    return match.group(group) if match else default


def ajax_request(session, endpoint, ytcfg, retries=5, sleep=20):
    url = 'https://www.youtube.com' + endpoint['commandMetadata']['webCommandMetadata']['apiUrl']
    
    data = {'context': ytcfg['INNERTUBE_CONTEXT'],
            'continuation': endpoint['continuationCommand']['token']}

    for _ in range(retries):
        response = session.post(url, params={'key': ytcfg['INNERTUBE_API_KEY']}, json=data)
        if response.status_code == 200:
            return response.json()
        if response.status_code in [403, 413]:
            return {}
        else:
            time.sleep(sleep)


def download_comments(YOUTUBE_VIDEO_URL, sort_by=SORT_BY_RECENT, language=None, sleep=.1):
    session = requests.Session()
    session.headers['User-Agent'] = USER_AGENT

    # response = session.get(YOUTUBE_VIDEO_URL.format(youtube_id=youtube_id))
    response = session.get(YOUTUBE_VIDEO_URL)

    if 'uxe=' in response.request.url:
        session.cookies.set('CONSENT', 'YES+cb', domain='.youtube.com')
        response = session.get(YOUTUBE_VIDEO_URL.format(youtube_id=youtube_id))

    html = response.text
    ytcfg = json.loads(regex_search(html, YT_CFG_RE, default=''))
    if not ytcfg:
        return # Unable to extract configuration
    if language:
        ytcfg['INNERTUBE_CONTEXT']['client']['hl'] = language

    data = json.loads(regex_search(html, YT_INITIAL_DATA_RE, default=''))

    section = next(search_dict(data, 'itemSectionRenderer'), None)
    renderer = next(search_dict(section, 'continuationItemRenderer'), None) if section else None
    if not renderer:
        # Comments disabled?
        return

    needs_sorting = sort_by != SORT_BY_POPULAR
    continuations = [renderer['continuationEndpoint']]
    while continuations:
        continuation = continuations.pop()
        response = ajax_request(session, continuation, ytcfg)

        if not response:
            break
        if list(search_dict(response, 'externalErrorMessage')):
            raise RuntimeError('Error returned from server: ' + next(search_dict(response, 'externalErrorMessage')))

        if needs_sorting:
            sort_menu = next(search_dict(response, 'sortFilterSubMenuRenderer'), {}).get('subMenuItems', [])
            if sort_by < len(sort_menu):
                continuations = [sort_menu[sort_by]['serviceEndpoint']]
                needs_sorting = False
                continue
            raise RuntimeError('Failed to set sorting')

        actions = list(search_dict(response, 'reloadContinuationItemsCommand')) + \
                  list(search_dict(response, 'appendContinuationItemsAction'))
        for action in actions:
            for item in action.get('continuationItems', []):
                if action['targetId'] == 'comments-section':
                    # Process continuations for comments and replies.
                    continuations[:0] = [ep for ep in search_dict(item, 'continuationEndpoint')]
                if action['targetId'].startswith('comment-replies-item') and 'continuationItemRenderer' in item:
                    # Process the 'Show more replies' button
                    continuations.append(next(search_dict(item, 'buttonRenderer'))['command'])

        for comment in reversed(list(search_dict(response, 'commentRenderer'))):
            yield {'cid': comment['commentId'],
                   'text': ''.join([c['text'] for c in comment['contentText'].get('runs', [])]),
                   'time': comment['publishedTimeText']['runs'][0]['text'],
                   'author': comment.get('authorText', {}).get('simpleText', ''),
                   'channel': comment['authorEndpoint']['browseEndpoint'].get('browseId', ''),
                   'votes': comment.get('voteCount', {}).get('simpleText', '0'),
                   'photo': comment['authorThumbnail']['thumbnails'][-1]['url'],
                   'heart': next(search_dict(comment, 'isHearted'), False)}

        time.sleep(sleep)


def search_dict(partial, search_key):
    stack = [partial]
    while stack:
        current_item = stack.pop()
        if isinstance(current_item, dict):
            for key, value in current_item.items():
                if key == search_key:
                    yield value
                else:
                    stack.append(value)
        elif isinstance(current_item, list):
            for value in current_item:
                stack.append(value)


def main(url):
    df_comment = pd.DataFrame()
    try:
        youtube_url = url
        limit = 100

        print('Downloading Youtube comments for video:', youtube_url)

        count = 0

        start_time = time.time()

        for comment in download_comments(youtube_url):

            df_comment = df_comment.append(comment, ignore_index=True)

            # comments overview
            comment_json = json.dumps(comment, ensure_ascii=False)
            print(comment_json)

            count += 1

            if limit and count >= limit:
                break

        print(df_comment.shape, df_comment)

        if not os.path.isfile(FILE_NAME):
            df_comment.to_csv(FILE_NAME, encoding='utf-8', index=False)
        else:  # else it exists so append without writing the header
            df_comment.to_csv(FILE_NAME, mode='a', encoding='utf-8', index=False, header=False)

        print('\n[{:.2f} seconds] Done!'.format(time.time() - start_time))

    except Exception as e:
        print('Error:', str(e))
        sys.exit(1)                            



"""Dump to a csv  from a single video"""
# youtube_URL = 'https://www.youtube.com/watch?v=Ucbrmw2qtXs'
# main(youtube_URL)

"""
Dump to a csv from a a csv with video links
NB create a csv with one column titled 'link'
a sample is given below

'ytb_video_list.csv'

link
https://www.youtube.com/watch?v=-t_uhBBDbA4
https://www.youtube.com/watch?v=75vjjRza7IU
https://www.youtube.com/watch?v=j6dmaPzOBHY
https://www.youtube.com/watch?v=Yj2efyQV1RI
https://www.youtube.com/watch?v=HV652F7U6Qs
https://www.youtube.com/watch?v=47iXEucg3eo
https://www.youtube.com/watch?v=ofHXBLEE3TQ
https://www.youtube.com/watch?v=X6lGqSfVRT8
https://www.youtube.com/watch?v=a_-z9FhGBrE
https://www.youtube.com/watch?v=wTUM_4cVlE4


"""
# df_video_list = pd.read_csv('ytb_video_list.csv')
# print(df_video_list['link'].map(lambda x: main(x)))
# print(main(pd.read_csv('ytb_video_list.csv')['link']))


"""Dump to a csv from a a list with video links"""
ytb_video_list = ['https://www.youtube.com/watch?v=-t_uhBBDbA4',
                  'https://www.youtube.com/watch?v=75vjjRza7IU',
                  'https://www.youtube.com/watch?v=j6dmaPzOBHY']

for video_link in ytb_video_list:
    main(video_link)



Downloading Youtube comments for video: https://www.youtube.com/watch?v=-t_uhBBDbA4
{"cid": "UgxzyhukU_rsavDGIMd4AaABAg", "text": "দেশের বিমানবহরে যুক্ত হলো ৩য় ড্রিমলাইনার\r\nবিস্তারিত দেখুন ভিডিওতে...  https://youtu.be/EXhYI6jt3TQ", "time": "2 years ago", "author": "SOMOY TV", "channel": "UCxHoBXkY88Tb8z1Ssj6CWsQ", "votes": "195", "photo": "https://yt3.ggpht.com/ytc/AKedOLQoNgiIcd_JaIA7T4QFZI_O7evCZYQLtiTNAExKlg=s176-c-k-c0x00ffffff-no-rj", "heart": false}
{"cid": "Ugx8-sVgxCfShBuijjp4AaABAg", "text": "Hummmmm Hasina thik bolen i salut ✊✊", "time": "2 days ago", "author": "Towhidul Islam samir", "channel": "UCfihxkmCUK-1QRRji_HRMAw", "votes": "0", "photo": "https://yt3.ggpht.com/ytc/AKedOLQmS_bc4_wfrE9eMkpMh_AmijixiMYkcaUNriKGWjzPZJ0Dls4hjM0A2aG0_YM3=s176-c-k-c0x00ffffff-no-rj", "heart": false}
{"cid": "Ugz8Zy9tJpT3W2QnuZB4AaABAg", "text": "🐕🐕🐕🐕🐕🐕", "time": "3 days ago", "author": "MD MAMA", "channel": "UCtCmJ9vendVGq8G6T3tBhpQ", "votes": "0", "photo": "https://yt3.ggpht.com/V4qWsmYG

In [8]:
pd.read_csv('/content/ytb_comments.csv')

Unnamed: 0,SOMOY TV,UCxHoBXkY88Tb8z1Ssj6CWsQ,UgxzyhukU_rsavDGIMd4AaABAg,0.0,https://yt3.ggpht.com/ytc/AKedOLQoNgiIcd_JaIA7T4QFZI_O7evCZYQLtiTNAExKlg=s176-c-k-c0x00ffffff-no-rj,দেশের বিমানবহরে যুক্ত হলো ৩য় ড্রিমলাইনার\r\nবিস্তারিত দেখুন ভিডিওতে... https://youtu.be/EXhYI6jt3TQ,2 years ago,195
0,Towhidul Islam samir,UCfihxkmCUK-1QRRji_HRMAw,Ugx8-sVgxCfShBuijjp4AaABAg,0.0,https://yt3.ggpht.com/ytc/AKedOLQmS_bc4_wfrE9e...,Hummmmm Hasina thik bolen i salut ✊✊,2 days ago,0
1,MD MAMA,UCtCmJ9vendVGq8G6T3tBhpQ,Ugz8Zy9tJpT3W2QnuZB4AaABAg,0.0,https://yt3.ggpht.com/V4qWsmYGXd294V3HuF2WcX9_...,🐕🐕🐕🐕🐕🐕,3 days ago,0
2,Arafat Islam,UCo7m89N_oH_h4gt1y3eQ8nA,UgxKRPR5xs6hQLDwFtZ4AaABAg,0.0,https://yt3.ggpht.com/kiw8iRF-2McyhWWsrI5Vtdd6...,মাশাল্লা মাশাল্লা আলহামদুলিল্লাহ সুবহানাল্লাহ ...,3 days ago,1
3,Saleh Ahmed,UCt2GFONYqgmilBnqzzjLweg,UgxmzQOAWO7auk_PbP14AaABAg,0.0,https://yt3.ggpht.com/ytc/AKedOLSdu3LRQc4Txk-f...,Thanks,4 days ago,0
4,Mritunjoy sikder,UCMcz7ePgpE1PxZimzTMOfmw,UgyukkqTnj4ysATtWTt4AaABAg,0.0,https://yt3.ggpht.com/ytc/AKedOLQhKAjuXP624oQ7...,বাংলাদেশ তো কাঙাল দেশ।,4 days ago,0
...,...,...,...,...,...,...,...,...
594,Murad. Italy,UCzDnlnkWlNwo8O82byZXmkA,UgyZ_cUQ22v3xCP1AmZ4AaABAg,0.0,https://yt3.ggpht.com/ytc/AKedOLSu-A4a4uyieWuu...,হরঘত মুহাম্মদ সঃ কি ভালো মানুষ ছিলেন জৌবনে ল...,4 months ago,0
595,Md Ahajab,UCr4vVWO5ghBPCLFbenr9XOA,UgwQQs0J-hrT094ds5R4AaABAg,0.0,https://yt3.ggpht.com/ytc/AKedOLSejCV9CY-dyGqb...,মৃত্যুর বদলে মৃত্যু সমস্ত পাবলিকের সামনে তার গ...,4 months ago,4
596,fokhrul islam,UCWzBxrz7UeRipdyff1p7-0w,Ugz6QH2F_ssizsk9WH94AaABAg,0.0,https://yt3.ggpht.com/ytc/AKedOLTyPame5KL0lxv7...,আল্লাহ যেন এই ছেলে মেয়ে গুলোরে হেপাজতে রাখে এই...,4 months ago,5
597,The Blue River,UCKL5Umt9X2PjXMFRSlEzzew,Ugw_E5Ls0mSh1XQHkA94AaABAg,0.0,https://yt3.ggpht.com/ytc/AKedOLQi97Gte0EcTWYw...,এভাবে কারো মৃত্যু কামনা করি না । আমিও এর দৃষ্ট...,4 months ago,0
