<a href="https://colab.research.google.com/github/ahmedshahriar/youtube-comment-scraper/blob/main/Youtube_comment_parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download Comments from Youtube videos

GitHub Repo : [youtube-comment-scraper](https://github.com/ahmedshahriar/youtube-comment-scraper)

In [6]:
!pip install lxml
# The script is based on https://github.com/egbertbouman/youtube-comment-downloader

import pandas as pd
import json
import os
import sys
import time

import lxml.html
import requests


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_service_ajax'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'

SORT_BY_POPULAR = 0
SORT_BY_RECENT = 1

FILE_NAME = 'ytb_comments.csv'

def find_value(html, key, num_chars=2, separator='"'):
    pos_begin = html.find(key) + len(key) + num_chars
    pos_end = html.find(separator, pos_begin)
    return html[pos_begin: pos_end]


def ajax_request(session, url, params=None, data=None, headers=None, retries=5, sleep=20):
    for _ in range(retries):
        response = session.post(url, params=params, data=data, headers=headers)
        if response.status_code == 200:
            return response.json()
        if response.status_code in [403, 413]:
            return {}
        else:
            time.sleep(sleep)


def download_comments(youtube_video_url, sort_by=SORT_BY_RECENT, sleep=.1):
    session = requests.Session()
    session.headers['User-Agent'] = USER_AGENT

    response = session.get(youtube_video_url)
    html = response.text
    session_token = find_value(html, 'XSRF_TOKEN', 3)
    session_token = session_token.encode('ascii').decode('unicode-escape')

    data = json.loads(find_value(html, 'var ytInitialData = ', 0, '};') + '}')
    for renderer in search_dict(data, 'itemSectionRenderer'):
        ncd = next(search_dict(renderer, 'nextContinuationData'), None)
        if ncd:
            break

    if not ncd:
        # Comments disabled?
        return

    needs_sorting = sort_by != SORT_BY_POPULAR
    continuations = [(ncd['continuation'], ncd['clickTrackingParams'], 'action_get_comments')]
    while continuations:
        continuation, itct, action = continuations.pop()
        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL,
                                params={action: 1,
                                        'pbj': 1,
                                        'ctoken': continuation,
                                        'continuation': continuation,
                                        'itct': itct},
                                data={'session_token': session_token},
                                headers={'X-YouTube-Client-Name': '1',
                                         'X-YouTube-Client-Version': '2.20201202.06.01'})

        if not response:
            break
        if list(search_dict(response, 'externalErrorMessage')):
            raise RuntimeError('Error returned from server: ' + next(search_dict(response, 'externalErrorMessage')))

        if needs_sorting:
            sort_menu = next(search_dict(response, 'sortFilterSubMenuRenderer'), {}).get('subMenuItems', [])
            if sort_by < len(sort_menu):
                ncd = sort_menu[sort_by]['continuation']['reloadContinuationData']
                continuations = [(ncd['continuation'], ncd['clickTrackingParams'], 'action_get_comments')]
                needs_sorting = False
                continue
            raise RuntimeError('Failed to set sorting')

        if action == 'action_get_comments':
            section = next(search_dict(response, 'itemSectionContinuation'), {})
            for continuation in section.get('continuations', []):
                ncd = continuation['nextContinuationData']
                continuations.append((ncd['continuation'], ncd['clickTrackingParams'], 'action_get_comments'))
            for item in section.get('contents', []):
                continuations.extend([(ncd['continuation'], ncd['clickTrackingParams'], 'action_get_comment_replies')
                                      for ncd in search_dict(item, 'nextContinuationData')])

        elif action == 'action_get_comment_replies':
            continuations.extend([(ncd['continuation'], ncd['clickTrackingParams'], 'action_get_comment_replies')
                                  for ncd in search_dict(response, 'nextContinuationData')])

        for comment in search_dict(response, 'commentRenderer'):
            yield {'cid': comment['commentId'],
                   'text': ''.join([c['text'] for c in comment['contentText']['runs']]),
                   'time': comment['publishedTimeText']['runs'][0]['text'],
                   'author': comment.get('authorText', {}).get('simpleText', ''),
                   'channel': comment['authorEndpoint']['browseEndpoint']['browseId'],
                   'votes': comment.get('voteCount', {}).get('simpleText', '0'),
                   'photo': comment['authorThumbnail']['thumbnails'][-1]['url'],
                   'heart': next(search_dict(comment, 'isHearted'), False)}

        time.sleep(sleep)


def search_dict(partial, search_key):
    stack = [partial]
    while stack:
        current_item = stack.pop()
        if isinstance(current_item, dict):
            for key, value in current_item.items():
                if key == search_key:
                    yield value
                else:
                    stack.append(value)
        elif isinstance(current_item, list):
            for value in current_item:
                stack.append(value)


def main(url):
    df_comment = pd.DataFrame()
    try:
        youtube_url = url
        limit = 100

        print('Downloading Youtube comments for video:', youtube_url)

        count = 0

        start_time = time.time()

        for comment in download_comments(youtube_url):

            df_comment = df_comment.append(comment, ignore_index=True)

            # comments overview
            comment_json = json.dumps(comment, ensure_ascii=False)
            print(comment_json)

            count += 1

            if limit and count >= limit:
                break

        print(df_comment.shape, df_comment)

        if not os.path.isfile(FILE_NAME):
            df_comment.to_csv(FILE_NAME, encoding='utf-8', index=False)
        else:  # else it exists so append without writing the header
            df_comment.to_csv(FILE_NAME, mode='a', encoding='utf-8', index=False, header=False)

        print('\n[{:.2f} seconds] Done!'.format(time.time() - start_time))

    except Exception as e:
        print('Error:', str(e))
        sys.exit(1)                            



"""Dump to a csv  from a single video"""
# youtube_URL = 'https://www.youtube.com/watch?v=Ucbrmw2qtXs'
# main(youtube_URL)

"""
Dump to a csv from a a csv with video links
NB create a csv with one column titled 'link'
a sample is given below

'ytb_video_list.csv'

link
https://www.youtube.com/watch?v=-t_uhBBDbA4
https://www.youtube.com/watch?v=75vjjRza7IU
https://www.youtube.com/watch?v=j6dmaPzOBHY
https://www.youtube.com/watch?v=Yj2efyQV1RI
https://www.youtube.com/watch?v=HV652F7U6Qs
https://www.youtube.com/watch?v=47iXEucg3eo
https://www.youtube.com/watch?v=ofHXBLEE3TQ
https://www.youtube.com/watch?v=X6lGqSfVRT8
https://www.youtube.com/watch?v=a_-z9FhGBrE
https://www.youtube.com/watch?v=wTUM_4cVlE4


"""
# df_video_list = pd.read_csv('ytb_video_list.csv')
# print(df_video_list['link'].map(lambda x: main(x)))
# print(main(pd.read_csv('ytb_video_list.csv')['link']))


"""Dump to a csv from a a list with video links"""
ytb_video_list = ['https://www.youtube.com/watch?v=-t_uhBBDbA4',
                  'https://www.youtube.com/watch?v=75vjjRza7IU',
                  'https://www.youtube.com/watch?v=j6dmaPzOBHY']

for video_link in ytb_video_list:
    main(video_link)



Downloading Youtube comments for video: https://www.youtube.com/watch?v=-t_uhBBDbA4
{"cid": "UgydPjgooSE6rLYgbY14AaABAg", "text": "জয় বাংলা", "time": "3 weeks ago", "author": "rony98 Gplaypoint", "channel": "UCQ8qCyqI9GZrBIqejvH9bZQ", "votes": "0", "photo": "https://yt3.ggpht.com/ytc/AAUvwnhcqYhzT2fHru-siwKzYkMUfSy2MxRQij_5MLWU=s176-c-k-c0x00ffffff-no-rj", "heart": false}
{"cid": "UgwHNIIqu9qeXFmlnax4AaABAg", "text": "জয়তু শেখ হাসিনা", "time": "3 weeks ago", "author": "Shamol Singho", "channel": "UC8Hl6YWHnVvC3aluFzgu_IQ", "votes": "0", "photo": "https://yt3.ggpht.com/ytc/AAUvwnjfRjST3R4IK7KlMldlDaV2o9E9fZDgDm1j=s176-c-k-c0x00ffffff-no-rj", "heart": false}
{"cid": "UgyrpSuabnF-7oHKaSB4AaABAg", "text": "Cricket six gol 😜😜😜", "time": "2 weeks ago", "author": "Anoop Anu", "channel": "UC2IkDAAJKTeO3ClaNuQHXHQ", "votes": "0", "photo": "https://yt3.ggpht.com/ytc/AAUvwnhvhxL9bhLJd3z7he6t0KJJRYLFhzsUXREefQ=s176-c-k-c0x00ffffff-no-rj", "heart": false}
{"cid": "Ugw6F7nk9z7DqiY1AWl4AaABAg", "text

In [7]:
pd.read_csv('/content/ytb_comments.csv')

Unnamed: 0,author,channel,cid,heart,photo,text,time,votes
0,rony98 Gplaypoint,UCQ8qCyqI9GZrBIqejvH9bZQ,UgydPjgooSE6rLYgbY14AaABAg,0.0,https://yt3.ggpht.com/ytc/AAUvwnhcqYhzT2fHru-s...,জয় বাংলা,3 weeks ago,0
1,Shamol Singho,UC8Hl6YWHnVvC3aluFzgu_IQ,UgwHNIIqu9qeXFmlnax4AaABAg,0.0,https://yt3.ggpht.com/ytc/AAUvwnjfRjST3R4IK7Kl...,জয়তু শেখ হাসিনা,3 weeks ago,0
2,Anoop Anu,UC2IkDAAJKTeO3ClaNuQHXHQ,UgyrpSuabnF-7oHKaSB4AaABAg,0.0,https://yt3.ggpht.com/ytc/AAUvwnhvhxL9bhLJd3z7...,Cricket six gol 😜😜😜,2 weeks ago,0
3,Md Gaour,UCJ0rbHG-NrR8iGaWHlizMLQ,Ugw6F7nk9z7DqiY1AWl4AaABAg,0.0,https://yt3.ggpht.com/ytc/AAUvwnjXimv7hNWTX21d...,তবে যতক্ষণ পর্যন্ত নিজেকে নিজে পরিবর্তন না করত...,2 weeks ago,0
4,Md Gaour,UCJ0rbHG-NrR8iGaWHlizMLQ,UgymHZARScenFjRx5KZ4AaABAg,0.0,https://yt3.ggpht.com/ytc/AAUvwnjXimv7hNWTX21d...,তবে চেষ্টার মাধ্যমে ভাগ্য নির্ধারণ হয় আল্লায়...,2 weeks ago,0
5,Imam azhar,UCJQZIQz3A32BD1PHy63J4Ww,Ugywcqw7UXYxKkulFFl4AaABAg,0.0,https://yt3.ggpht.com/ytc/AAUvwniYFe2uvzmGxZox...,I love my bangladesh biban,2 weeks ago,0
6,Rubel Rana,UCFhBnI4EVJnxEFP-3mJ4xqA,UgziVVeeR0gX8hBD9JN4AaABAg,0.0,https://yt3.ggpht.com/ytc/AAUvwnj_BMQQNXZliLTM...,Mother of Mafia Sheikh Hasina.We want democrac...,2 weeks ago,0
7,All R O U N D E R's,UC0gOZuBuQjDOf0u6YCx_p-w,UgygPey4y9eiT8kG9B54AaABAg,0.0,https://yt3.ggpht.com/ytc/AAUvwngEiswWy8FuEjBx...,ভাঙা বিমান,2 weeks ago,0
8,monir ahmead,UCGnTfCR2Rl79VwlhrAfjryg,UgyMEovJUdehRIi7a6h4AaABAg,0.0,https://yt3.ggpht.com/ytc/AAUvwniT7uHL-OJgjhxH...,ইতিহাস কথা বলে!!! ৩ মে ১৯৮১। দিল্লীতে নির্বাস...,2 weeks ago,0
9,akash skh,UCNvTAnXYKI9z5S8bA_SWwng,UgyJOXLvn9SOu1hvMNl4AaABAg,0.0,https://yt3.ggpht.com/ytc/AAUvwnhbt9kgX6YDfATI...,2021 সালে কে কে এইটা দেখতেছো???,2 weeks ago,1
