In [1]:
import requests
from bs4 import BeautifulSoup
import urllib
import pickle
import os

import google.oauth2.credentials
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

import googleapiclient.discovery

from nltk import word_tokenize
from nltk.corpus import stopwords

import pymorphy2

import pandas as pd

In [2]:
CLIENT_SECRETS_FILE = "../../client_secret.json"
SCOPES = ['https://www.googleapis.com/auth/youtube.force-ssl']
API_SERVICE_NAME = 'youtube'
API_VERSION = 'v3'
os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'

In [3]:
def get_authenticated_service():
    credentials = None
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            credentials = pickle.load(token)
    #  Check if the credentials are invalid or do not exist
    if not credentials or not credentials.valid:
        # Check if the credentials have expired
        if credentials and credentials.expired and credentials.refresh_token:
            credentials.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                CLIENT_SECRETS_FILE, SCOPES)
            credentials = flow.run_console()
 
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(credentials, token)
 
    return build(API_SERVICE_NAME, API_VERSION, credentials = credentials)
 

In [4]:
service = get_authenticated_service()

In [5]:
def get_video_comments(service, **kwargs):
    summary = []
    results = service.commentThreads().list(**kwargs).execute()
    
    while results:
        for item in results['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            num_like = item['snippet']['topLevelComment']['snippet']['likeCount']
            num_reply = item['snippet']['totalReplyCount']
            summary.append({
                'comment' : comment,
                'num_like' : num_like,
                'num_reply' : num_reply
            })
 
        if 'nextPageToken' in results:
            kwargs['pageToken'] = results['nextPageToken']
            results = service.commentThreads().list(**kwargs).execute()
        else:
            break
 
    return pd.DataFrame(summary)

In [33]:
def check_word(word):
    if word.isalpha() and word not in stop:
        return morph.parse(word)[0].normal_form

    
def tokenize_sentence(sentence):
    words = word_tokenize(sentence)
    norm_words = []
    for word in words:
        cur_word = check_word(word)
        if cur_word is not None:
            norm_words.append(cur_word)
    return norm_words


def add_tokenize_columns(df, tokenized_col='tokenize_text', text_for_tokenize='comment', language='russian'):
    stop = set(stopwords.words(language))
    morph = pymorphy2.MorphAnalyzer()
    df[tokenized_col] = df[text_for_tokenize].apply(tokenize_sentence)
    return df

In [40]:
summary = get_video_comments(service, part='snippet', videoId='kOvdx1sz--U', textFormat='plainText')

In [42]:
%%timeit
add_tokenize_columns(summary)

2.8 s ± 375 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [45]:
summary = add_tokenize_columns(summary)

In [49]:
summary['tokenize_text'].to_list()

[['самоизоляция',
  'ввести',
  'протолкнуть',
  'поправка',
  'который',
  'никто',
  'проголосовать',
  'изоляция',
  'ждать',
  'июнь'],
 ['беспредел'],
 ['как', 'губер'],
 ['собирать',
  'толпа',
  'проверять',
  'пропуск',
  'поход',
  'мэр',
  'кончить',
  'хуесос',
  'весь',
  'власть',
  'цепной',
  'пёс',
  'роль',
  'сми',
  'исполнитель',
  'власть'],
 ['в', 'трава', 'сидеть', 'кузнечик', 'огуречик', 'неумненький'],
 ['он', 'просто', 'тот', 'подойти', 'артист', 'наверно', 'площадка'],
 ['мразь', 'маска', 'ещё', 'животный', 'пугать'],
 ['малодец'],
 ['классный',
  'правильный',
  'пацан',
  'уважуха',
  'такой',
  'этот',
  'тварь',
  'власть',
  'уровень',
  'охерель',
  'конец'],
 ['пойти',
  'нахуй',
  'мэр',
  'парень',
  'молодец',
  'заниматься',
  'спорт',
  'одиночка',
  'уебок',
  'толпа',
  'ходить',
  'заебывать',
  'человек',
  'мусор',
  'казак'],
 ['чувак',
  'красава',
  'сразу',
  'дать',
  'понять',
  'мера',
  'нужно',
  'работать',
  'херней',
  'заниматься