In [1]:
# get html and return subject polarity
# Find related videos from other channels to compare topic polarization
import os
import pandas as pd
from scipy import spatial
import spacy
from spacy import displacy
from spacy.tokens import Doc
import nltk
from nltk.chunk import conlltags2tree
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#nltk.download('vader_lexicon')

import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import config

nlp = spacy.load('en_core_web_lg')

key = os.path.join(config.dirs['apis'],'youtube_secret_key.json')
SCOPES = 'https://www.googleapis.com/auth/youtube.force-ssl'
API_SERVICE_NAME = 'youtube'
API_VERSION = 'v3'

def get_authenticated_service():
    flow = InstalledAppFlow.from_client_secrets_file(key, SCOPES)
    credentials = flow.run_console()
    return build(API_SERVICE_NAME, API_VERSION, credentials = credentials)

def get_videos(service, **kwargs):
    final_results = []
    results = service.search().list(**kwargs).execute()
 
    i = 0
    max_pages = 3
    while results and i < max_pages:
        final_results.extend(results['items'])
        # Check if another page exists
        if 'nextPageToken' in results:
            kwargs['pageToken'] = results['nextPageToken']
            results = service.search().list(**kwargs).execute()
            i += 1
        else:
            break
 
    return final_results

def search_videos_by_keyword(service, **kwargs):
    results = get_videos(service, **kwargs)
    final_results = []
    for item in results:
        title = item['snippet']['title']
        video_id = item['id']['videoId']
        comments = get_video_comments(service, part='snippet', videoId=video_id, textFormat='plainText')
        final_results.extend([(video_id, title, comment) for comment in comments])
    return final_results

def find_channel_id(service, search_term):
    request = service.channels().list(
    forUsername=search_term,
    part='id,snippet'
    ).execute()
    channel_id = request['items'][0]['id']
    return channel_id

def search_channels(service, search_term):
    request = service.search().list(
        q=search_term,
        part='id,snippet',
        type='channel',
        maxResults='20'
    ).execute()
    
    channel_df = pd.DataFrame(columns=['channel_id','channel'])
    for results in request:
        chan = request['items'][0]['channelTitle']
        c_id = request['items'][0]['channelId']
        channel_df = channel_df.append(pd.Series([chan, c_id],index=channel_df.columns), ignore_index = True)
    
    return channel_df

def get_video_details(service, video_id):
    # takes a video_id and returns the video title, channel, and description

def get_video_comments(service, **kwargs):
    # results = service.commentThreads().list(
    #     part='snippet',
    #     videoId='h2RzmSAZ4Hc',
    #     textFormat='plainText'
    #     ).execute()
    # results
    comment_df = pd.DataFrame(columns=['author','comment','likes'])
    results = service.commentThreads().list(**kwargs).execute()
 
    while results:
        for item in results['items']:
            author = item['snippet']['topLevelComment']['snippet']['authorChannelId']
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            likes = item['snippet']['topLevelComment']['snippet']['likeCount']
            comment_df = comment_df.append(pd.Series([author,comment,likes],index=comment_df.columns), ignore_index = True)
 
        if 'nextPageToken' in results:
            kwargs['pageToken'] = results['nextPageToken']
            results = service.commentThreads().list(**kwargs).execute()
        else:
            break
 
    return comment_df

os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
service = get_authenticated_service()



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\JP\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=497586682045-b0ct39o38sflgjhjt09f63cc43uq26dk.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fyoutube.force-ssl&state=CMSR8kw6OMfodzbAUk7nxpjY32DjIG&prompt=consent&access_type=offline&code_challenge=Q7Og_zXGaKgO2I8bQM_lIcWd65FSQE2_TDgrhG4VSOw&code_challenge_method=S256


Enter the authorization code:  4/qQGCKENHtLlWMM1oJDc4-XLsd0bAcpBa8hEU2f7dMWJYSqppU9g2aDM


In [2]:
#keyword = input('Enter a keyword: ')
#search_videos_by_keyword(service, q=keyword, part='id,snippet', eventType='completed', type='video')

comments = get_video_comments(service, part='snippet', videoId='h2RzmSAZ4Hc', textFormat='plainText')

In [5]:
comments

Unnamed: 0,author,comment,likes
0,{'value': 'UC3H3ZmgOxW7bsLwL7PP9z9Q'},USA lost the war in Afghanistan. USA don't hav...,0
1,{'value': 'UCQzxvdElu15Clp4nfwhbHqQ'},Afghans are in danger because India keeps chan...,0
2,{'value': 'UCy73WU9s9nX4ZKAdJ0CY-XQ'},We should pull out of the Middle East complete...,0
3,{'value': 'UC-O7Vn51XF98cdNfXYd1GoQ'},The primadonald is again talking of “the Harva...,0
4,{'value': 'UC7bZEBSqVyujNQWB0SCL8dA'},"send fredo,,",0


In [22]:
def get_video_details(service, video_id):
    # takes a video_id and returns the video title, channel, and description
    # video_details = get_video_details(service,'h2RzmSAZ4Hc')
    results = service.videos().list(part='snippet',id=video_id).execute()
    return results

video_details = get_video_details(service,'h2RzmSAZ4Hc')

In [42]:
def extract_video_description(video):
    try:
        description = video['items'][0]['snippet']['description']
        return description
    except:
        print('No description found')
    
def extract_video_title(video):
    try:
        description = video['items'][0]['snippet']['title']
        return description
    except:
        print('No title found')
        
extract_video_description(video_details)
extract_video_title(video_details)

"Fareed's take: How the US should leave Afghanistan"

In [None]:

from spacy.tokens import Doc
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sentiment_analyzer = SentimentIntensityAnalyzer()
def polarity_scores(doc):
    return sentiment_analyzer.polarity_scores(doc.text)
 
Doc.set_extension('polarity_scores', getter=polarity_scores)
comment = nlp(comments.loc[18]['comment'])
subject_evaluation = pd.DataFrame(columns=['subjects','sent_sentiment'])
for sent in comment.sents:
    #displacy.render(sent, style="dep", jupyter=True)
    sent_sub = []
    subjects = []
    polarity = []
    has_subject = 0
    for chunk in sent.noun_chunks:
        if chunk.root.dep_ == 'nsubj':
            sent_sub.append(chunk.text)
            sent_sub.append([child for child in chunk.root.children])
            has_subject = 1
            
#     for chunk in sent.noun_chunks:
#         print(chunk.text)
#     for token in sent:
        
#         if token.dep_ == 'amod':
#             sent_sub.append(token.text)
#             has_subject = 1
#         if token.dep_ == 'compound':
#             sent_sub.append(token.text)
#             has_subject = 1
#         if token.dep_ == 'nsubj':
#             sent_sub.append(token.text)
#             has_subject = 1
            
    if has_subject == 0:
        sent_sub.append('None')
        
    subjects.append(sent_sub)
    polarity.append(polarity_scores(sent)['compound'])
    subject_evaluation = subject_evaluation.append(pd.Series([subjects, polarity], index=subject_evaluation.columns), ignore_index = True)

#subject_evaluation = subject_evaluation.append([subjects, polarity], ignore_index = True)
#subject_evaluation = subject_evaluation.append(pd.Series([subjects, polarity], index=subject_evaluation.columns), ignore_index = True)

subject_evaluation
            
comment = nlp(comments.loc[18]['comment'])
for chunk in comment.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text,[child for child in chunk.children])
    
for token in comment:
    print(token.text, token.dep_)


In [12]:
comments.loc[18]['comment']

"How  would you know? you've never been in uniform, have you?"

In [8]:

nlp = nlp(comments.loc[18]['comment'])
neuralcoref.add_to_pipe(nlp)

error: bad escape \p at position 257

In [56]:
request = service.search().list(
    q='cnn',
    part='id,snippet',
    eventType='completed',
    type='video',
    maxResults='3'
    )

In [57]:
request.execute()

{'etag': '"0UM_wBUsFuT6ekiIlwaHvyqc80M/8JECkdoEKHsBXhzh2GiDD2b3ha4"',
 'items': [{'etag': '"0UM_wBUsFuT6ekiIlwaHvyqc80M/3JAr-H8sSO2RWhQl5fEyAsN-vsA"',
   'id': {'kind': 'youtube#video', 'videoId': 'gMCGqsO_bEg'},
   'kind': 'youtube#searchResult',
   'snippet': {'channelId': 'UCef1-8eOpJgud7szVPlZQAQ',
    'channelTitle': 'CNN-News18',
    'description': 'Watch all the current, latest and breaking news only on CNN NEWS18 live TV. The one-stop destination for live news on politics, entertainment, sports, gadgets, ...',
    'liveBroadcastContent': 'none',
    'publishedAt': '2019-08-17T13:21:18.000Z',
    'thumbnails': {'default': {'height': 90,
      'url': 'https://i.ytimg.com/vi/gMCGqsO_bEg/default.jpg',
      'width': 120},
     'high': {'height': 360,
      'url': 'https://i.ytimg.com/vi/gMCGqsO_bEg/hqdefault.jpg',
      'width': 480},
     'medium': {'height': 180,
      'url': 'https://i.ytimg.com/vi/gMCGqsO_bEg/mqdefault.jpg',
      'width': 320}},
    'title': 'CNN-News18 LIVE |

In [129]:
request = service.channels().list(
    forUsername='fox',
    part='id,snippet'
    ).execute()
request
#channel_id = request['items'][0]['id']

{'etag': '"0UM_wBUsFuT6ekiIlwaHvyqc80M/kS6ybUO2AbMbY6PJYQiTBQ57J3I"',
 'items': [{'etag': '"0UM_wBUsFuT6ekiIlwaHvyqc80M/NWJOgvqPJGfhoKmLhgM2TxVzqMk"',
   'id': 'UCUTuhx9d0GDlQ2Gj1BmhpfA',
   'kind': 'youtube#channel',
   'snippet': {'customUrl': 'Fox',
    'description': '',
    'localized': {'description': '', 'title': 'Fox'},
    'publishedAt': '2005-08-05T22:32:24.000Z',
    'thumbnails': {'default': {'height': 88,
      'url': 'https://yt3.ggpht.com/a/AGF-l78et7f0i6LhnpPpBE1sv2_86w68YS6J8QXVoQ=s88-c-k-c0xffffffff-no-rj-mo',
      'width': 88},
     'high': {'height': 800,
      'url': 'https://yt3.ggpht.com/a/AGF-l78et7f0i6LhnpPpBE1sv2_86w68YS6J8QXVoQ=s800-c-k-c0xffffffff-no-rj-mo',
      'width': 800},
     'medium': {'height': 240,
      'url': 'https://yt3.ggpht.com/a/AGF-l78et7f0i6LhnpPpBE1sv2_86w68YS6J8QXVoQ=s240-c-k-c0xffffffff-no-rj-mo',
      'width': 240}},
    'title': 'Fox'}}],
 'kind': 'youtube#channelListResponse',
 'pageInfo': {'resultsPerPage': 5, 'totalResults': 1}}

In [95]:
request = service.channels().list(
    id=channel_id,
    part='id,snippet'
    ).execute()
request

{'etag': '"0UM_wBUsFuT6ekiIlwaHvyqc80M/jtdZ-2M7os2H2NN2kekFXiKw5y8"',
 'items': [{'etag': '"0UM_wBUsFuT6ekiIlwaHvyqc80M/xc82qIk3HuZwLAjind51YguaMQI"',
   'id': 'UCupvZG-5ko_eiXAupbDfxWw',
   'kind': 'youtube#channel',
   'snippet': {'customUrl': 'CNN',
    'description': "CNN operates as a division of Turner Broadcasting System, which is a subsidiary of Warner Media. CNN identifies itself as -- and is widely known to be - the most trusted source for news and information. The CNN umbrella includes nine cable and satellite television networks, two radio networks, the CNN Digital Network, which is the top network of news Web sites in the United States, and CNN Newsource, the world's most extensively syndicated news service. CNN is proud of our ability to bring you up-to-the-minute news from around the world, as a result of our many extensions.",
    'localized': {'description': "CNN operates as a division of Turner Broadcasting System, which is a subsidiary of Warner Media. CNN identifies

In [106]:
request = service.search().list(
    q='Mayor de Blasio',
    channelId=channel_id,
    part='id,snippet',
    eventType='completed',
    type='video',
    maxResults='3'
    )

In [107]:
request.execute()

{'etag': '"0UM_wBUsFuT6ekiIlwaHvyqc80M/thxDYb21jTlPmuKT-C8lfsSfAds"',
 'items': [{'etag': '"0UM_wBUsFuT6ekiIlwaHvyqc80M/-OKdNHJxw7IarRJfPoFF2lEjbXA"',
   'id': {'kind': 'youtube#video', 'videoId': 'IYoSGv5WeW4'},
   'kind': 'youtube#searchResult',
   'snippet': {'channelId': 'UCupvZG-5ko_eiXAupbDfxWw',
    'channelTitle': 'CNN',
    'description': "Governor Cuomo, Mayor de Blasio and New York Police Commissioner O'Neill join families and other guest at 9/11 ceremony.",
    'liveBroadcastContent': 'none',
    'publishedAt': '2018-09-11T16:36:40.000Z',
    'thumbnails': {'default': {'height': 90,
      'url': 'https://i.ytimg.com/vi/IYoSGv5WeW4/default.jpg',
      'width': 120},
     'high': {'height': 360,
      'url': 'https://i.ytimg.com/vi/IYoSGv5WeW4/hqdefault.jpg',
      'width': 480},
     'medium': {'height': 180,
      'url': 'https://i.ytimg.com/vi/IYoSGv5WeW4/mqdefault.jpg',
      'width': 320}},
    'title': '9/11 memorial ceremony in New York - September 11th, 2018'}}],
 'ki

In [126]:
def get_video_comments(service, **kwargs):
    # results = service.commentThreads().list(
    #     part='snippet',
    #     videoId='h2RzmSAZ4Hc',
    #     textFormat='plainText'
    #     ).execute()
    # results
    comment_df = pd.DataFrame(columns=['author','comment','likes'])
    results = service.commentThreads().list(**kwargs).execute()
 
    while results:
        for item in results['items']:
            author = item['snippet']['topLevelComment']['snippet']['authorChannelId']
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            likes = item['snippet']['topLevelComment']['snippet']['likeCount']
            comment_df = comment_df.append(pd.Series([author,comment,likes],index=comment_df.columns), ignore_index = True)
 
        if 'nextPageToken' in results:
            kwargs['pageToken'] = results['nextPageToken']
            results = service.commentThreads().list(**kwargs).execute()
        else:
            break
 
    return comment_df



comments = get_video_comments(service, part='snippet', videoId='h2RzmSAZ4Hc', textFormat='plainText')

In [127]:
comments

Unnamed: 0,author,comment,likes
0,{'value': 'UCsKDUhOUUB4QEP31JMwkrkw'},RAPIST!!! Don Lemon needs to be FIRED!!,0
1,{'value': 'UCoetlyPkUiA7nLC_IsLVQ_w'},Cnn you should be ashamed of yourselfs,1
2,{'value': 'UCpfBxOREvwwX65EqYXRrLTQ'},I don't know who this dude is but I like him b...,0
3,{'value': 'UCoetlyPkUiA7nLC_IsLVQ_w'},What is this guy saying? Me not understanding.,1
4,{'value': 'UCh0wrWmkVP0XFOmldSEX77w'},I bet Fareed smells really nice!,0
5,{'value': 'UCM3zWu9-A-US03_1S0P-l7g'},Fire Fredo and Stinkfingers now!,0
6,{'value': 'UCYWV3wrMGZ8UooQglb4_d0Q'},HEY CNN TAKE HOMO CUOMO FREDO AND SHOVE HIS FI...,1
7,{'value': 'UCsTCnykuLx_FNrxxacxHNMw'},Let's just leave Afghanistan. Cold turkey. Let...,0
8,{'value': 'UCfaiEeOIX5IHIzaxW6oYGmw'},Are you guys going to blame the President for ...,2
9,{'value': 'UCxKmmYtXiycWG3v07_vZAgw'},Afghanistan is the graveyard of empires. I hav...,0


In [20]:
results = service.commentThreads().list(
    part='snippet',
    videoId='h2RzmSAZ4Hc',
    textFormat='plainText'
    ).execute()
results

{'etag': '"8jEFfXBrqiSrcF6Ee7MQuz8XuAM/Ch7sE3h6MU5KFBBhqwhFGzXuc-c"',
 'items': [{'etag': '"8jEFfXBrqiSrcF6Ee7MQuz8XuAM/NTYOXKHg5xAvPbJ7jfJtSf1VMbo"',
   'id': 'Ugxehoo8XuwLo-HhrX14AaABAg',
   'kind': 'youtube#commentThread',
   'snippet': {'canReply': True,
    'isPublic': True,
    'topLevelComment': {'etag': '"8jEFfXBrqiSrcF6Ee7MQuz8XuAM/WHs1LhsohpxUOSzJkKOLfjFJaI4"',
     'id': 'Ugxehoo8XuwLo-HhrX14AaABAg',
     'kind': 'youtube#comment',
     'snippet': {'authorChannelId': {'value': 'UC3H3ZmgOxW7bsLwL7PP9z9Q'},
      'authorChannelUrl': 'http://www.youtube.com/channel/UC3H3ZmgOxW7bsLwL7PP9z9Q',
      'authorDisplayName': 'Kobulione',
      'authorProfileImageUrl': 'https://yt3.ggpht.com/-2VTLASKKB_s/AAAAAAAAAAI/AAAAAAAAAAA/8FY3KrUrW_k/s28-c-k-no-mo-rj-c0xffffff/photo.jpg',
      'canRate': True,
      'likeCount': 0,
      'publishedAt': '2019-08-22T11:48:30.000Z',
      'textDisplay': "USA lost the war in Afghanistan. USA don't have ⚽ s to fight terrorism inside the pakistan.",
 

In [121]:
comment_df = pd.DataFrame(columns=['author','comment','likes'])

c = 'comment'
like = 'likes'
autr = 'authorsss'
comment_df.append(pd.Series([c,like,autr],index=comment_df.columns), ignore_index = True)


Unnamed: 0,author,comment,likes
0,comment,likes,authorsss


In [133]:
def search_channel(service, search_term):
    request = service.search().list(
        q=search_term,
        part='id,snippet',
        type='channel',
        maxResults='20'
    ).execute()
    
    channel_df = pd.DataFrame(columns=['channel_id','channel'])
    for results in request:
        chan = request['items'][0]['channelTitle']
        c_id = request['items'][0]['channelId']
        channel_df = channel_df.append(pd.Series([chan, c_id],index=channel_df.columns), ignore_index = True)
    
    return channel_df
        


def find_channel_id(service, search_term):
    request = service.channels().list(
    forUsername=search_term,
    part='id,snippet'
    ).execute()
    channel_id = request['items'][0]['id']
    return channel_id


comment_df = pd.DataFrame(columns=['author','comment','likes'])
    results = service.commentThreads().list(**kwargs).execute()
 
    while results:
        for item in results['items']:
            author = item['snippet']['topLevelComment']['snippet']['authorChannelId']
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            likes = item['snippet']['topLevelComment']['snippet']['likeCount']
            comment_df = comment_df.append(pd.Series([author,comment,likes],index=comment_df.columns), ignore_index = True)

request = service.search().list(
    q='cnn',
    part='id,snippet',
    type='channel',
    maxResults='20'
    )
request.execute()

{'etag': '"0UM_wBUsFuT6ekiIlwaHvyqc80M/2S6eo54pG9JE-sVJqQDFyabAO5E"',
 'items': [{'etag': '"0UM_wBUsFuT6ekiIlwaHvyqc80M/f10hjuC3bLJD8gUed_U6_N0H5L0"',
   'id': {'channelId': 'UCupvZG-5ko_eiXAupbDfxWw', 'kind': 'youtube#channel'},
   'kind': 'youtube#searchResult',
   'snippet': {'channelId': 'UCupvZG-5ko_eiXAupbDfxWw',
    'channelTitle': 'CNN',
    'description': 'CNN operates as a division of Turner Broadcasting System, which is a subsidiary of Warner Media. CNN identifies itself as -- and is widely known to be - the ...',
    'liveBroadcastContent': 'none',
    'publishedAt': '2005-10-02T16:06:36.000Z',
    'thumbnails': {'default': {'url': 'https://yt3.ggpht.com/-K12xTWC-rMI/AAAAAAAAAAI/AAAAAAAAAAA/2N_u5pcKB3w/s88-c-k-no-mo-rj-c0xffffff/photo.jpg'},
     'high': {'url': 'https://yt3.ggpht.com/-K12xTWC-rMI/AAAAAAAAAAI/AAAAAAAAAAA/2N_u5pcKB3w/s800-c-k-no-mo-rj-c0xffffff/photo.jpg'},
     'medium': {'url': 'https://yt3.ggpht.com/-K12xTWC-rMI/AAAAAAAAAAI/AAAAAAAAAAA/2N_u5pcKB3w/s240-c-

In [134]:
comments.tail()

Unnamed: 0,author,comment,likes
370,{'value': 'UC4hxNaN_FqjXslqnjvjZxoA'},Early boi,0
371,{'value': 'UCQD0yGciGUW_DUJGO6VFtBw'},I thought Trump said he would destroy ISIS in ...,9
372,{'value': 'UCKi71kdGB-9Wl5_EEDTQCsg'},one word LEAVE!!! problem solved!!!,6
373,{'value': 'UCZ880zph14jDpY_d82NmT3g'},Trump 2020 🇺🇸🇺🇸🇺🇸🇺🇸🇺🇸🇺🇸,8
374,{'value': 'UCLe9VXOeo18Yw9EsmlIwAyw'},Tariffs on China are justified.,7
