In [1]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [2]:
import numpy as np
import pandas as pd

In [3]:
from googleapiclient.discovery import build

api_key = "" # Removed

resource = build('youtube', 'v3', developerKey=api_key)

In [4]:
from urllib.parse import urlparse

In [5]:
import re

In [6]:
def get_yt_video_id(url):
    parts = urlparse(url)
    match = re.search('v=',parts.query)
    if match is None:
        return(url[-11:])
    return(parts.query[match.end():match.end()+11])
    

In [7]:
# Comedy video: https://www.youtube.com/watch?v=dtWrAU3qJjM&list=LL&index=6
# Small Tutorial video: youtube.com/watch?v=UV1ZF6pyhtM&list=WL&index=2
# Tutorial video 2: https://youtu.be/7Tlk3Gql-Wg
# Music Video: https://youtu.be/BddP6PYo2gs
video_id= get_yt_video_id('youtube.com/watch?v=D2V1okCEsiE')

In [8]:
def get_comments_faster(resource, video_id, comments=[], token='',count=0):
    
    video_response=resource .commentThreads().list(part='snippet',
                                               videoId=video_id,
                                               pageToken=token,maxResults= 100).execute()
    for item in video_response['items']:
        comment = item['snippet']['topLevelComment']
        text = comment['snippet']['textDisplay']
        comments.append(text)
    if "nextPageToken" in video_response and count<100:
        count = count+1
        return get_comments_faster(resource, video_id, comments, video_response['nextPageToken'],count)
    else:
        return comments

In [9]:
def get_comments(resource, video_id, comments=[], token=''):
    video_response=resource .commentThreads().list(part='snippet',
                                               videoId=video_id,
                                               pageToken=token).execute()
    for item in video_response['items']:
        comment = item['snippet']['topLevelComment']
        text = comment['snippet']['textDisplay']
        comments.append(text)
    if "nextPageToken" in video_response:
        return get_comments(resource, video_id, comments, video_response['nextPageToken'])
    else:
        return comments

In [10]:
video_id

'D2V1okCEsiE'

In [11]:
# comment_threads = get_comments(resource,video_id,comments=[])
comment_threads = get_comments_faster(resource,video_id,comments=[],count=0)
print(len(comment_threads))

146


In [12]:
pos_reviews = 0
neg_reviews = 0
neutral_reviews = 0
for comment_text in comment_threads:
    if(sid.polarity_scores(comment_text)['compound']<= -0.05):
        neg_reviews = neg_reviews+1
    elif(sid.polarity_scores(comment_text)['compound'] >= 0.05):
        pos_reviews = pos_reviews+1
    else:
        neutral_reviews = neutral_reviews+1
print(f"Positive Reviews: {pos_reviews}\nNegative Reviews: {neg_reviews}\nNeutral Reviews: {neutral_reviews}")
    

Positive Reviews: 119
Negative Reviews: 8
Neutral Reviews: 19


### Topic Modeling using LDA

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
cv = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')

In [30]:
dtm = cv.fit_transform(comment_threads)

In [31]:
dtm

<146x148 sparse matrix of type '<class 'numpy.int64'>'
	with 723 stored elements in Compressed Sparse Row format>

In [32]:
from sklearn.decomposition import LatentDirichletAllocation

In [33]:
LDA = LatentDirichletAllocation(n_components=5,random_state=42)

In [34]:
LDA.fit(dtm)

LatentDirichletAllocation(n_components=5, random_state=42)

In [35]:
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names_out()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['providing', 'wow', 'tfidf', 'making', 'explained', 'simple', 'amazing', 'helpful', 'clear', 'sir', 'great', 'video', 'thanks', 'explanation', 'thank']


THE TOP 15 WORDS FOR TOPIC #1
['total', 'said', 'number', 'right', 'sentence', 'tf', 'word', 'words', 'br', 'use', 'idf', 'boy', 'girl', 'good', 'quot']


THE TOP 15 WORDS FOR TOPIC #2
['just', 'doing', 'videos', 'data', 'quot', 'implementation', 'say', 'exam', 'looking', 'forward', 'tfidf', 'hey', 'watching', 'great', 'krish']


THE TOP 15 WORDS FOR TOPIC #3
['work', 'make', 'language', 'krish', 'idf', 'tf', 'video', 'words', 'br', 'thanks', 'videos', 'really', 'sir', 'good', '39']


THE TOP 15 WORDS FOR TOPIC #4
['hello', 'values', 'okay', '39', 'thanks', 'thank', 'love', 'code', 'just', 'sir', 'understand', 'videos', 'br', 'idf', 'tf']




### Topic Modeling using Non-Negative Matrix Factorization

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
tfidf = TfidfVectorizer(max_df=0.8, min_df=2, stop_words='english')

In [23]:
dtm_nnm = tfidf.fit_transform(comment_threads)

In [24]:
dtm_nnm

<146x148 sparse matrix of type '<class 'numpy.float64'>'
	with 723 stored elements in Compressed Sparse Row format>

In [25]:
from sklearn.decomposition import NMF

In [26]:
nmf_model = NMF(n_components=5,random_state=42)

In [27]:
nmf_model.fit(dtm)



NMF(n_components=5, random_state=42)

In [28]:
for index,topic in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['bag', 'tf', 'video', 'explain', 'got', 'used', 'words', 'word', 'meaning', 'said', 'boy', 'br', 'girl', 'good', 'quot']


THE TOP 15 WORDS FOR TOPIC #1
['calculate', 'value', 'tf', 'left', 'sentence', 'explanation', 'just', 'video', 'meaning', 'word', 'words', 'girl', 'boy', 'good', '39']


THE TOP 15 WORDS FOR TOPIC #2
['watching', 'learning', 'machine', 'really', 'make', 'work', 'understand', 'knowledge', 'thank', 'concepts', 'krish', 'great', 'video', 'thanks', 'videos']


THE TOP 15 WORDS FOR TOPIC #3
['tutorial', 'correct', 'based', 'values', 'use', 'hi', 'krish', 'text', 'understand', 'processing', 'natural', 'language', 'br', 'tf', 'idf']


THE TOP 15 WORDS FOR TOPIC #4
['10', 'clear', 'really', 'tfidf', 'work', 'learning', 'explanation', 'learn', 'want', 'thank', 'make', 'lot', 'base', 'pls', 'sir']




