In [52]:
from pytube import YouTube
import pandas as pd
import numpy as np  # previously aliased to `pd`, which silently replaced the pandas alias
from xml.etree import ElementTree as ET
import bleach
import re


from gensim import corpora, models, similarities, matutils

# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer

# Route INFO-level log records (gensim reports training progress at that
# level) to the notebook output, prefixed with timestamp and severity.
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)


In [15]:
def get_transcript(url):
    """Fetch the English captions of a YouTube video as a single line of text.

    Parameters
    ----------
    url : str
        Address of the video; handed unchanged to ``YouTube``.

    Returns
    -------
    str
        The text of every caption element, each preceded by one space,
        with newlines replaced by spaces. No entity or markup cleaning
        is performed here.

    Raises
    ------
    ValueError
        If the video exposes no caption track for language code ``'en'``.
    """
    yt = YouTube(url)
    caption = yt.captions.get_by_language_code('en')
    if caption is None:
        # Without this guard the failure was an opaque AttributeError on None.
        raise ValueError('no English captions available for %s' % url)

    root = ET.fromstring(caption.xml_captions)
    # Empty elements have ``text is None``; concatenating that to a str
    # raised TypeError, so such elements are skipped.
    pieces = [child.text for child in root if child.text is not None]
    # One join instead of repeated ``doc = doc + ...`` concatenation.
    doc = ''.join(' ' + piece for piece in pieces)
    return doc.replace('\n', ' ')

In [93]:
def make_corpus(url_list):
    """Build a list of cleaned transcripts, one per video URL.

    Parameters
    ----------
    url_list : iterable of str
        Video URLs; each is passed to ``get_transcript``.

    Returns
    -------
    list of str
        For every URL, in order, its transcript with all HTML tags
        stripped, every ``&#39;`` entity removed and every ``[Music]``
        marker removed. An empty ``url_list`` yields an empty list.
    """
    corpus = []
    for url in url_list:
        # ``styles=[]`` is no longer passed: bleach 5.0 removed that keyword
        # (passing it raises TypeError), and earlier releases already
        # defaulted to allowing no styles, so the result is unchanged.
        text = bleach.clean(get_transcript(url), tags=[], attributes={}, strip=True)
        # Drop apostrophe entities outright so that e.g. "it&#39;s" becomes "its".
        text = text.replace('&#39;', '')
        # Drop the caption marker for background music.
        text = text.replace('[Music]', '')
        corpus.append(text)
    return corpus
    

In [25]:
# Source videos, in order: 3Blue1Brown, Gameranx's review of Metro Exodus,
# and VICE News on Yemen.
url_list = (
    'https://www.youtube.com/watch?v=jsYwFizhncE',
    'https://www.youtube.com/watch?v=fdaVySF_-FQ&feature=youtu.be',
    'https://www.youtube.com/watch?v=RWOPlynTcmk',
)

In [94]:
# Download and clean one transcript per URL (performs network requests).
data = make_corpus(url_list)

In [98]:
# Bag-of-words vectorizer: unigrams and bigrams, English stop words removed,
# tokens restricted to runs of two or more lowercase ASCII letters.
count_vectorizer = CountVectorizer(
    token_pattern="\\b[a-z][a-z]+\\b",
    stop_words='english',
    ngram_range=(1, 2),
)

# Learn the vocabulary from the transcripts; fit() returns the vectorizer,
# so its configuration is displayed as the cell output.
count_vectorizer.fit(data)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
        tokenizer=None, vocabulary=None)

In [99]:
# Count the vocabulary terms in every transcript, then flip the matrix so
# that rows are terms and columns are documents.
doc_word = count_vectorizer.transform(data).T

In [100]:
# Wrap the sparse count matrix as a gensim corpus; each column of the
# matrix is treated as one document (Sparse2Corpus's default, spelled out).
corpus = matutils.Sparse2Corpus(doc_word, documents_columns=True)

In [101]:
# Sanity check: (number of terms, number of documents) after the transpose.
doc_word.shape

(3967, 3)

In [102]:
# Invert the vectorizer's term -> column-index vocabulary into the
# index -> term mapping that is passed to the LDA model as id2word.
id2word = {index: term for term, index in count_vectorizer.vocabulary_.items()}

In [103]:
# Sanity check: one entry per vocabulary term, so this should equal the
# number of rows of doc_word.
len(id2word)

3967

In [114]:
# Fit a 4-topic LDA model with 5 passes over the corpus.
# LDA starts from a random initialisation; the seed is pinned so that the
# topics (and the document vectors derived from them) are identical on
# every Restart & Run All.
lda = models.LdaModel(corpus=corpus, num_topics=4, id2word=id2word, passes=5,
                      random_state=42)

2019-02-26 16:41:40,104 : INFO : using symmetric alpha at 0.25
2019-02-26 16:41:40,106 : INFO : using symmetric eta at 0.25
2019-02-26 16:41:40,109 : INFO : using serial LDA version on this node
2019-02-26 16:41:40,113 : INFO : running online (multi-pass) LDA training, 4 topics, 5 passes over the supplied corpus of 3 documents, updating model once every 3 documents, evaluating perplexity every 3 documents, iterating 50x with a convergence threshold of 0.001000
2019-02-26 16:41:40,210 : INFO : -10.354 per-word bound, 1309.2 perplexity estimate based on a held-out corpus of 3 documents with 5525 words
2019-02-26 16:41:40,211 : INFO : PROGRESS: pass 0, at document #3/3
2019-02-26 16:41:40,231 : INFO : topic #0 (0.250): 0.004*"like" + 0.004*"just" + 0.003*"circle" + 0.003*"game" + 0.003*"mass" + 0.002*"know" + 0.002*"little" + 0.002*"think" + 0.002*"theta" + 0.002*"angle"
2019-02-26 16:41:40,234 : INFO : topic #1 (0.250): 0.004*"just" + 0.004*"like" + 0.004*"game" + 0.002*"really" + 0.002*

In [115]:
# Show each topic as a weighted combination of its highest-weighted terms.
lda.print_topics()

2019-02-26 16:41:40,798 : INFO : topic #0 (0.250): 0.006*"theta" + 0.006*"circle" + 0.005*"mass" + 0.005*"pi" + 0.005*"block" + 0.004*"angle" + 0.004*"momentum" + 0.004*"value" + 0.004*"energy" + 0.004*"blocks"
2019-02-26 16:41:40,800 : INFO : topic #1 (0.250): 0.009*"game" + 0.007*"like" + 0.005*"really" + 0.005*"metro" + 0.004*"just" + 0.004*"know" + 0.004*"games" + 0.003*"people" + 0.003*"time" + 0.003*"stuff"
2019-02-26 16:41:40,802 : INFO : topic #2 (0.250): 0.001*"like" + 0.001*"just" + 0.001*"theta" + 0.001*"think" + 0.001*"circle" + 0.001*"pi" + 0.001*"little" + 0.001*"people" + 0.000*"value" + 0.000*"energy"
2019-02-26 16:41:40,805 : INFO : topic #3 (0.250): 0.007*"just" + 0.005*"think" + 0.003*"coalition" + 0.003*"like" + 0.003*"people" + 0.003*"im" + 0.003*"yeah" + 0.003*"fighting" + 0.003*"data" + 0.002*"know"


[(0,
  '0.006*"theta" + 0.006*"circle" + 0.005*"mass" + 0.005*"pi" + 0.005*"block" + 0.004*"angle" + 0.004*"momentum" + 0.004*"value" + 0.004*"energy" + 0.004*"blocks"'),
 (1,
  '0.009*"game" + 0.007*"like" + 0.005*"really" + 0.005*"metro" + 0.004*"just" + 0.004*"know" + 0.004*"games" + 0.003*"people" + 0.003*"time" + 0.003*"stuff"'),
 (2,
  '0.001*"like" + 0.001*"just" + 0.001*"theta" + 0.001*"think" + 0.001*"circle" + 0.001*"pi" + 0.001*"little" + 0.001*"people" + 0.000*"value" + 0.000*"energy"'),
 (3,
  '0.007*"just" + 0.005*"think" + 0.003*"coalition" + 0.003*"like" + 0.003*"people" + 0.003*"im" + 0.003*"yeah" + 0.003*"fighting" + 0.003*"data" + 0.002*"know"')]

In [116]:
# Map the documents from word space into topic space (the analogue of
# "transform" in sklearn). Indexing the model with a corpus returns a
# TransformedCorpus wrapper rather than a list of vectors.
lda_corpus = lda[corpus]
lda_corpus

<gensim.interfaces.TransformedCorpus at 0x1a1e822940>

In [117]:
# Materialise the documents' topic vectors into a list so we can take a peek
lda_docs = list(lda_corpus)

In [120]:
# Check out the document vectors in the topic space for the first 4 documents
# (the slice end is exclusive; slicing past the end of the list is harmless)
lda_docs[0:4]

[[(0, 0.9996092)], [(1, 0.9995075)], [(3, 0.9996346)]]

In [123]:
# Topic vector of the first document, as a list of (topic id, weight) pairs.
lda_docs[0]

[(0, 0.9996092)]