In [12]:
from pytube import YouTube
import pandas as pd
# numpy uses its conventional alias; binding it to `pd` would shadow pandas.
import numpy as np
from xml.etree import ElementTree as ET
import bleach
import re

import gensim
from gensim import corpora, models, similarities, matutils

# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer

# logging for gensim (set to INFO)
#import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

import pyLDAvis
from pyLDAvis import gensim as gensimvis
import spacy


from tqdm import tqdm
from pprint import pprint


In [13]:
def get_transcript(url):
    """Return the English caption text of a YouTube video as a single line.

    Args:
        url: Video URL passed straight to ``YouTube``.

    Returns:
        str: The text of every child element of the caption XML root, each
        prefixed with one space (so the result starts with a space), with
        newlines replaced by spaces.

    Raises:
        ValueError: If the lookup for the ``'en'`` caption track returns None.
    """
    yt = YouTube(url)
    caption = yt.captions.get_by_language_code('en')
    if caption is None:
        # Fail with an actionable message instead of an AttributeError on None.
        raise ValueError(f"No English ('en') caption track found for {url}")

    root = ET.fromstring(caption.xml_captions)
    # Element.text is None for empty elements; treat those as empty strings
    # rather than crashing on str + None. A single join avoids rebuilding the
    # string on every iteration.
    doc = "".join(" " + (child.text or "") for child in root)
    return doc.replace('\n', ' ')

In [14]:
def make_corpus(url_list):
    """Build a list with one cleaned transcript string per URL.

    Each transcript from ``get_transcript`` is passed through ``bleach.clean``
    with empty tag/attribute/style allow-lists and ``strip=True``, then every
    ``&#39;`` entity and every ``[Music]`` marker is removed.

    Args:
        url_list: Iterable of video URLs.

    Returns:
        list[str]: Cleaned transcripts, in the same order as ``url_list``.
    """
    def _scrub(raw_text):
        # Same three cleaning steps, applied in the same order.
        stripped = bleach.clean(raw_text, tags=[], attributes={}, styles=[], strip=True)
        no_apostrophe_entity = re.sub(r'&#39;', '', stripped)
        return re.sub(r'\[Music]', '', no_apostrophe_entity)

    return [_scrub(get_transcript(url)) for url in url_list]
    

In [15]:
# Sample videos: 3Blue1Brown, Gameranx review of Metro Exodus, VICE News on Yemen
url_list = (
    'https://www.youtube.com/watch?v=jsYwFizhncE',
    'https://www.youtube.com/watch?v=fdaVySF_-FQ&feature=youtu.be',
    'https://www.youtube.com/watch?v=RWOPlynTcmk',
)

In [16]:
# Download and clean one transcript per URL (network access required).
data = make_corpus(url_list)

In [17]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is converted with ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; one result is
    yielded per input item, in order.
    """
    tokenized = (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )
    yield from tokenized
        

# Materialize the generator: one token list per transcript in `data`.
clean_sents = [tokens for tokens in sent_to_words(data)]

In [18]:
# Train the phrase detectors. The trigram detector is trained on the
# bigram-transformed token lists, so it can extend an already-joined pair.
bigram = gensim.models.Phrases(clean_sents, min_count=20)
trigram = gensim.models.Phrases(bigram[clean_sents], min_count=10)

# Phraser wrappers are what gets applied to the token lists afterwards.
bigram_model = gensim.models.phrases.Phraser(bigram)
trigram_model = gensim.models.phrases.Phraser(trigram)

In [27]:
# Apply bigram then trigram merging to every token list.
# NOTE: this overwrites clean_sents, so re-running the cell re-applies the
# models to already-transformed tokens.
clean_sents = [trigram_model[bigram_model[tokens]] for tokens in clean_sents]

In [28]:
# Load the spaCy 'en' pipeline with the parser and NER components disabled.
nlp = spacy.load(
    'en',
    disable=['parser', 'ner'],
)

# Candidate values for allowed_postags: NOUN, ADJ, VERB, ADV
def lemmatization(texts, allowed_postags=['NOUN']):
    """Reduce each token list to the lemmas of its allowed, non-stop tokens.

    Each token list is joined with spaces and run through the module-level
    ``nlp`` pipeline. A token's ``lemma_`` is kept when its ``pos_`` is in
    ``allowed_postags`` and it is not flagged ``is_stop``.

    Args:
        texts: Iterable of token lists.
        allowed_postags: Part-of-speech tags to keep (the default list is
            only read here, never mutated).

    Returns:
        list[list[str]]: Lemma lists in input order. Documents left with no
        lemmas are dropped, so the result may be shorter than ``texts``.
    """
    kept_docs = []
    for tokens in texts:
        parsed = nlp(" ".join(tokens))

        lemmas = []
        for token in parsed:
            if token.pos_ not in allowed_postags or token.is_stop:
                continue
            lemmas.append(token.lemma_)

        if lemmas:
            kept_docs.append(lemmas)
    return kept_docs

In [29]:
# Keep only lemmas with the default allowed_postags (['NOUN']);
# this overwrites clean_sents.
clean_sents = lemmatization(
    clean_sents
)

In [30]:
# Preview at most the first five lemmatized documents, one per paragraph.
for t in clean_sents[:5]:
    print(f'{" ".join(t)} \n')

video puzzle setup block world friction collision meaning energy block wall block momentum wall block mass power time mass second example time fact number collision mass wall digit pi example collision time mass collision burst burst time video lot people solution attempt simulation description favorite place manner lesson phase space configuration space problem algorithm pi tactic field block velocity collision key conservation energy conservation momentum mass velocity variable process moment energy block value expression momentum block block block wall reality block momentum wall collision thinking wall mass momentum transfer wall equation unknown try drawing picture equation energy equation equation coordinate plane coordinate coordinate point plane encode pair velocity block case energy equation ellipse point ellipse pair velocity point correspond energy fact coordinate circle hunt pi coordinate represent example figure direction sqrt coordinate represent sqrt way conservation ene

LDA Part

In [31]:
# Token <-> integer id mapping built from the lemmatized documents.
id2word = corpora.Dictionary(clean_sents)

# Bag-of-words representation of each document.
corpus = [id2word.doc2bow(tokens) for tokens in clean_sents]

In [32]:
# Fit the LDA topic model on the bag-of-words corpus.
lda_model = gensim.models.ldamulticore.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=3,       # presumably one topic per input video -- confirm
    random_state=100,   # fixed seed
    chunksize=100,
    passes=128,
    per_word_topics=True,
)

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

In [33]:
# Pretty-print what print_topics() returns for the fitted model.
pprint(
    lda_model.print_topics()
)

[(0,
  '0.002*"weapon" + 0.002*"fun" + 0.002*"forest" + 0.002*"job" + '
  '0.002*"supply" + 0.002*"bug" + 0.002*"good" + 0.002*"control" + '
  '0.002*"camp" + 0.002*"year"'),
 (1,
  '0.032*"game" + 0.024*"block" + 0.020*"time" + 0.017*"circle" + '
  '0.016*"collision" + 0.016*"point" + 0.014*"pi" + 0.014*"value" + '
  '0.013*"angle" + 0.013*"momentum"'),
 (2,
  '0.016*"coalition" + 0.013*"people" + 0.012*"child" + 0.010*"houthis" + '
  '0.010*"road" + 0.010*"fighter" + 0.009*"year" + 0.009*"day" + 0.009*"man" + '
  '0.009*"datum"')]


In [36]:
# Topic distribution of the first document, ordered by the second tuple
# element (largest first).
top_topics = sorted(
    lda_model.get_document_topics(corpus[0]),
    key=lambda topic_prob: topic_prob[1],
    reverse=True,
)

print(top_topics)

[(1, 0.99869645)]
