In [67]:
from pytube import YouTube
from pytube import Playlist
import pandas as pd
import numpy as np

from xml.etree import ElementTree as ET
import bleach
import re

import gensim
from gensim import corpora, models, similarities, matutils
from gensim.summarization import keywords
from gensim.summarization.summarizer import summarize
from gensim.parsing.preprocessing import remove_stopwords

import logging
# Root logger (getLogger() called with no name): only CRITICAL records are emitted from here on,
# presumably to keep library log output out of the notebook.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

In [68]:
from numpy import dot
from numpy.linalg import norm

def cos_sim(a,b):
    """Cosine similarity of two vectors.

    Computed as the dot product of ``a`` and ``b`` divided by the product of
    their norms (``numpy.linalg.norm``).
    """
    inner_product = dot(a, b)
    magnitude_product = norm(a) * norm(b)
    return inner_product / magnitude_product

Helper Functions

In [69]:
def get_transcript(url):
    """Fetch the English captions of a YouTube video as one plain-text string.

    Parameters
    ----------
    url : str
        Address of the video; passed unchanged to ``YouTube``.

    Returns
    -------
    str
        The text of every direct child element of the caption XML, each
        preceded by a single space, with newlines replaced by spaces.
        An empty string is returned (after printing a short message) when
        ``YouTube`` raises ``ValueError`` for the URL or when no English
        caption track can be read.
    """
    try:
        yt = YouTube(url)
    except ValueError:
        print('cannot find video')
        # Stop here: without a video object there is nothing to read captions
        # from, and continuing would fail on the unbound name `yt`.
        return ''

    caption = yt.captions.get_by_language_code('en')
    try:
        xml = caption.xml_captions
    except AttributeError:
        # `caption` has no `xml_captions` (e.g. the lookup returned None).
        print('no captions or transcripts')
        return ''

    root = ET.fromstring(xml)
    # Elements without text (child.text is None) contribute nothing.
    pieces = [' ' + child.text for child in root if child.text is not None]
    return ''.join(pieces).replace('\n', ' ')

In [70]:
def make_corpus(url_list):
    """Build a list of cleaned transcripts, one per URL in ``url_list``.

    Each transcript comes from ``get_transcript`` and is passed through
    ``bleach.clean`` with no allowed tags, attributes or styles (stripping
    markup). The literal ``&#39;`` entity and the ``[inaudible]`` / ``[Music]``
    caption markers are then deleted. Order follows ``url_list``.
    """
    # Patterns are removed in this order, each replaced by the empty string.
    noise_patterns = (r'&#39;', r'\[inaudible]', r'\[Music]')

    def _scrub(url):
        text = bleach.clean(get_transcript(url), tags=[], attributes={}, styles=[], strip=True)
        for pattern in noise_patterns:
            text = re.sub(pattern, '', text)
        return text

    return [_scrub(url) for url in url_list]

In [71]:
def oov(keys):
    """Sum the embeddings of ``keys``, imputing words the model does not know.

    Keywords found in ``model.vocab`` are looked up with ``model.word_vec`` and
    summed. Each keyword missing from the vocabulary is counted as the mean of
    the known keyword vectors, so the result is
    ``known_sum + n_missing * (known_sum / n_known)``.

    Parameters
    ----------
    keys : list of str
        Keywords to embed.

    Returns
    -------
    numpy.ndarray
        The combined vector. When none of the keywords is in the vocabulary
        (including an empty ``keys``) a zero vector of length
        ``model.vector_size`` is returned instead of dividing by zero.
    """
    known = [key for key in keys if key in model.vocab]
    if not known:
        # Nothing to average over: contribute a neutral vector.
        return np.zeros(model.vector_size)

    # Computed once and reused for both the sum and the imputed mean.
    known_sum = sum(model.word_vec(key) for key in known)
    n_missing = len(keys) - len(known)
    return known_sum + n_missing * (known_sum / len(known))

In [109]:
def get_topic_space(url_list):
    """Return the average keyword vector over the videos in ``url_list``.

    For each transcript from ``make_corpus`` the top five noun keywords are
    extracted with ``keywords`` and their ``model.word_vec`` embeddings are
    summed. If a lookup raises ``KeyError`` the document's vector comes from
    ``oov`` instead. The per-document vectors are summed and divided by the
    number of documents.
    """
    transcripts = make_corpus(url_list)
    noun_tags = ('NN', 'NNS', 'NNPS', 'NNP')

    doc_vectors = []
    for transcript in transcripts:
        top_words = keywords(
            transcript,
            words=5,
            pos_filter=noun_tags,
            lemmatize=True,
            split=True,
        )
        try:
            doc_vector = sum(model.word_vec(word) for word in top_words)
        except KeyError:
            # At least one keyword is missing from the embedding vocabulary.
            doc_vector = oov(top_words)
        doc_vectors.append(doc_vector)

    return sum(doc_vectors) / len(transcripts)
                     

In [115]:
def topic_analyze(url):
    """Compare one video's topic vector with the five reference topic vectors.

    The transcript is fetched with ``make_corpus`` and the topic vector with
    ``get_topic_space`` (which fetches the transcript again). Prints the top
    five noun keywords of the stop-word-free transcript and a ``summarize``
    summary (ratio 0.01) of the transcript.

    Returns
    -------
    dict
        Topic label -> ``cos_sim`` between the video's vector and that
        topic's module-level reference vector.
    """
    corpus = make_corpus([url])
    topic_vector = get_topic_space([url])

    analysis = {
        'Border Wall': cos_sim(topic_vector, wall_vector),
        'Stormy Dan': cos_sim(topic_vector, stormy_vector),
        'Meuller': cos_sim(topic_vector, mueller_vector),
        'NBA': cos_sim(topic_vector, NBA_vector),
        'Pokemon': cos_sim(topic_vector, poke_vector),
    }

    transcript = corpus[0]
    top_words = keywords(
        remove_stopwords(transcript),
        words=5,
        pos_filter=('NN', 'NNS', 'NNPS', 'NNP'),
        lemmatize=True,
        split=True,
    )

    print(top_words)
    print(summarize(transcript, ratio=.01))
    return analysis

In [8]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors

# Convert the GloVe text vectors to word2vec format, then load them as `model`,
# the embedding lookup used by the helper functions and topic vectors in this notebook.
# NOTE(review): this is a hardcoded absolute path on one machine; datapath() is imported from
# gensim.test.utils and presumably returns an absolute path unchanged — confirm, or pass the path directly.
glove_file = datapath('/Users/andrewportal/Downloads/glove/glove.6B.100d.txt')
# Temporary file that receives the converted vectors.
tmp_file = get_tmpfile("glove_word2vec.txt")

# call glove2word2vec script
# default way (through CLI): python -m gensim.scripts.glove2word2vec --input <glove_file> --output <w2v_file>
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_file, tmp_file)

# Load the converted file; every later model.word_vec / model.vocab call reads from this object.
model = KeyedVectors.load_word2vec_format(tmp_file)    

In [104]:
# Seed words for the border-wall topic; their embeddings are summed into one reference vector.
wall_keys = [
    'border',
    'wall',
    'immigration',
    'funding',
    'promise',
]
wall_vector = sum(model.word_vec(word) for word in wall_keys)

In [105]:
# Seed words for the Mueller-investigation topic; their embeddings are summed into one reference vector.
mueller_keys = [
    'probe',
    'investigation',
    'election',
    "collusion",
    "interference",
]
mueller_vector = sum(model.word_vec(word) for word in mueller_keys)

In [106]:
# Seed words for the Stormy Daniels topic; their embeddings are summed into one reference vector.
stormy_keys = [
    'affair',
    'president',
    'lawyer',
    'payment',
    'campaign',
]
stormy_vector = sum(model.word_vec(word) for word in stormy_keys)

In [107]:
# Seed words for the NBA topic; their embeddings are summed into one reference vector.
NBA_keys = [
    'conference',
    'basketball',
    'league',
    'championship',
    'playoffs',
]
NBA_vector = sum(model.word_vec(word) for word in NBA_keys)

In [102]:
# Seed words for the Pokemon topic; their embeddings are summed into one reference vector.
poke_keys = [
    'pokemon',
    'pikachu',
    'nintendo',
    'videogame',
    'fun',
]
poke_vector = sum(model.word_vec(word) for word in poke_keys)


In [116]:
# Score one video against the five reference topic vectors. topic_analyze prints the video's
# keywords and a summary; the returned similarity dict is the cell's displayed value.
topic_analyze('https://youtu.be/jsYwFizhncE')

['circle', 'theta', 'block', 'masses', 'point']
If that first block has a mass which is some power of 100 times the mass of the second, for example 1,000,000 times as much, an insanely surprising fact popped out: The total number of collisions, including those between the second mass and the wall, has the same starting digits as pi.


{'Border Wall': 0.38666528,
 'Stormy Dan': 0.31513035,
 'Meuller': 0.32414573,
 'NBA': 0.35286573,
 'Pokemon': 0.07784927}

In [111]:
# Playlist of videos about Trump's border wall.
pl=Playlist("https://www.youtube.com/playlist?list=PL-nbJMikieaX9joE-O0kj52VP0vqBDlXP")
# Fill pl.video_urls with the playlist's video URLs (presumably a network fetch — confirm).
pl.populate_video_urls()
url_list_wall=pl.video_urls
# The next cell builds the border-wall topic space from these URLs.

In [112]:
# Average keyword vector over every video in the border-wall playlist.
wall_space=get_topic_space(url_list_wall)
# Persist the vector to disk. NOTE(review): np.save is expected to append ".npy",
# giving "wall.npy" in the working directory — confirm.
np.save('wall',wall_space)