In [None]:
from pytube import YouTube
from pytube import Playlist
import pandas as pd
import numpy as np

from xml.etree import ElementTree as ET
import bleach
import re

import gensim
from gensim import corpora, models, similarities, matutils
from gensim.summarization import keywords
from gensim.summarization.summarizer import summarize

import logging
# Configure the root logger (getLogger() with no name) so that only records
# at CRITICAL level pass its threshold; this keeps lower-severity log output
# out of the notebook.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

In [None]:
from numpy import dot
from numpy.linalg import norm

def cos_sim(a,b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    The value is the dot product of the two inputs divided by the product
    of their norms (``numpy.linalg.norm`` with its default settings).
    """
    magnitude = norm(a) * norm(b)
    return dot(a, b) / magnitude

Helper Functions

In [None]:
def get_transcript(url):
    """Return the English caption text of a YouTube video as a single string.

    Parameters
    ----------
    url : str
        Video URL handed to ``pytube.YouTube``.

    Returns
    -------
    str
        Every non-empty caption fragment, each prefixed with one space, with
        newlines replaced by spaces. An empty string is returned (after
        printing a short message) when ``YouTube`` raises ``ValueError`` or
        when no English caption track is available.
    """
    try:
        yt = YouTube(url)
    except ValueError:
        print('cannot find video')
        # Previously execution fell through and crashed on the unbound `yt`.
        return ''

    caption = yt.captions.get_by_language_code('en')
    try:
        xml = caption.xml_captions
    except AttributeError:
        print('no captions or transcripts')
        # Previously execution fell through and crashed on the unbound `xml`.
        return ''

    root = ET.fromstring(xml)
    # Child elements without text have ``text is None``; they are skipped.
    fragments = [" " + child.text for child in root if child.text is not None]
    return "".join(fragments).replace('\n', ' ')

In [None]:
def make_corpus(url_list):
    """Download and clean the transcript of every URL in ``url_list``.

    Each transcript is passed through ``bleach.clean`` with no allowed tags
    (markup is stripped), then the ``&#39;`` entity and the ``[inaudible]``
    and ``[Music]`` caption markers are removed, in that order.

    Returns a list of cleaned transcript strings, one per URL, in input order.
    """
    # Applied sequentially, so the order of the patterns matters.
    noise_patterns = (r'&#39;', r'\[inaudible]', r'\[Music]')

    def _scrub(url):
        text = bleach.clean(get_transcript(url), tags=[], attributes={}, styles=[], strip=True)
        for pattern in noise_patterns:
            text = re.sub(pattern, '', text)
        return text

    return [_scrub(url) for url in url_list]

In [None]:
def oov(keys):
    """Sum the keyword vectors, imputing out-of-vocabulary keywords.

    Keywords found in ``model.vocab`` contribute their ``model.word_vec``
    vector. Every keyword that is missing from the vocabulary contributes the
    mean of the in-vocabulary vectors instead.

    Parameters
    ----------
    keys : sequence of str
        Keywords to embed.

    Returns
    -------
    The summed vector (same type as the ``model.word_vec`` results).

    Raises
    ------
    ValueError
        If none of ``keys`` is in the vocabulary, because no mean vector can
        be formed (this used to surface as a ZeroDivisionError).
    """
    known = [key for key in keys if key in model.vocab]
    if not known:
        raise ValueError('none of the keywords are in the model vocabulary: %r' % (list(keys),))

    # Computed once; the original evaluated the same sum twice.
    total = sum(map(model.word_vec, known))
    missing = len(keys) - len(known)
    return total + missing * (total / len(known))

In [None]:
def get_topic_space(url_list):
    """Return the average keyword vector over the videos in ``url_list``.

    For every transcript produced by ``make_corpus`` the top 10 keywords are
    extracted with gensim's ``keywords`` and their ``model.word_vec`` vectors
    are summed; if that lookup raises ``KeyError`` the sum is delegated to
    ``oov``. The per-video sums are then averaged over the number of videos.

    Parameters
    ----------
    url_list : list of str
        Video URLs that together define one topic.

    Raises
    ------
    ValueError
        If ``url_list`` yields no documents (previously a ZeroDivisionError).
    """
    docs = make_corpus(url_list)
    if not docs:
        raise ValueError('url_list must contain at least one video URL')

    vectors_list = []
    for doc in docs:
        # Real booleans instead of the strings 'True'/'true', which only
        # worked because any non-empty string is truthy.
        keys = keywords(doc, words=10, lemmatize=True, split=True)
        try:
            vector = sum(map(model.word_vec, keys))
        except KeyError:
            vector = oov(keys)
        vectors_list.append(vector)

    return sum(vectors_list) / len(docs)
                     

In [None]:
def topic_analyze(url):
    """Score one video against the five reference topic vectors.

    The video's topic vector (``get_topic_space([url])``) is compared by
    cosine similarity with ``wall_vector``, ``stormy_vector``,
    ``mueller_vector``, ``NBA_vector`` and ``poke_vector``. As a side effect
    the top 5 noun keywords (with scores) and a roughly 50-word summary of
    the transcript are printed.

    Note that the transcript is fetched twice: once here through
    ``make_corpus`` and once more inside ``get_topic_space``.

    Returns
    -------
    dict
        Topic label -> cosine similarity. The labels are kept exactly as
        before, since callers may index the result by them.
    """
    doc = make_corpus([url])
    x = get_topic_space([url])

    references = (
        ('Border Wall', wall_vector),
        ('Stormy Dan', stormy_vector),
        ('Meuller', mueller_vector),
        ('NBA', NBA_vector),
        ('Pokemon', poke_vector),
    )
    analysis = {label: cos_sim(x, reference) for label, reference in references}

    # Real booleans instead of the string 'True' (which was merely truthy).
    keys = keywords(doc[0], words=5, pos_filter=('NN', 'NNS', 'NNPS', 'NNP'),
                    scores=True, lemmatize=True)

    print(keys)
    print(summarize(doc[0], word_count=50))
    return analysis

In [None]:
import os

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors

# Location of the GloVe text vectors. Set the GLOVE_FILE environment variable
# to point at your own copy; the original author's path remains the default so
# existing setups keep working.
GLOVE_FILE_DEFAULT = '/Users/andrewportal/Downloads/glove/glove.6B.100d.txt'
glove_path = os.path.abspath(os.path.expanduser(os.environ.get('GLOVE_FILE', GLOVE_FILE_DEFAULT)))

# NOTE(review): datapath is gensim's test-data helper and is kept from the
# original cell; it is handed an absolute path here -- confirm it returns
# absolute paths unchanged.
glove_file = datapath(glove_path)
tmp_file = get_tmpfile("glove_word2vec.txt")

# call glove2word2vec script
# default way (through CLI): python -m gensim.scripts.glove2word2vec --input <glove_file> --output <w2v_file>
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_file, tmp_file)

# `model` is the global embedding used by oov, get_topic_space and the
# keyword-vector cells below.
model = KeyedVectors.load_word2vec_format(tmp_file)

In [None]:
# Topic: Trump's border wall.
# Build a pytube Playlist for the topic, call populate_video_urls(), then keep
# the playlist's video URLs for the topic-space computation in the next cell.
pl=Playlist("https://www.youtube.com/playlist?list=PL-nbJMikieaX9joE-O0kj52VP0vqBDlXP")
pl.populate_video_urls()
url_list_wall=pl.video_urls

In [None]:
# Average keyword vector over all videos of the border-wall playlist,
# persisted with np.save under the name 'wall'.
wall_space=get_topic_space(url_list_wall)
np.save('wall',wall_space)

In [None]:
# Topic: Pokemon.
# Same steps as for the other topics: build the Playlist, populate it, and
# keep its video URLs.
pl=Playlist("https://www.youtube.com/playlist?list=PLqimesWokesHzcUahlesZ5iBhND1fmWzQ")
pl.populate_video_urls()
url_list_poke=pl.video_urls

In [None]:
# Average keyword vector over all videos of the Pokemon playlist, persisted
# with np.save under the name 'Poke'. poke_space is also the reference vector
# for the keyword-count sweep further down.
poke_space=get_topic_space(url_list_poke)
np.save('Poke',poke_space)

In [None]:
# Topic: Stormy Daniels.
# The URL is a watch link that carries the playlist id in its `list=` query
# parameter; the playlist's video URLs are collected as for the other topics.

pl=Playlist("https://www.youtube.com/watch?v=xukWkOv4a-w&list=PLjBJYbCV8XMyflxLdH4KEg_8OEMCfvvmV")
pl.populate_video_urls()
url_list_stormy=pl.video_urls

In [None]:
# Average keyword vector over all videos of the Stormy Daniels playlist,
# persisted with np.save under the name 'stormy'.
stormy_space=get_topic_space(url_list_stormy)
np.save('stormy',stormy_space)

In [None]:
# Topic: Mueller investigation, taken from a playlist.
# NOTE: the next cell assigns url_list_mueller again from a hand-picked list,
# so whichever of the two cells runs last decides the Mueller topic space.
pl=Playlist("https://www.youtube.com/watch?v=pgzThHiYOD4&list=PLpgAjMbrNMIKn_IavsRKcmmePoNKMOkXI")
pl.populate_video_urls()
url_list_mueller=pl.video_urls


In [None]:
# Mueller topic from a hand-picked list of three videos. This replaces any
# url_list_mueller that was previously built from the playlist.
url_list_mueller = [
    'https://www.youtube.com/watch?v=pcq7Fo-E56M&index=8&list=PLpgAjMbrNMIKn_IavsRKcmmePoNKMOkXI&t=0s',
    'https://www.youtube.com/watch?v=uCVDEEMZQec',
    'https://www.youtube.com/watch?v=1M_CSsQas60',
]

In [None]:
# Average keyword vector over the Mueller videos, persisted with np.save
# under the name 'mueller'.
# The result used to be bound only to the misspelled name `meuller_space`,
# so the np.save call below failed with NameError on `mueller_space`.
mueller_space=get_topic_space(url_list_mueller)
# Alias kept so that any code relying on the old (misspelled) name still works.
meuller_space=mueller_space
np.save('mueller',mueller_space)

In [None]:
# Hand-picked keywords describing the border-wall topic; the reference vector
# is the sum of their embeddings.
wall_keys=['border','wall','immigration','funding','promise']
wall_vector = sum(model.word_vec(word) for word in wall_keys)

In [None]:
# Hand-picked keywords describing the Mueller-investigation topic; the
# reference vector is the sum of their embeddings.
mueller_keys=['probe','investigation','election', "collusion","interference"]
mueller_vector = sum(model.word_vec(word) for word in mueller_keys)

In [None]:
# Hand-picked keywords describing the Stormy Daniels topic; the reference
# vector is the sum of their embeddings.
stormy_keys=['affair', 'president','lawyer','payment','campaign']
stormy_vector = sum(model.word_vec(word) for word in stormy_keys)

In [None]:
# Hand-picked keywords describing the NBA topic; the reference vector is the
# sum of their embeddings.
NBA_keys=['conference', 'basketball','league','championship','playoffs']
NBA_vector = sum(model.word_vec(word) for word in NBA_keys)

In [None]:
# Hand-picked keywords describing the Pokemon topic; the reference vector is
# the sum of their embeddings.
poke_keys=['pokemon','pikachu','nintendo','videogame','fun']
poke_vector = sum(model.word_vec(word) for word in poke_keys)


In [None]:
# Example run: prints the video's top keywords and a short summary, and
# displays the dict of cosine similarities to the five reference topics.
topic_analyze('https://www.youtube.com/watch?v=BbHLPBJvSOc&t=2701s')

In [None]:
import seaborn as sns
# `plt` conventionally refers to the pyplot interface; the previous
# `import matplotlib as plt` bound the top-level package instead, which has
# no plotting functions such as plt.subplots or plt.show.
import matplotlib.pyplot as plt

In [None]:
# Sweep the number of extracted keywords (5, 10, ..., 100) for a single video
# and record, for each setting, the cosine similarity between the summed
# keyword vector and the Pokemon topic space (poke_space).
doc=make_corpus(['https://www.youtube.com/watch?v=jsYwFizhncE'])
cos=[]
for i in range(5,101,5):
    # Real booleans instead of the strings 'True'/'true', which only worked
    # because any non-empty string is truthy.
    keys=keywords(doc[0],words=i,lemmatize=True, split=True)
    try:
        x=sum(map(model.word_vec,keys))
    except KeyError:
        # Presumably a keyword missing from the embedding vocabulary; oov()
        # substitutes the mean of the known keyword vectors for it.
        x=oov(keys)
    cos.append(cos_sim(x,poke_space))

In [None]:
# Alias of the sweep results for plotting. NOTE(review): the plotting cell
# that follows assigns y=cos again, so this cell looks redundant.
y=cos

In [None]:
# Plot cosine similarity to the Pokemon topic space against the number of
# keywords used (5, 10, ..., 100 -- the same range as the sweep loop).
y=cos
x=list(range(5,101,5))
sns.set(style="ticks", rc={"lines.linewidth": 2})
# x and y are passed by keyword: newer seaborn releases no longer accept them
# positionally, while keywords work in old and new versions alike.
ax=sns.pointplot(x=x,y=y,color = 'red')
ax.set_title('3blue1brown vs Pokemon')
ax.set_xlabel('Keywords')
ax.set_ylabel('Cos_Sim')






In [None]:
# Save the figure that owns the axes from the previous plotting cell to a
# file named "plot2" in the working directory.
fig = ax.get_figure()
fig.savefig("plot2")

Create a rolling average of the keyword vectors and graph it to show why this was a bad idea

In [None]:
# Start of the rolling-average experiment.
# Collect the video URLs of the NBA playlist (the URL is a watch link that
# carries the playlist id in its `list=` query parameter).
pl=Playlist("https://www.youtube.com/watch?v=WcD6jjLMZso&list=PLUXSZMIiUfFS3P3IcWk95yTOZdmUEI7C4")
pl.populate_video_urls()
url_list_nba=pl.video_urls

In [None]:
# Cleaned transcripts of the NBA playlist, one string per video.
# NOTE(review): `nba` is not read by any of the following cells shown here.
nba=make_corpus(url_list_nba)
    

In [None]:
# Topic vector of a single video (the one titled '3blue1brown' in the plots);
# used below as the reference the running average is compared against.
blue=get_topic_space(['https://www.youtube.com/watch?v=jsYwFizhncE'])

In [None]:
# One summed keyword vector (top 5 keywords) per transcript in `poke`.
# NOTE(review): `poke` was not assigned in any earlier cell, so this cell
# failed with NameError on a fresh kernel. It is assumed to be the cleaned
# transcripts of the Pokemon playlist (url_list_poke) -- please confirm.
poke=make_corpus(url_list_poke)
vectors=[]
for transcript in poke:
    # Real booleans instead of the strings 'True'/'true'.
    keys=keywords(transcript,words=5,lemmatize=True, split=True)
    try:
        x=sum(map(model.word_vec,keys))
    except KeyError:
        x=oov(keys)
    vectors.append(x)
    

In [None]:
# Sanity check: number of per-video keyword vectors collected above.
len(vectors)

In [None]:
# Running average of the keyword vectors.
# vec_c[0] is the first vector; every later entry is the midpoint between the
# previous entry and the next vector, i.e. (previous + vector) / 2. This is
# not an arithmetic mean over all vectors seen so far: each step halves the
# weight of everything that came before.
# The values are the same as before; the no-op `i=+1` branch is gone and the
# cell no longer raises IndexError when fewer than two vectors exist.
vec_c=[]
vec_avg=None
for vec in vectors:
    vec_avg=vec if vec_avg is None else (vec_avg+vec)/2
    vec_c.append(vec_avg)

In [None]:
# Sanity check: number of running-average entries built in the previous cell.
len(vec_c)

In [None]:
# Cosine similarity of every running-average entry to the `blue` reference
# vector, in the same order as vec_c.
results=[cos_sim(step_avg,blue) for step_avg in vec_c]
    

In [None]:
# Plot the cosine similarity of each running-average entry to the reference
# vector, against the entry's position in the running average.
y=results
x=list(range(len(results)))
sns.set(style="ticks", rc={"lines.linewidth": 2})
# x and y are passed by keyword: newer seaborn releases no longer accept them
# positionally, while keywords work in old and new versions alike.
ax=sns.pointplot(x=x,y=y,color = 'red')
ax.set_title('3blue1brown vs Pokemon')
# The x axis is the index into the running average, not a keyword count
# (the keyword count is fixed for this experiment), so the label copied from
# the earlier plot was misleading.
ax.set_xlabel('Video index (running average)')
ax.set_ylabel('Cos_Sim')
# End of the steps that build the rolling-average graph.

In [None]:
def get_transcript_better(url):
    """Return the English caption text of a YouTube video as one string.

    Unlike ``get_transcript`` this variant does no error handling: whatever
    ``YouTube(url)`` raises propagates, and if ``get_by_language_code('en')``
    returns ``None`` the attribute access on it raises ``AttributeError``.

    Each caption fragment that has text is prefixed with a single space, the
    fragments are concatenated, and newlines are replaced by spaces.
    """
    captions = YouTube(url).captions
    xml = captions.get_by_language_code('en').xml_captions

    fragments = []
    for node in ET.fromstring(xml):
        # Elements without text carry ``text is None`` and are skipped.
        if node.text is not None:
            fragments.append(" " + node.text)
    return "".join(fragments).replace('\n', ' ')