## Text Network

### Install dependencies

In [1]:
import re
import itertools
import spacy
from nltk import FreqDist, bigrams
import numpy as np
import pandas as pd
import networkx as nx
from pyvis.network import Network
import seaborn as sns
from collections import Counter
sns.set(rc={'figure.figsize':(15,7)})


from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from youtube_transcript_api import YouTubeTranscriptApi
from gensim.models import KeyedVectors

In [2]:
def generate_transcript(id):
	transcript = YouTubeTranscriptApi.get_transcript(id,languages=['en'])
	script = ""

	for text in transcript:
		t = text["text"]
		if t != '[Music]':
			script += t + " "
		
	return script, len(script.split())

iden = '-nZkP2b-4vo'

#ID é a parte final do link, depois do "=", exemplo 
#https://www.youtube.com/watch?v=L_jWHffIx5E
#L_jWHffIx5E

transcript, no_of_words = generate_transcript(iden)
print(transcript,no_of_words)

How did humanity come to accept rectangular pieces of pulped trees as something to spend eight to ten hours a day working for? It's a pretty insane story. This change from hard currency like gold or silver is a really huge deal. Without it, we couldn't possibly have the massive industrial and post-industrial economies we know today. This change revolutionized how we do business and forever altered how governments were financed. Learning about this massive sea change in how we as a species thought about money can help us reflect on our current historical shift from seeing paper as money, to seeing bits, seeing digital ones and zeros as money. But before we can get the exciting story of people trying to convince other people that paper was worth something, to understand why this is such a huge deal, we have to discuss a bit about how we thought about money before paper. If we go way back to the beginning of society, we find trade. Before early humans even really settled down, there's evi

In [3]:
transcript2 = YouTubeTranscriptApi.get_transcript(iden,languages=['en'])

# dicionário do momento de cada fala
legend_dict = {}
for text in transcript2:
    legend_dict[text['start']] = text['text']
    


In [4]:
modelo = KeyedVectors.load_word2vec_format("./../../wiki-news-300d-1M.vec")

### Auxiliary functions

In [5]:
toktok = ToktokTokenizer()
stpwrd = stopwords.words('english')
stop_list = (['uh', 'also', 'oh', 'um', 'yeah', 'use', 'lot', 'put', 'get', 'would', 'gonna', 'really', 'much', 'actually', 'another'])
stpwrd.extend(stop_list)

'''for word in stpwrd:
    print(word)
    '''
    
    
#Acha as N palavras mais similares a string passada e retorna uma lista com as palavras. (Sem a métrica de proximidade)
def getSimilar(string, N):
    similarList = modelo.most_similar(positive=[string])[0:N]
    wordList = []
    for word in similarList:
        wordList.append(word)
    return wordList

def sort_list(list1, list2):
     
    list3 = []
    for entry in list2:
        list3.append(entry[1])
    zipped_pairs = zip(list3, list1)
 
    z = [x for _, x in sorted(zipped_pairs, reverse = True)]
 
    return z



#Busca na label dos nós por palavras similares a stopwords
def banStopword(lista):
    SUSwords = []
    similarWords = []
    for word in lista:
        similares = getSimilar(word,5)
        mostSimilar = 0
        similarWord = ''
        SUSWord = ''
        for palavra in similares:
            sentinel = 0
            if palavra[0] in stpwrd and palavra[1] > mostSimilar:
                mostSimilar = palavra[1]
                similarWord = palavra
                SUSWord = word
                sentinel = 1
                print(mostSimilar, similarWord, SUSWord)
            if sentinel == 1:  
                SUSwords.append(word)
                similarWords.append(palavra)

    return SUSwords, similarWords

'''word_tokens = toktok.tokenize(example_sent)

filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]

filtered_sentence = []

for w in word_tokens:
	if w not in stop_words:
		filtered_sentence.append(w)

print(word_tokens)
print(filtered_sentence)'''


def filtered_sentence(example_sent):
    word_tokens = toktok.tokenize(example_sent)
    filtered_sentence = [w for w in word_tokens if not w.lower() in stpwrd]
    return " ".join(filtered_sentence)

def remove_stopwords(text):
    doc = filtered_sentence(text)
    print(" ".join([token.text for token in doc if token.is_stop == False]))
    return " ".join([token.text for token in doc if token.is_stop == False])

def pre_processing(text):
    text = filtered_sentence(text)
    text = re.sub(r'[^\w\s]', '', text) #remove punctuation
    text = re.sub(r"\s{2,}", " ", text)
    return text

def co_occurrence(text):
    corpus = list(itertools.chain.from_iterable([text.split(' ')]))
    vocab = list(set(corpus))
    vocab_index = {word: i for i, word in enumerate(vocab)}
    bi_grams = list(bigrams(corpus))
    bigram_freq = FreqDist(bi_grams).most_common(len(bi_grams)) 
    co_occurrence_matrix = np.zeros((len(vocab), len(vocab))) 
    
    #bigram = ((word n, word n+1), num_occurrences)
    for bigram in bigram_freq:
        current = bigram[0][1] #word n+1
        previous = bigram[0][0] #word n
        count = bigram[1] #num_occurrences
        pos_current = vocab_index[current] #obtain id for word n+1 
        pos_previous = vocab_index[previous] #obtain id for word n 
        co_occurrence_matrix[pos_current][pos_previous] = count
        
    return co_occurrence_matrix, vocab_index

def structure_text_network(matrix, vocab_index):
    data_matrix = pd.DataFrame(matrix, index=vocab_index, columns=vocab_index)
    data_stack = data_matrix.stack()
    structure = data_stack[data_stack >= 1].rename_axis(('source', 'target')).reset_index(name='weight')
    return structure[(structure.source != " ") & (structure.target != " ")]

In [6]:
lista = ['one', 'measurements', 'what', 'also', 'me', 'mice', 'five', 'standard', 'stat', 'imagine', 'spread', 'weighed', 'look', 'mean', 'another', 'quest', 'quantifies', 'often', 'let', 'plot', 'set', 'deviation', 'data', 'measured', 'differences']

lista1, lista2 = banStopword(lista)

0.7335886359214783 ('another', 0.7335886359214783) one
0.6885168552398682 ('how', 0.6885168552398682) what
0.7364087700843811 ('a', 0.7364087700843811) another


In [7]:
print(lista1)
print('\n',lista2)
sort_list(lista1, lista2)

['one', 'what', 'another']

 [('another', 0.7335886359214783), ('how', 0.6885168552398682), ('a', 0.7364087700843811)]


['another', 'one', 'what']

In [8]:
getSimilar('one',5)

[('another', 0.7335886359214783),
 ('only', 0.7045549750328064),
 ('each', 0.6934645771980286),
 ('a', 0.6761848330497742),
 ('either', 0.6739281415939331)]

### Generate text network

In [9]:
from spacy.lang.pt.stop_words import STOP_WORDS
nlp = spacy.load("pt_core_news_sm")
#STOP_WORDS.add('gotta')

In [10]:
text = transcript

In [11]:
text_cleaned = pre_processing(text)
matrix, vocab = co_occurrence(text_cleaned)
structure_network = structure_text_network(matrix, vocab)

In [12]:
structure_network

Unnamed: 0,source,target,weight
0,cowries,Even,1.0
1,cowries,gold,1.0
2,target,back,1.0
3,hampering,inflation,1.0
4,worthless,something,1.0
...,...,...,...
733,still,Eh,1.0
734,Yap,island,1.0
735,Yap,limestone,1.0
736,Yap,giving,1.0


In [13]:
text_network = nx.DiGraph()
text_network = nx.from_pandas_edgelist(structure_network, 'target', 'source', ['weight'])

### PageRank

In [14]:
text_network = nx.DiGraph()
text_network = nx.from_pandas_edgelist(structure_network, 'target', 'source', ['weight'])

In [15]:
def pagerank_centrality(G):
    centrality =  nx.pagerank(G)
    return sorted(centrality.items(), key=lambda x: x[1], reverse=False)

In [16]:
centrality = pagerank_centrality(text_network)

#dicionário da centralidade
cen_dict = {}
max_cen = 0
for x in centrality:
    cen_dict[x[0]] = x[1]
    if max_cen < x[1]:
        max_cen = x[1]

### Graph Visualization

In [17]:
# lista dos nós
name_lst = [x[0] for x in centrality]

# lista de nós a serem removidos, estamos mantendo os 25 maiores
rmv_lst = name_lst[:-25]

# exibição do nome dos nós para a visualização
for i in text_network.nodes:
    text_network.nodes[i]['label'] = i

# redução do grafo mantendo a conectividade
for i in rmv_lst:
    maxi = 0
    for j in text_network.neighbors(i):
        w = name_lst.index(j)
        if maxi < w:
            maxi = w
            merge_node = j
            
    for j in text_network.neighbors(i):
        if not (j == merge_node or (j in text_network.neighbors(merge_node))):
            text_network.add_edge(merge_node, j, weight=text_network[i][j]['weight']/2)
    text_network.remove_node(i)

In [18]:
# momentos em que cada palavra é dita
dict_moments = {}
episilon = 45

for i in text_network.nodes:
    dict_moments[i] = []
    
for i in list(legend_dict.keys()):
    phrase = pre_processing(legend_dict[i]).lower()
    
    for j in text_network.nodes:
        if len(dict_moments[j]) == 0 or (j in phrase.split() and dict_moments[j][-1]+episilon<i):
            dict_moments[j].append((i>1)*(int(i)-1))
            print(f"{j} was appended at {i}")
    
dict_moments['trade']        

gold was appended at 0.0
something was appended at 0.0
one was appended at 0.0
money was appended at 0.0
used was appended at 0.0
often was appended at 0.0
problem was appended at 0.0
time was appended at 0.0
value was appended at 0.0
currency was appended at 0.0
started was appended at 0.0
trade was appended at 0.0
worth was appended at 0.0
think was appended at 0.0
commodity was appended at 0.0
paper was appended at 0.0
people was appended at 0.0
enough was appended at 0.0
massive was appended at 0.0
even was appended at 0.0
run was appended at 0.0
want was appended at 0.0
like was appended at 0.0
somebody was appended at 0.0
good was appended at 0.0
money was appended at 49.62
paper was appended at 49.62
trade was appended at 55.92
even was appended at 57.02
started was appended at 64.9
often was appended at 73.44
think was appended at 73.44
problem was appended at 82.3
run was appended at 82.3
people was appended at 95.08
trade was appended at 100.04
want was appended at 105.76
som

[0, 54, 99, 152, 332]

In [19]:
nt = Network('800px', '1000px', notebook=True)
nt.from_nx(text_network)
for edg in nt.edges:
    # espessura e física das arestas
    #edg['width'] = text_network[i][j]['weight']**4
    #if edg['weight'] < 1:
    
    if edg['from'] == edg['to']:
        edg['hidden'] = True
    edg['physics'] = False

for n in nt.nodes:
    # tamanho dos nós
    n['shape'] = 'ellipse'
    size = 30*(cen_dict[n['label']]/max_cen)**0.4
    n['font'] = str(int(size))+'px arial white'
    n['color'] = '#7f333f'
    n['labelHighlightBold'] = True
    

def time_to_min(time):
    return str(time//60)+":"+"0"*(time%60<10)+str(time%60)

for n in nt.nodes:
    # links nos nós
    link_list = ["<a href='https://www.youtube.com/watch?v="+str(iden)+
                 "&t="+str(time)+"s' target='_blank' rel='noopener noreferrer'>"+time_to_min(time)+"<br>" for time in dict_moments[n['label']]]
    soma = ''
    for k in link_list:
        soma += k
    n['title'] = soma
    #print(n['size'])

    
nt.show("text_network.html")

Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 


In [20]:
#print(nx.adjacency_matrix(text_network))
#print(nt.nodes)
lista = []
for node in nt.nodes:
    lista.append(node['label'])
print(lista)

['gold', 'paper', 'like', 'worth', 'money', 'value', 'people', 'want', 'trade', 'massive', 'often', 'even', 'commodity', 'enough', 'currency', 'think', 'something', 'one', 'used', 'time', 'problem', 'run', 'started', 'good', 'somebody']


In [21]:
SUS = banStopword(lista)
print(SUS)

0.6788149476051331 ('too', 0.6788149476051331) enough
0.7335886359214783 ('another', 0.7335886359214783) one
(['enough', 'one'], [('too', 0.6788149476051331), ('another', 0.7335886359214783)])
