In [54]:
import os
import numpy as np
import json

import nltk
nltk.download('punkt')

from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
en_stopwords = stopwords.words('english')  + ['shh', 'hoo', 'boo', 'uhoh', 'aah', 'heh', 'huh', 'ooh', 'yo', 'uh', 'um', 'aaah']

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

[nltk_data] Downloading package punkt to /home/angelo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
path = "/home/angelo/Desktop/master_offline/progettone_github/Animation-studios-analysis/sottotitoli/clean_subtitles/Disney/Animation/"

all_text = []
for r, d, files in os.walk(path):
    for f in files:
        filename = r+'/'+f
        with open(filename, 'r', encoding ='utf-8', errors='ignore') as fh:
            text = fh.read()
            all_text.append(text)

In [3]:
len(all_text)

88

# sentence splitting

In [4]:
one_document = ' '.join(all_text).lower()
one_document = one_document.replace('\n', ' ')
sentences = sent_tokenize(one_document)

In [5]:
len(sentences)

111766

In [6]:
sentences[0]

'long ago, in the faraway land of ancient greece, there was a golden age of powerful gods and extraordinary heroes.'

In [7]:
tok_sents = [[word for word in word_tokenize(sentence) if (word.isalpha())] for sentence in sentences]

# WORD2VEC

In [37]:
from gensim.models import Word2Vec
w2v_model = Word2Vec(tok_sents, size=100, window=10, min_count=15, sg=0, iter=20, negative=10)

In [38]:
#learned vocabulary
words = list(w2v_model.wv.vocab)
len(words)

2725

In [39]:
w2v_model.wv.most_similar(['aurora'])

[('hail', 0.878734827041626),
 ('princess', 0.7684910893440247),
 ('queen', 0.7415786385536194),
 ('atta', 0.6923928260803223),
 ('health', 0.6773939728736877),
 ('beloved', 0.6712743043899536),
 ('yudhishtir', 0.6166546940803528),
 ('arendelle', 0.6105291247367859),
 ('gift', 0.6014747619628906),
 ('king', 0.5851573944091797)]

In [40]:
w2v_model.wv.most_similar(['king', 'princess'], ['prince'])

[('aurora', 0.6163120269775391),
 ('hail', 0.6024869680404663),
 ('queen', 0.5988205075263977),
 ('duryodhan', 0.5197397470474243),
 ('england', 0.5112175345420837),
 ('gods', 0.4852094054222107),
 ('arendelle', 0.4735652804374695),
 ('gift', 0.4692055881023407),
 ('evil', 0.46721726655960083),
 ('arrived', 0.44493749737739563)]

In [41]:
w2v_model.wv.similarity('king', 'majesty')

0.39076564

# Similarity matrix

In [42]:
voc = set(w2v_model.wv.vocab)
len(voc)

2725

In [43]:
voc = voc.difference(en_stopwords)
voc = set([w for w in voc if len(w) > 2])
len(voc)

2544

In [44]:
indici = dict()
for index, key in enumerate(voc):
    if key not in indici:
        indici[key] = index
        
reverse_indici = dict()
for term, termID in indici.items():
    reverse_indici[termID] = term

# Similarity graph

In [45]:
#costruzione matrice
graph_matrix = np.zeros(shape=(len(voc), len(voc)), dtype=np.float32)

for index1, key1 in enumerate(voc):
    for index2, key2 in enumerate(voc):
        graph_matrix[index1 ][index2] = w2v_model.wv.similarity(key1, key2)
        
len(graph_matrix)

2544

In [46]:
#constructing the list of tuples
graph_list = []
nodes = set() # <-- NUOVO
for x in range(len(graph_matrix)):
    for y in range(x+1, len(graph_matrix)):
        if graph_matrix[x][y] >= 0.7 and x != y:
            nodes |= set([reverse_indici[x], reverse_indici[y]])
            graph_list.append((reverse_indici[x],reverse_indici[y],graph_matrix[x][y]))      
print('nodes:', len(nodes))
print('links:', len(graph_list))

nodes: 277
links: 252


In [53]:
# NUOVO
graph = dict()
graph["nodes"] = [{"id": t} for t in nodes]
graph["links"] = [{"source":t[0], "target":t[1], "value":float(t[2])} for t in graph_list]
with open('data.json', 'w') as outfile:
    json.dump(graph, outfile)

## FastText

In [48]:
from gensim.models import FastText
fast_model = FastText(tok_sents,size=100, window=10, min_count=5, sg=1, iter=20, negative=10)

In [49]:
fast_model.wv.most_similar(['apple'])

[('applause', 0.692295491695404),
 ('triple', 0.6410434246063232),
 ('example', 0.6281914710998535),
 ('profile', 0.6031306982040405),
 ('purple', 0.5845595598220825),
 ('juicy', 0.5844345092773438),
 ('urgh', 0.5588231682777405),
 ('butch', 0.5545732975006104),
 ('pleakley', 0.5543082356452942),
 ('dimple', 0.5535545945167542)]

In [50]:
fast_model.wv.most_similar(['king'])

[('richard', 0.7351991534233093),
 ('viking', 0.6512148976325989),
 ('hastinapur', 0.6213502287864685),
 ('kingdom', 0.60843425989151),
 ('arthur', 0.6071090698242188),
 ('england', 0.5991268754005432),
 ('peeking', 0.5802225470542908),
 ('yudhishtir', 0.5742478966712952),
 ('aurora', 0.5734056830406189),
 ('stefan', 0.564886212348938)]

In [51]:
fast_model.wv.most_similar(['mirror'])

[('symbol', 0.6093048453330994),
 ('district', 0.582632303237915),
 ('greeneyed', 0.5801243782043457),
 ('tamatoa', 0.5760154128074646),
 ('ahtohallan', 0.5700957775115967),
 ('tomb', 0.5685678124427795),
 ('triumph', 0.5677741765975952),
 ('carpet', 0.5659788250923157),
 ('woohoooo', 0.565862238407135),
 ('volcano', 0.5652320384979248)]

In [52]:
fasttext_wAp_analogy = fast_model.wv.evaluate_word_analogies(test_file, dummy4unknown=True)

NameError: name 'test_file' is not defined

# google precomputed word embeddings

In [None]:
import gensim
google_w2v_file = 'GoogleNews-vectors-negative300.bin'
google_model = gensim.models.KeyedVectors.load_word2vec_format(google_w2v_file,binary=True)

In [None]:
google_model.most_similar(['king','female'],['male'])

In [None]:
w2v_large_analogy = google_model.wv.accuracy(test_file)