In [75]:
import os
import numpy as np
import json

import nltk
# Fetch the NLTK resources this notebook relies on: the punkt sentence
# tokenizer models and the stop-word corpus read by stopwords.words() below.
# Without the second download a fresh environment raises LookupError.
nltk.download('punkt')
nltk.download('stopwords')

from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
# English stop words extended with a hand-picked list of interjections
# that should also be filtered out of the token stream.
en_stopwords = stopwords.words('english') + ['shh', 'hoo', 'boo', 'uhoh', 'aah', 'heh', 'huh', 'ooh', 'yo']

[nltk_data] Downloading package punkt to /home/angelo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
# Root folder that is walked recursively for subtitle files.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
path = "/home/angelo/Desktop/master_offline/progettone_github/Animation-studios-analysis/sottotitoli/clean_subtitles/Disney/Animation/"

# Read every file found under `path` into memory, one string per file.
all_text = []
for r, d, files in os.walk(path):
    for f in files:
        # os.path.join builds the path portably and avoids the doubled
        # separator that r + '/' + f produces when r already ends with '/'.
        filename = os.path.join(r, f)
        # errors='ignore' drops bytes that are not valid UTF-8 instead of raising.
        with open(filename, 'r', encoding='utf-8', errors='ignore') as fh:
            text = fh.read()
            all_text.append(text)

In [10]:
# Number of files that were read into all_text (one entry per file).
len(all_text)

88

# sentence splitting

In [12]:
# Merge all files into a single lower-cased string, turn newlines into
# spaces, then split the result into sentences.
one_document = ' '.join(all_text).lower().replace('\n', ' ')
sentences = sent_tokenize(one_document)

In [13]:
# Number of sentences produced by sent_tokenize.
len(sentences)

111766

In [42]:
# Peek at the first sentence as a sanity check.
sentences[0]

'long ago, in the faraway land of ancient greece, there was a golden age of powerful gods and extraordinary heroes.'

In [48]:
# Tokenize each sentence, keeping only alphanumeric tokens that are not stop words.
# Membership is tested against a set: en_stopwords is a list, and scanning it
# once per token is needlessly slow on a corpus of this size. The result is unchanged.
stopword_set = set(en_stopwords)
tok_sents = [
    [word for word in word_tokenize(sentence) if word.isalnum() and word not in stopword_set]
    for sentence in sentences
]

In [49]:
# Tokens kept from the first sentence after filtering.
tok_sents[0]

['long',
 'ago',
 'faraway',
 'land',
 'ancient',
 'greece',
 'golden',
 'age',
 'powerful',
 'gods',
 'extraordinary',
 'heroes']

In [50]:
# First three tokenized sentences.
tok_sents[0:3]

[['long',
  'ago',
  'faraway',
  'land',
  'ancient',
  'greece',
  'golden',
  'age',
  'powerful',
  'gods',
  'extraordinary',
  'heroes'],
 ['greatest', 'strongest', 'heroes', 'mighty', 'hercules'],
 ['measure', 'true', 'hero']]

# WORD2VEC

In [51]:
import logging

# Emit INFO-level log records with timestamp, level and message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [None]:
from gensim.models import Word2Vec

# Train a CBOW (sg=0) Word2Vec model on the tokenized sentences.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; confirm the
# installed gensim version before re-running.
w2v_model = Word2Vec(
    tok_sents,
    size=200,
    window=10,
    min_count=15,
    sg=0,
    iter=20,
    negative=10,
)

In [53]:
# Vocabulary learned by the model, as a list; the last line displays its size.
words = list(w2v_model.wv.vocab)
len(words)

2604

In [55]:
# Words most similar to 'aurora' according to the trained Word2Vec model.
w2v_model.wv.most_similar(['aurora'])

[('hail', 0.9284411668777466),
 ('princess', 0.833216667175293),
 ('yudhishtir', 0.8153361082077026),
 ('atta', 0.8063619136810303),
 ('health', 0.8039485812187195),
 ('beloved', 0.8002051711082458),
 ('queen', 0.7850385904312134),
 ('wicked', 0.7272493839263916),
 ('mortal', 0.72542405128479),
 ('duryodhan', 0.716457724571228)]

In [57]:
# Analogy-style query: positive terms 'king' and 'princess', negative term 'prince'.
w2v_model.wv.most_similar(['king', 'princess'], ['prince'])

[('hail', 0.6408109068870544),
 ('england', 0.6246001124382019),
 ('pumpkin', 0.6200451850891113),
 ('queen', 0.6188133955001831),
 ('health', 0.6033597588539124),
 ('aurora', 0.6011850237846375),
 ('yudhishtir', 0.555610716342926),
 ('throne', 0.54759681224823),
 ('kingdom', 0.5464192032814026),
 ('london', 0.5455605387687683)]

In [60]:
# Similarity score between the vectors of 'king' and 'majesty'.
w2v_model.wv.similarity('king', 'majesty')

0.49290496

# Constructing the matrix for the graph

In [62]:
# Model vocabulary as a set; the last line displays its size.
voc = set(w2v_model.wv.vocab)
len(voc)

In [64]:
# Drop any stop words from the vocabulary (set.difference returns a new set),
# then display the resulting size.
voc = voc.difference(en_stopwords)
len(voc)

2604

In [66]:
# term -> matrix index, numbering the terms in the iteration order of `voc`
indici = {key: index for index, key in enumerate(voc)}

# matrix index -> term (inverse lookup of `indici`)
reverse_indici = {termID: term for term, termID in indici.items()}

In [69]:
# Build the matrix of pairwise cosine similarities between vocabulary terms.
# Rows and columns follow the numbering stored in `indici`, so that
# reverse_indici[i] is guaranteed to name row/column i (the previous version
# relied on iterating the `voc` set a second time in the same order).
ordered_terms = sorted(indici, key=indici.get)

if ordered_terms:
    # One matrix product replaces len(voc)**2 Python-level similarity() calls.
    term_vectors = np.asarray([w2v_model.wv[term] for term in ordered_terms], dtype=np.float32)
    norms = np.linalg.norm(term_vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # leave an all-zero vector as is instead of dividing by zero
    unit_vectors = term_vectors / norms
    graph_matrix = (unit_vectors @ unit_vectors.T).astype(np.float32)
else:
    # Empty vocabulary: keep the 0x0 float32 matrix the loop version produced.
    graph_matrix = np.zeros(shape=(0, 0), dtype=np.float32)

In [70]:
# creating the list of tuples

In [71]:
# Check the container type of the similarity matrix.
type(graph_matrix)

numpy.ndarray

In [39]:
# Save the matrix in NumPy's binary .npy format (written to the working directory).
np.save('graph_matrix.npy', graph_matrix)

In [40]:
# Also save a human-readable plain-text copy of the matrix.
np.savetxt('graph_matrix.txt', graph_matrix)

In [41]:
# Load the matrix back from the binary file.
# NOTE(review): `a` is not used by any later visible cell.
a = np.load('graph_matrix.npy')

In [42]:
# Number of rows in the similarity matrix.
# NOTE(review): the recorded output (2612) does not match the vocabulary size
# shown earlier (2604), and the execution counts are out of order, so the
# output looks stale. Re-run the notebook top to bottom to confirm.
len(graph_matrix)

2612

In [72]:
# Collect the graph edges as (source term, target term, similarity) tuples,
# together with the set of terms that take part in at least one edge.
# An edge is kept when the similarity is at least 0.7; the diagonal is skipped.
graph_list = []
nodes = set()  # NEW: terms that appear in at least one edge
size = len(graph_matrix)
for row in range(size):
    for col in range(size):
        if row == col:
            continue
        weight = graph_matrix[row][col]
        if weight >= 0.7:
            source_term = reverse_indici[row]
            target_term = reverse_indici[col]
            nodes.update((source_term, target_term))
            graph_list.append((source_term, target_term, weight))

In [73]:
# NEW: export the graph as a dict with "nodes" and "links" lists and write it to data.json.
node_entries = [{"id": term} for term in nodes]
link_entries = [
    # float() converts the matrix value to a plain Python float for json.dump
    {"source": source, "target": target, "value": float(weight)}
    for source, target, weight in graph_list
]
graph = {"nodes": node_entries, "links": link_entries}
with open('data.json', 'w') as outfile:
    json.dump(graph, outfile)

## FastText

In [None]:
from gensim.models import FastText

# Train a skip-gram (sg=1) FastText model on the same tokenized sentences.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; confirm the
# installed gensim version before re-running.
fast_model = FastText(
    tok_sents,
    size=100,
    window=10,
    min_count=5,
    sg=1,
    iter=20,
    negative=10,
)

In [None]:
# Words most similar to 'apple' according to the FastText model.
fast_model.wv.most_similar(['apple'])

In [None]:
# Words most similar to 'king' according to the FastText model.
fast_model.wv.most_similar(['king'])

In [None]:
# Words most similar to 'mirror' according to the FastText model.
fast_model.wv.most_similar(['mirror'])

In [None]:
# Evaluate the FastText vectors on the word-analogy questions in `test_file`.
# NOTE(review): `test_file` is not defined in any visible cell; define it
# (path to the analogy questions file) before running this cell.
fasttext_wAp_analogy = fast_model.wv.evaluate_word_analogies(test_file, dummy4unknown=True)

# Google precomputed word embeddings

In [None]:
import gensim

# Load the pre-trained GoogleNews vectors from a binary word2vec-format file.
# The file is expected in the working directory.
google_w2v_file = 'GoogleNews-vectors-negative300.bin'
google_model = gensim.models.KeyedVectors.load_word2vec_format(
    google_w2v_file,
    binary=True,
)

In [None]:
# Analogy-style query on the GoogleNews vectors: positive terms 'king' and 'female', negative term 'male'.
google_model.most_similar(['king','female'],['male'])

In [None]:
# Evaluate the GoogleNews vectors on the word-analogy questions in `test_file`.
# `google_model` is already a KeyedVectors object, so the deprecated `.wv` hop
# is dropped, and the deprecated `accuracy` method is replaced by
# evaluate_word_analogies, the same call used for the FastText evaluation.
# The result is a tuple (overall score, per-section results).
# NOTE(review): `test_file` is not defined in any visible cell; define it
# (path to the analogy questions file) before running this cell.
w2v_large_analogy = google_model.evaluate_word_analogies(test_file)