Mestrado em Modelagem Matematica da Informacao
----------------------------------------------
Disciplina: Modelagem e Mineracao de Dados
------------------------------------------

Master Program - Mathematical Modeling of Information
-----------------------------------------------------
Course: Data Mining and Modeling
--------------------------------

Professor: Renato Rocha Souza
-----------------------------  

In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

import re
import os
import sys
import pathlib
import multiprocessing
import urllib.request
import zipfile
import lxml.etree
import networkx as nx
from random import shuffle

import gensim 
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

from sklearn.manifold import TSNE
from sklearn.cluster import AffinityPropagation, DBSCAN, AgglomerativeClustering, MiniBatchKMeans


%matplotlib inline

# Default figure size (width, height) applied to every plot drawn afterwards.
matplotlib.rcParams['figure.figsize'] = (20.0, 15.0)

In [2]:
import importlib
import logging

# Reloading the module re-creates the root logger, so the basicConfig() call
# below takes effect even if logging was already configured in this kernel
# (basicConfig() does nothing when the root logger already has handlers).
importlib.reload(logging)

# Root logger: DEBUG and above, rendered as "<time> <LEVEL>:<message>".
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s:%(message)s',
    datefmt='%I:%M:%S',
)
# Alternative configurations kept for reference:
#logging.root.setLevel(level=logging.INFO)
#logger = logging.getLogger()
#logger = logging.getLogger(program)
#logger.setLevel(logging.DEBUG)

In [3]:
# https://towardsdatascience.com/word-embedding-with-word2vec-and-fasttext-a209c1d3e12c
# https://github.com/3Top/word2vec-api#where-to-get-a-pretrained-models
# http://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/

# Base directory for the input files and saved models used by this notebook.
path_io_files = pathlib.Path('../datasets/Word2vec/')

### Loading Evaluation Tests

In [4]:
# Path of the 'questions-words.txt' evaluation file inside the dataset directory.
questions = path_io_files / 'questions-words.txt'

In [5]:
# Read every line of the evaluation file; the context manager closes the
# handle instead of leaving it to the garbage collector.
with open(questions, encoding='utf-8') as questions_file:
    evals = questions_file.readlines()
# Lines starting with ':' are counted as section markers, not as sentences.
num_sections = sum(1 for line in evals if line.startswith(':'))
print('total evaluation sentences: {} '.format(len(evals) - num_sections))

total evaluation sentences: 19544 


In [6]:
def w2v_model_accuracy(model, questions_path=None):
    """Print how many evaluation sentences ``model`` answers correctly.

    Args:
        model: Object whose ``accuracy(path)`` method returns a list of
            sections, each with ``'correct'`` and ``'incorrect'`` lists. The
            method is looked up on ``model.wv`` when that attribute exists
            (full Word2Vec model), otherwise on ``model`` itself (bare
            keyed vectors).
        questions_path: Path of the evaluation file. Defaults to the
            notebook-level ``questions`` path.

    Returns:
        None. The summary line is written to standard output.
    """
    if questions_path is None:
        questions_path = questions
    vectors = getattr(model, 'wv', model)
    sections = vectors.accuracy(questions_path)

    # Only the last section is used for the summary.
    summary = sections[-1]
    sum_corr = len(summary['correct'])
    sum_incorr = len(summary['incorrect'])
    total = sum_corr + sum_incorr

    def percent(count):
        # Report 0% rather than dividing by zero when nothing was evaluated.
        return count / total * 100 if total else 0.0

    print('Total sentences: {}, Correct: {:.2f}%, Incorrect: {:.2f}%'.format(
        total, percent(sum_corr), percent(sum_incorr)))

In [7]:
def print_results(model):
    """Print a fixed set of similarity probes for a word-embedding model.

    Three groups are printed, in this order: the ``most_similar`` results of
    five probe words, the ``doesnt_match`` answer for four word lists, and the
    ``similarity`` score of two word pairs.

    Args:
        model: Either an object exposing the vectors through a ``wv``
            attribute or the keyed vectors themselves; the query methods are
            taken from ``model.wv`` when present, otherwise from ``model``.

    Returns:
        None. Everything is written to standard output.
    """
    vectors = getattr(model, 'wv', model)

    # Nearest neighbours of each probe word, one result tuple per line.
    for probe in ('queen', 'man', 'woman', 'frog', 'awful'):
        print(probe)
        for result in vectors.most_similar(probe):
            print(result)
        print()

    # Odd-one-out queries; the header line is the word list followed by ':'.
    for phrase in ('breakfast cereal dinner lunch',
                   'captain onion starship alien',
                   'father mother son daughter film',
                   'france england germany berlin'):
        print(phrase + ':')
        print(vectors.doesnt_match(phrase.split()))

    # Pairwise similarity scores.
    for first, second in (('woman', 'girl'), ('woman', 'man')):
        print(first, second)
        print(vectors.similarity(first, second))

### TED Model

In [8]:
#download the data

#url = "https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip"
#urllib.request.urlretrieve(url, filename="ted_en-20160408.zip")

# extract subtitles
# The archive lives in the shared dataset directory. Both the archive and the
# XML member are opened with context managers so each handle is closed as
# soon as parsing is finished.
ted_archive = path_io_files / 'ted_en-20160408.zip'
with zipfile.ZipFile(ted_archive, 'r') as z:
    with z.open('ted_en-20160408.xml', 'r') as ted_xml:
        doc = lxml.etree.parse(ted_xml)
# Text nodes of all <content> elements, joined with newlines into one string.
input_text = '\n'.join(doc.xpath('//content/text()'))

In [9]:
# Drop every parenthesised span from the raw text.
input_text_noparens = re.sub(r'\([^)]*\)', '', input_text)

# Optional leading "<up to 20 non-colon characters>:" prefix, then the rest of the line.
line_pattern = re.compile(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$')
# Runs of anything that is not a lowercase letter or a digit.
separator_pattern = re.compile(r"[^a-z0-9]+")

# Sentence strings: the part after the optional prefix of each line, split on
# '.', keeping only non-empty pieces.
sentences_strings_ted = [
    piece
    for raw_line in input_text_noparens.split('\n')
    for piece in line_pattern.match(raw_line).group('postcolon').split('.')
    if piece
]

# Token lists: lowercase each sentence, turn separator runs into single
# spaces and split on whitespace.
sentences_ted = [
    separator_pattern.sub(" ", sentence.lower()).split()
    for sentence in sentences_strings_ted
]

In [10]:
# Train Word2Vec on the tokenised TED sentences: 100-dimensional vectors,
# context window of 5, words seen fewer than 5 times are dropped, 4 worker
# threads, sg=0 selects CBOW rather than skip-gram.
# NOTE(review): `size` is the gensim 3.x keyword (named `vector_size` in gensim 4).
model_ted = Word2Vec(sentences=sentences_ted, size=100, window=5, min_count=5, workers=4, sg=0)

In [11]:
# Print the similarity probes for the TED model.
print_results(model_ted)

queen
('chief', 0.7341812252998352)
('jones', 0.7341130971908569)
('king', 0.7312071919441223)
('president', 0.7141561508178711)
('dalai', 0.7064103484153748)
('maria', 0.6999576091766357)
('church', 0.6935442090034485)
('lord', 0.6926425099372864)
('christ', 0.6920947432518005)
('mary', 0.6916918754577637)

man
('woman', 0.8481717109680176)
('guy', 0.8205757141113281)
('lady', 0.7724880576133728)
('soldier', 0.7338567972183228)
('gentleman', 0.7279619574546814)
('boy', 0.7160205841064453)
('girl', 0.7013405561447144)
('poet', 0.6984959840774536)
('kid', 0.6874792575836182)
('king', 0.683418869972229)

woman
('man', 0.848171591758728)
('girl', 0.827579140663147)
('lady', 0.7960957288742065)
('boy', 0.7917556762695312)
('kid', 0.7614094614982605)
('child', 0.7317502498626709)
('soldier', 0.7312405109405518)
('guy', 0.7175164222717285)
('gentleman', 0.7001538872718811)
('person', 0.6926642656326294)

frog
('compound', 0.7530810832977295)
('dish', 0.7402394413948059)
('shoe', 0.7327032089

### Google News Model

In [12]:
# Path (as a string) of the GoogleNews vectors file inside the dataset directory.
googlenews = str(path_io_files / 'GoogleNews-vectors-negative300.bin')

In [None]:
# Load the vectors from the binary word2vec-format file. The result is a
# KeyedVectors object, not a full Word2Vec model.
model_googlenews = gensim.models.KeyedVectors.load_word2vec_format(googlenews, binary=True)

In [None]:
# Print the evaluation summary for the Google News vectors.
w2v_model_accuracy(model_googlenews)

In [None]:
# Print the similarity probes for the Google News vectors.
print_results(model_googlenews)

### Media Cloud Model

In [None]:
# Path (as a string) of the saved Media Cloud model; the '_trigrams' file is
# the one in use, the plain file is kept below as an alternative.
#mediacloud = os.path.join(path_io_files, 'MediaCloud_w2v')
mediacloud = str(path_io_files / 'MediaCloud_w2v_trigrams')

In [None]:
# Load the saved gensim Word2Vec model from the Media Cloud file.
model_mediacloud = gensim.models.Word2Vec.load(mediacloud)

In [None]:
# Query the keyed vectors (`.wv`) as the other cells do; calling
# most_similar() on the model object itself is deprecated in gensim.
model_mediacloud.wv.most_similar('fgv')

In [None]:
def build_neighbors(word, model, nviz=15):
    """Build a graph of ``word``, its nearest neighbours and their neighbours.

    Args:
        word: Centre word; added as a node with the attribute ``color='blue'``.
        model: Object exposing ``most_similar`` either through a ``wv``
            attribute or directly.
        nviz: Number of nearest neighbours of ``word`` to retrieve.

    Returns:
        A ``networkx.Graph`` whose edge weights are the similarity scores
        returned by ``most_similar``.
    """
    vectors = getattr(model, 'wv', model)
    g = nx.Graph()
    # Keyword form: networkx >= 2.0 no longer accepts the attribute dict as a
    # second positional argument.
    g.add_node(word, color='blue')
    viz1 = vectors.most_similar(word, topn=nviz)
    # Link the centre word only to neighbours with similarity above 0.5.
    g.add_weighted_edges_from([(word, v, w) for v, w in viz1 if w > 0.5])
    # Every first-level neighbour, whether or not it passed the 0.5 cut,
    # contributes the edges to its own most similar words.
    for neighbor, _ in viz1:
        g.add_weighted_edges_from(
            [(neighbor, v2, w2) for v2, w2 in vectors.most_similar(neighbor)])
    return g

In [None]:
word = 'andré_braz'
g = build_neighbors(word, model_mediacloud)
# One colour per node, in node iteration order: blue for the centre word, red
# for the rest. Built by iterating because the node view returned by
# networkx >= 2.0 has no .index() method.
cols = ['b' if node == word else 'r' for node in g.nodes()]
pos = nx.spring_layout(g, iterations=100)
nx.draw_networkx(g, pos=pos, node_color=cols, node_size=1000, alpha=0.5, font_size=16)
#nx.draw_networkx_labels(g, pos,dict(zip(g.nodes(),g.nodes())))

In [None]:
# Print the similarity probes for the Media Cloud model.
print_results(model_mediacloud)

### Wikipedia Model

(You'll need at least 36GB RAM to process this file)  

In [None]:
# Only the trimmed, saved model needs to be loaded from here on; the other
# files are no longer required.
wikipedia_model_file = str(path_io_files / 'model_wikimedia_w2v')
model_wikipedia = gensim.models.Word2Vec.load(wikipedia_model_file)

In [None]:
# Show the `count` stored in the vocabulary entry for the token "tee"
# (presumably its frequency in the training corpus — confirm against gensim docs).
model_wikipedia.wv.vocab["tee"].count

In [None]:
# Print the evaluation summary for the Wikipedia model.
w2v_model_accuracy(model_wikipedia)

In [None]:
# Print the similarity probes for the Wikipedia model.
print_results(model_wikipedia)