In [None]:
import json
import string
import random
import re
import urllib.request
from nltk.stem import WordNetLemmatizer
import numpy as np
# Para leer y parsear el texto en HTML de wikipedia
import bs4 as bs
import nltk
# Descargar el diccionario
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import sys
#import gradio as gr
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.text import text_to_word_sequence

import multiprocessing
from gensim.models import Word2Vec

In [None]:
# Install gradio with the same interpreter that runs this kernel (hence sys.executable).
# NOTE(review): the gradio import in the import cell is commented out - confirm this install is still needed.
!{sys.executable} -m pip install gradio --quiet

In [None]:
from gensim.models.callbacks import CallbackAny2Vec
# gensim does not report the loss of each epoch during training by default,
# so we subclass its callback hook to print that information ourselves.
class callback(CallbackAny2Vec):
    """
    Print a loss figure at the end of every training epoch.

    The value read from the model is treated as a running total: from the
    second epoch onwards the figure printed is the difference with the value
    stored at the end of the previous epoch.
    """
    def __init__(self):
        # Number of epochs already reported by this instance.
        self.epoch = 0

    def on_epoch_end(self, model):
        """Print the loss attributed to the epoch that has just finished."""
        running_loss = model.get_latest_training_loss()
        # Nothing has been stored yet on the first epoch, so report the raw value.
        if self.epoch == 0:
            epoch_loss = running_loss
        else:
            epoch_loss = running_loss - self.loss_previous_step
        print('Loss after epoch {}: {}'.format(self.epoch, epoch_loss))
        # Remember the running value so the next epoch can be reported as a delta.
        self.loss_previous_step = running_loss
        self.epoch += 1

In [None]:
def perform_lemmatization(tokens):
    """Return a list with the lemma of every token, in the original order.

    Relies on the module-level `lemmatizer` object.
    """
    lemmas = []
    for word in tokens:
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

In [None]:
def get_processed_text(document):
    """Normalise a document string and return its list of lemmatised tokens.

    Relies on the module-level `punctuation_removal` translation table and on
    `perform_lemmatization`.
    """
    # 1 - lowercase the text
    lowered = document.lower()
    # 2 - strip the punctuation symbols
    without_punctuation = lowered.translate(punctuation_removal)
    # 3 - tokenise
    tokens = nltk.word_tokenize(without_punctuation)
    # 4 - lemmatise
    return perform_lemmatization(tokens)

In [None]:
# Translation table for str.translate: every punctuation code point maps to None,
# which removes the character.
punctuation_removal = {ord(symbol): None for symbol in string.punctuation}

# NLTK resources downloaded before the lemmatizer is created.
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")

# Shared lemmatizer instance used by perform_lemmatization.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## TOPICS:
* HOT DOG
* FAST FOOD
* COCA COLA
* FOOD
* OBESITY
* OVERWEIGHT
* SEDENTARY LIFESTYLE

## WE SELECT ONLY 3 TOPICS AT RANDOM; THE SELECTION CHANGES ON EACH EXECUTION

In [None]:
# Wikipedia pages whose paragraph text makes up the corpus.
topics = [
    "https://en.wikipedia.org/wiki/Hot_dog",
    "https://en.wikipedia.org/wiki/Fast_food",
    "https://en.wikipedia.org/wiki/Coca-Cola",
    "https://en.wikipedia.org/wiki/Food",
    "https://en.wikipedia.org/wiki/Obesity",
    "https://en.wikipedia.org/wiki/Overweight",
    "https://en.wikipedia.org/wiki/Sedentary_lifestyle",
]

# NOTE(review): the markdown heading before this cell says only 3 topics are
# picked at random, but this loop downloads every URL in `topics` - confirm
# which behaviour is intended.
article_texts = []
for topic_url in topics:
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(topic_url) as response:
        raw_html = response.read()

    article_html = bs.BeautifulSoup(raw_html, 'lxml')

    # Only the text inside <p> tags is kept; paragraphs are joined without a separator.
    article_paragraphs = article_html.find_all('p')
    article_text = ''.join(para.text for para in article_paragraphs).lower()
    article_texts.append(article_text)

# Single lowercase string holding all articles, joined without a separator.
full_article = ''.join(article_texts)

In [None]:
# Inner call: replace bracketed digit runs such as "[12]" (and "[]") with a space.
# Outer call: collapse every run of whitespace into a single space.
text = re.sub(
    r'\s+',
    ' ',
    re.sub(r'\[[0-9]*\]', ' ', full_article),
)

In [None]:
# Split the cleaned text into sentences; the training corpora below are built from this list.
corpus = nltk.sent_tokenize(text)
# Flat list of word tokens for the whole text.
# NOTE(review): `words` is not read in the visible cells - confirm it is still needed.
words = nltk.word_tokenize(text)

In [None]:
# NOTE(review): `option` is not read anywhere in the visible cells - confirm whether later cells use it.
option = 0

In [None]:
# Variant 1: plain whitespace split of every sentence.
corpus_1 = [sentence.split() for sentence in corpus]

# Variant 2: every sentence tokenised with Keras' text_to_word_sequence.
corpus_2 = [text_to_word_sequence(sentence) for sentence in corpus]

In [None]:
# Create the two vector-generating models with identical settings.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` - confirm against the installed version.
w2v_params = dict(
    min_count=5,    # minimum word frequency for a word to enter the vocabulary
    window=2,       # number of words before and after the predicted one
    size=300,       # dimensionality of the vectors
    negative=20,    # number of negative samples... 0 means not used
    workers=1,      # can be raised if more cores are available
    sg=0,           # model 0:CBOW  1:skipgram
)

w2v_model_1 = Word2Vec(**w2v_params)
w2v_model_2 = Word2Vec(**w2v_params)

In [None]:
# Build each model's vocabulary from its own tokenised version of the corpus:
# model 1 from the whitespace-split sentences, model 2 from the Keras-tokenised ones.
w2v_model_1.build_vocab(corpus_1)
w2v_model_2.build_vocab(corpus_2)

In [None]:
# Number of documents (sentences) each model registered, model 1 first.
print(f"Cantidad de docs en el corpus: {w2v_model_1.corpus_count}")
print(f"Cantidad de docs en el corpus: {w2v_model_2.corpus_count}")

Cantidad de docs en el corpus: 1424
Cantidad de docs en el corpus: 1424


In [None]:
# Train the vector-generating models, using our callback to print the loss of each epoch.
#
# Each model is trained on the same tokenised corpus its vocabulary was built
# from. Passing the raw `corpus` (a list of sentence strings) would make gensim
# iterate every sentence character by character, so almost nothing would match
# the vocabulary and the word vectors would stay essentially untrained.
w2v_model_1.train(corpus_1,
                 total_examples=w2v_model_1.corpus_count,
                 epochs=20,
                 compute_loss = True,
                 callbacks=[callback()]
                 )

w2v_model_2.train(corpus_2,
                 total_examples=w2v_model_2.corpus_count,
                 epochs=20,
                 compute_loss = True,
                 callbacks=[callback()]
                 )

Loss after epoch 0: 14816.3486328125
Loss after epoch 1: 6278.2314453125
Loss after epoch 2: 6865.732421875
Loss after epoch 3: 6166.09375
Loss after epoch 4: 6537.875
Loss after epoch 5: 6025.203125
Loss after epoch 6: 6166.60546875
Loss after epoch 7: 6086.3671875
Loss after epoch 8: 6051.05078125
Loss after epoch 9: 5993.875
Loss after epoch 10: 6064.4296875
Loss after epoch 11: 6259.40625
Loss after epoch 12: 6250.5
Loss after epoch 13: 6435.40625
Loss after epoch 14: 6447.8125
Loss after epoch 15: 5837.9921875
Loss after epoch 16: 6436.6953125
Loss after epoch 17: 5890.8359375
Loss after epoch 18: 6133.9453125
Loss after epoch 19: 5695.953125
Loss after epoch 0: 55346.44140625
Loss after epoch 1: 35086.37890625
Loss after epoch 2: 33515.5703125
Loss after epoch 3: 32798.265625
Loss after epoch 4: 32816.03125
Loss after epoch 5: 32888.734375
Loss after epoch 6: 32213.734375
Loss after epoch 7: 32076.03125
Loss after epoch 8: 31907.6875
Loss after epoch 9: 31624.78125
Loss after epo

(1557806, 4092780)

## APARICIÓN DEL BMI, BODY MASS INDEX

In [None]:
# Query word(s) passed as `positive` to most_similar in the following cells.
word_p = ["healthy"]

In [None]:
# Ten entries of model 1's vocabulary ranked most similar to the query word(s).
w2v_model_1.wv.most_similar(positive=word_p,topn=10)

[('well', 0.14870218932628632),
 ('located', 0.14741304516792297),
 ('bmi,', 0.14218726754188538),
 ('includes', 0.14185893535614014),
 ('versions', 0.1393348127603531),
 ('or', 0.13589119911193848),
 ('account', 0.13399715721607208),
 ('consumption', 0.1325126439332962),
 ('worldwide', 0.13174983859062195),
 ('"new', 0.13106416165828705)]

In [None]:
# Ten entries of model 2's vocabulary ranked most similar to the query word(s).
w2v_model_2.wv.most_similar(positive=word_p,topn=10)

[('spending', 0.1551455855369568),
 ('well', 0.14870218932628632),
 ('located', 0.14741304516792297),
 ('includes', 0.14185893535614014),
 ('versions', 0.1393348127603531),
 ('2005', 0.138925239443779),
 ('or', 0.13589119911193848),
 ('account', 0.13399715721607208),
 ('consumption', 0.1325126439332962),
 ('worldwide', 0.13174983859062195)]